66 | 34.1M | } dec_group.cc:decltype (Zero((hwy::N_SSE4::Simd<int, 4ul, 0>::Rebind<float>)())) jxl::N_SSE4::(anonymous namespace)::AdjustQuantBias<hwy::N_SSE4::Simd<int, 4ul, 0> >(hwy::N_SSE4::Simd<int, 4ul, 0>, unsigned long, decltype (Zero((hwy::N_SSE4::Simd<int, 4ul, 0>)())), float const*) Line | Count | Source | 36 | 10.5M | const float* HWY_RESTRICT biases) { | 37 | 10.5M | const Rebind<float, DI> df; | 38 | | | 39 | 10.5M | const auto quant = ConvertTo(df, quant_i); | 40 | | | 41 | | // Compare |quant|, keep sign bit for negating result. | 42 | 10.5M | const auto kSign = BitCast(df, Set(di, INT32_MIN)); | 43 | 10.5M | const auto sign = And(quant, kSign); // TODO(janwas): = abs ^ orig | 44 | 10.5M | const auto abs_quant = AndNot(kSign, quant); | 45 | | | 46 | | // If |x| is 1, kZeroBias creates a different bias for each channel. | 47 | | // We're implementing the following: | 48 | | // if (quant == 0) return 0; | 49 | | // if (quant == 1) return biases[c]; | 50 | | // if (quant == -1) return -biases[c]; | 51 | | // return quant - biases[3] / quant; | 52 | | | 53 | | // Integer comparison is not helpful because Clang incurs bypass penalties | 54 | | // from unnecessarily mixing integer and float. | 55 | 10.5M | const auto is_01 = Lt(abs_quant, Set(df, 1.125f)); | 56 | 10.5M | const auto not_0 = Gt(abs_quant, Zero(df)); | 57 | | | 58 | | // Bitwise logic is faster than quant * biases[c]. | 59 | 10.5M | const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign)); | 60 | | | 61 | | // About 2E-5 worse than ReciprocalNR or division. | 62 | 10.5M | const auto bias = | 63 | 10.5M | NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant); | 64 | | | 65 | 10.5M | return IfThenElse(is_01, one_bias, bias); | 66 | 10.5M | } |
dec_group.cc:decltype (Zero((hwy::N_AVX2::Simd<int, 8ul, 0>::Rebind<float>)())) jxl::N_AVX2::(anonymous namespace)::AdjustQuantBias<hwy::N_AVX2::Simd<int, 8ul, 0> >(hwy::N_AVX2::Simd<int, 8ul, 0>, unsigned long, decltype (Zero((hwy::N_AVX2::Simd<int, 8ul, 0>)())), float const*) Line | Count | Source | 36 | 8.17M | const float* HWY_RESTRICT biases) { | 37 | 8.17M | const Rebind<float, DI> df; | 38 | | | 39 | 8.17M | const auto quant = ConvertTo(df, quant_i); | 40 | | | 41 | | // Compare |quant|, keep sign bit for negating result. | 42 | 8.17M | const auto kSign = BitCast(df, Set(di, INT32_MIN)); | 43 | 8.17M | const auto sign = And(quant, kSign); // TODO(janwas): = abs ^ orig | 44 | 8.17M | const auto abs_quant = AndNot(kSign, quant); | 45 | | | 46 | | // If |x| is 1, kZeroBias creates a different bias for each channel. | 47 | | // We're implementing the following: | 48 | | // if (quant == 0) return 0; | 49 | | // if (quant == 1) return biases[c]; | 50 | | // if (quant == -1) return -biases[c]; | 51 | | // return quant - biases[3] / quant; | 52 | | | 53 | | // Integer comparison is not helpful because Clang incurs bypass penalties | 54 | | // from unnecessarily mixing integer and float. | 55 | 8.17M | const auto is_01 = Lt(abs_quant, Set(df, 1.125f)); | 56 | 8.17M | const auto not_0 = Gt(abs_quant, Zero(df)); | 57 | | | 58 | | // Bitwise logic is faster than quant * biases[c]. | 59 | 8.17M | const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign)); | 60 | | | 61 | | // About 2E-5 worse than ReciprocalNR or division. | 62 | 8.17M | const auto bias = | 63 | 8.17M | NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant); | 64 | | | 65 | 8.17M | return IfThenElse(is_01, one_bias, bias); | 66 | 8.17M | } |
Unexecuted instantiation: dec_group.cc:decltype (Zero((hwy::N_AVX3::Simd<int, 16ul, 0>::Rebind<float>)())) jxl::N_AVX3::(anonymous namespace)::AdjustQuantBias<hwy::N_AVX3::Simd<int, 16ul, 0> >(hwy::N_AVX3::Simd<int, 16ul, 0>, unsigned long, decltype (Zero((hwy::N_AVX3::Simd<int, 16ul, 0>)())), float const*) dec_group.cc:decltype (Zero((hwy::N_EMU128::Simd<int, 4ul, 0>::Rebind<float>)())) jxl::N_EMU128::(anonymous namespace)::AdjustQuantBias<hwy::N_EMU128::Simd<int, 4ul, 0> >(hwy::N_EMU128::Simd<int, 4ul, 0>, unsigned long, decltype (Zero((hwy::N_EMU128::Simd<int, 4ul, 0>)())), float const*) Line | Count | Source | 36 | 15.4M | const float* HWY_RESTRICT biases) { | 37 | 15.4M | const Rebind<float, DI> df; | 38 | | | 39 | 15.4M | const auto quant = ConvertTo(df, quant_i); | 40 | | | 41 | | // Compare |quant|, keep sign bit for negating result. | 42 | 15.4M | const auto kSign = BitCast(df, Set(di, INT32_MIN)); | 43 | 15.4M | const auto sign = And(quant, kSign); // TODO(janwas): = abs ^ orig | 44 | 15.4M | const auto abs_quant = AndNot(kSign, quant); | 45 | | | 46 | | // If |x| is 1, kZeroBias creates a different bias for each channel. | 47 | | // We're implementing the following: | 48 | | // if (quant == 0) return 0; | 49 | | // if (quant == 1) return biases[c]; | 50 | | // if (quant == -1) return -biases[c]; | 51 | | // return quant - biases[3] / quant; | 52 | | | 53 | | // Integer comparison is not helpful because Clang incurs bypass penalties | 54 | | // from unnecessarily mixing integer and float. | 55 | 15.4M | const auto is_01 = Lt(abs_quant, Set(df, 1.125f)); | 56 | 15.4M | const auto not_0 = Gt(abs_quant, Zero(df)); | 57 | | | 58 | | // Bitwise logic is faster than quant * biases[c]. | 59 | 15.4M | const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign)); | 60 | | | 61 | | // About 2E-5 worse than ReciprocalNR or division. | 62 | 15.4M | const auto bias = | 63 | 15.4M | NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant); | 64 | | | 65 | 15.4M | return IfThenElse(is_01, one_bias, bias); | 66 | 15.4M | } |
Unexecuted instantiation: enc_group.cc:decltype (Zero((hwy::N_SSE4::Simd<int, 4ul, 0>::Rebind<float>)())) jxl::N_SSE4::(anonymous namespace)::AdjustQuantBias<hwy::N_SSE4::Simd<int, 4ul, 0> >(hwy::N_SSE4::Simd<int, 4ul, 0>, unsigned long, decltype (Zero((hwy::N_SSE4::Simd<int, 4ul, 0>)())), float const*) enc_group.cc:decltype (Zero((hwy::N_AVX2::Simd<int, 8ul, 0>::Rebind<float>)())) jxl::N_AVX2::(anonymous namespace)::AdjustQuantBias<hwy::N_AVX2::Simd<int, 8ul, 0> >(hwy::N_AVX2::Simd<int, 8ul, 0>, unsigned long, decltype (Zero((hwy::N_AVX2::Simd<int, 8ul, 0>)())), float const*) Line | Count | Source | 36 | 736 | const float* HWY_RESTRICT biases) { | 37 | 736 | const Rebind<float, DI> df; | 38 | | | 39 | 736 | const auto quant = ConvertTo(df, quant_i); | 40 | | | 41 | | // Compare |quant|, keep sign bit for negating result. | 42 | 736 | const auto kSign = BitCast(df, Set(di, INT32_MIN)); | 43 | 736 | const auto sign = And(quant, kSign); // TODO(janwas): = abs ^ orig | 44 | 736 | const auto abs_quant = AndNot(kSign, quant); | 45 | | | 46 | | // If |x| is 1, kZeroBias creates a different bias for each channel. | 47 | | // We're implementing the following: | 48 | | // if (quant == 0) return 0; | 49 | | // if (quant == 1) return biases[c]; | 50 | | // if (quant == -1) return -biases[c]; | 51 | | // return quant - biases[3] / quant; | 52 | | | 53 | | // Integer comparison is not helpful because Clang incurs bypass penalties | 54 | | // from unnecessarily mixing integer and float. | 55 | 736 | const auto is_01 = Lt(abs_quant, Set(df, 1.125f)); | 56 | 736 | const auto not_0 = Gt(abs_quant, Zero(df)); | 57 | | | 58 | | // Bitwise logic is faster than quant * biases[c]. | 59 | 736 | const auto one_bias = IfThenElseZero(not_0, Xor(Set(df, biases[c]), sign)); | 60 | | | 61 | | // About 2E-5 worse than ReciprocalNR or division. | 62 | 736 | const auto bias = | 63 | 736 | NegMulAdd(Set(df, biases[3]), ApproximateReciprocal(quant), quant); | 64 | | | 65 | 736 | return IfThenElse(is_01, one_bias, bias); | 66 | 736 | } |
Unexecuted instantiation: enc_group.cc:decltype (Zero((hwy::N_AVX3::Simd<int, 16ul, 0>::Rebind<float>)())) jxl::N_AVX3::(anonymous namespace)::AdjustQuantBias<hwy::N_AVX3::Simd<int, 16ul, 0> >(hwy::N_AVX3::Simd<int, 16ul, 0>, unsigned long, decltype (Zero((hwy::N_AVX3::Simd<int, 16ul, 0>)())), float const*) Unexecuted instantiation: enc_group.cc:decltype (Zero((hwy::N_EMU128::Simd<int, 4ul, 0>::Rebind<float>)())) jxl::N_EMU128::(anonymous namespace)::AdjustQuantBias<hwy::N_EMU128::Simd<int, 4ul, 0> >(hwy::N_EMU128::Simd<int, 4ul, 0>, unsigned long, decltype (Zero((hwy::N_EMU128::Simd<int, 4ul, 0>)())), float const*) |