/src/libmpeg2/common/x86/icv_sad_ssse3.c
Line | Count | Source |
1 | | /****************************************************************************** |
2 | | * |
3 | | * Copyright (C) 2015 The Android Open Source Project |
4 | | * |
5 | | * Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | * you may not use this file except in compliance with the License. |
7 | | * You may obtain a copy of the License at: |
8 | | * |
9 | | * http://www.apache.org/licenses/LICENSE-2.0 |
10 | | * |
11 | | * Unless required by applicable law or agreed to in writing, software |
12 | | * distributed under the License is distributed on an "AS IS" BASIS, |
13 | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | * See the License for the specific language governing permissions and |
15 | | * limitations under the License. |
16 | | * |
17 | | ***************************************************************************** |
18 | | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
19 | | */ |
20 | | /** |
21 | | ******************************************************************************* |
22 | | * @file |
23 | | * icv_sad_ssse3.c |
24 | | * |
25 | | * @brief |
26 | | * This file contains the functions to compute SAD |
27 | | * |
28 | | * @author |
29 | | * Ittiam |
30 | | * |
31 | | * @par List of Functions: |
32 | | * icv_sad_8x4_ssse3() |
33 | | * |
34 | | * @remarks |
35 | | * None |
36 | | * |
37 | | ******************************************************************************* |
38 | | */ |
39 | | /*****************************************************************************/ |
40 | | /* File Includes */ |
41 | | /*****************************************************************************/ |
42 | | /* System include files */ |
43 | | #include <stdio.h> |
44 | | #include <stdint.h> |
45 | | #include <string.h> |
46 | | #include <stdlib.h> |
47 | | #include <assert.h> |
48 | | #include <immintrin.h> |
49 | | |
50 | | /* User include files */ |
51 | | #include "icv_datatypes.h" |
52 | | #include "icv_macros.h" |
53 | | #include "icv_platform_macros.h" |
54 | | #include "icv.h" |
55 | | |
56 | | /** |
57 | | ******************************************************************************* |
58 | | * |
59 | | * @brief |
60 | | * Compute 8x4 SAD |
61 | | * |
62 | | * @par Description |
63 | | * Compute 8x4 sum of absolute differences between source and reference block |
64 | | * |
65 | | * @param[in] pu1_src |
66 | | * Source buffer |
67 | | * |
68 | | * @param[in] pu1_ref |
69 | | * Reference buffer |
70 | | * |
71 | | * @param[in] src_strd |
72 | | * Source stride |
73 | | * |
74 | | * @param[in] ref_strd |
75 | | * Reference stride |
76 | | * |
77 | | * @param[in] wd |
78 | | * Assumed to be 8 |
79 | | * |
80 | | * @param[in] ht |
81 | | * Assumed to be 4 |
82 | | |
83 | | * @returns |
84 | | * SAD |
85 | | * |
86 | | * @remarks |
87 | | * |
88 | | ******************************************************************************* |
89 | | */ |
90 | | WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src, |
91 | | UWORD8 *pu1_ref, |
92 | | WORD32 src_strd, |
93 | | WORD32 ref_strd, |
94 | | WORD32 wd, |
95 | | WORD32 ht) |
96 | 93.3M | { |
97 | 93.3M | WORD32 sad; |
98 | 93.3M | __m128 src_r0, src_r1; |
99 | 93.3M | __m128 ref_r0, ref_r1; |
100 | 93.3M | __m128i res_r0, res_r1; |
101 | | |
102 | 93.3M | UNUSED(wd); |
103 | 93.3M | UNUSED(ht); |
104 | 93.3M | ASSERT(wd == 8); |
105 | 93.3M | ASSERT(ht == 4); |
106 | | |
107 | | /* Load source */ |
108 | 93.4M | src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); |
109 | 93.4M | pu1_src += src_strd; |
110 | | |
111 | 93.4M | src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); |
112 | 93.4M | pu1_src += src_strd; |
113 | | |
114 | 93.4M | src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); |
115 | 93.4M | pu1_src += src_strd; |
116 | | |
117 | 93.4M | src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); |
118 | 93.4M | pu1_src += src_strd; |
119 | | |
120 | | |
121 | | /* Load reference */ |
122 | 93.4M | ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref)); |
123 | 93.4M | pu1_ref += ref_strd; |
124 | | |
125 | 93.4M | ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref)); |
126 | 93.4M | pu1_ref += ref_strd; |
127 | | |
128 | 93.4M | ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref)); |
129 | 93.4M | pu1_ref += ref_strd; |
130 | | |
131 | 93.4M | ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref)); |
132 | 93.4M | pu1_ref += ref_strd; |
133 | | |
134 | | /* Compute SAD for each row */ |
135 | 93.4M | res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0); |
136 | 93.4M | res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1); |
137 | | |
138 | | /* Accumulate SAD */ |
139 | 93.4M | res_r0 = _mm_add_epi64(res_r0, res_r1); |
140 | 93.4M | res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8)); |
141 | | |
142 | 93.4M | sad = _mm_cvtsi128_si32(res_r0); |
143 | | |
144 | 93.4M | return sad; |
145 | 92.8M | } |