/work/svt-av1/Source/Lib/Codec/hash_motion.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2018, Alliance for Open Media. All rights reserved |
3 | | * |
4 | | * This source code is subject to the terms of the BSD 2 Clause License and |
5 | | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
6 | | * was not distributed with this source code in the LICENSE file, you can |
7 | | * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open |
8 | | * Media Patent License 1.0 was not distributed with this source code in the |
9 | | * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license. |
10 | | */ |
11 | | |
12 | | #include "aom_dsp_rtcd.h" |
13 | | #include "hash_motion.h" |
14 | | #include "pcs.h" |
15 | | |
16 | | static const int crc_bits = 16; |
17 | | static const int block_size_bits = 3; |
18 | | |
19 | 474 | static void hash_table_clear_all(HashTable* p_hash_table) { |
20 | 474 | if (p_hash_table->p_lookup_table == NULL) { |
21 | 474 | return; |
22 | 474 | } |
23 | 0 | int max_addr = 1 << (crc_bits + block_size_bits); |
24 | 0 | for (int i = 0; i < max_addr; i++) { |
25 | 0 | if (p_hash_table->p_lookup_table[i] != NULL) { |
26 | 0 | svt_aom_vector_destroy(p_hash_table->p_lookup_table[i]); |
27 | 0 | EB_FREE(p_hash_table->p_lookup_table[i]); |
28 | 0 | p_hash_table->p_lookup_table[i] = NULL; |
29 | 0 | } |
30 | 0 | } |
31 | 0 | } |
32 | | |
33 | 0 | static void get_pixels_in_1d_char_array_by_block_2x2(uint8_t* y_src, int stride, uint8_t* p_pixels_in1D) { |
34 | 0 | uint8_t* p_pel = y_src; |
35 | 0 | int index = 0; |
36 | 0 | for (int i = 0; i < 2; i++) { |
37 | 0 | for (int j = 0; j < 2; j++) { |
38 | 0 | p_pixels_in1D[index++] = p_pel[j]; |
39 | 0 | } |
40 | 0 | p_pel += stride; |
41 | 0 | } |
42 | 0 | } |
43 | | |
44 | 0 | static void get_pixels_in_1d_short_array_by_block_2x2(uint16_t* y_src, int stride, uint16_t* p_pixels_in1D) { |
45 | 0 | uint16_t* p_pel = y_src; |
46 | 0 | int index = 0; |
47 | 0 | for (int i = 0; i < 2; i++) { |
48 | 0 | for (int j = 0; j < 2; j++) { |
49 | 0 | p_pixels_in1D[index++] = p_pel[j]; |
50 | 0 | } |
51 | 0 | p_pel += stride; |
52 | 0 | } |
53 | 0 | } |
54 | | |
55 | | // the hash value (hash_value1 consists two parts, the first 3 bits relate to |
56 | | // the block size and the remaining 16 bits are the crc values. This fuction |
57 | | // is used to get the first 3 bits. |
58 | | static int hash_block_size_to_index(int block_size) { |
59 | | switch (block_size) { |
60 | | case 4: |
61 | | return 0; |
62 | | case 8: |
63 | | return 1; |
64 | | case 16: |
65 | | return 2; |
66 | | case 32: |
67 | | return 3; |
68 | | case 64: |
69 | | return 4; |
70 | | case 128: |
71 | | return 5; |
72 | | default: |
73 | | return -1; |
74 | | } |
75 | | } |
76 | | |
77 | 0 | static uint32_t get_identity_hash_value(const uint8_t a, const uint8_t b, const uint8_t c, const uint8_t d) { |
78 | | // The four input values add up to 32 bits, which is the size of the output. |
79 | | // Just pack those values as is. |
80 | 0 | return ((uint32_t)a << 24) + ((uint32_t)b << 16) + ((uint32_t)c << 8) + ((uint32_t)d); |
81 | 0 | } |
82 | | |
83 | 0 | static uint32_t get_xor_hash_value_hbd(const uint16_t a, const uint16_t b, const uint16_t c, const uint16_t d) { |
84 | 0 | uint32_t result; |
85 | | // Pack the lower 8 bits of each input value to the 32 bit output, then xor |
86 | | // with the upper 8 bits of each input value. |
87 | 0 | result = ((uint32_t)(a & 0x00ff) << 24) + ((uint32_t)(b & 0x00ff) << 16) + ((uint32_t)(c & 0x00ff) << 8) + |
88 | 0 | ((uint32_t)(d & 0x00ff)); |
89 | 0 | result ^= ((uint32_t)(a & 0xff00) << 16) + ((uint32_t)(b & 0xff00) << 8) + ((uint32_t)(c & 0xff00)) + |
90 | 0 | ((uint32_t)(d & 0xff00) >> 8); |
91 | 0 | return result; |
92 | 0 | } |
93 | | |
94 | 474 | void svt_av1_hash_table_destroy(HashTable* p_hash_table) { |
95 | 474 | hash_table_clear_all(p_hash_table); |
96 | 474 | EB_FREE_ARRAY(p_hash_table->p_lookup_table); |
97 | 474 | p_hash_table->p_lookup_table = NULL; |
98 | 474 | } |
99 | | |
100 | 0 | EbErrorType svt_aom_rtime_alloc_svt_av1_hash_table_create(HashTable* p_hash_table) { |
101 | 0 | EbErrorType err_code = EB_ErrorNone; |
102 | 0 | ; |
103 | |
|
104 | 0 | if (p_hash_table->p_lookup_table != NULL) { |
105 | 0 | hash_table_clear_all(p_hash_table); |
106 | 0 | return err_code; |
107 | 0 | } |
108 | 0 | const int max_addr = 1 << (crc_bits + block_size_bits); |
109 | 0 | EB_CALLOC_ARRAY(p_hash_table->p_lookup_table, max_addr); |
110 | | |
111 | 0 | return err_code; |
112 | 0 | } |
113 | | |
114 | | static bool hash_table_add_to_table(HashTable* p_hash_table, uint32_t hash_value, const BlockHash* curr_block_hash, |
115 | | uint16_t max_cand_per_bucket) { |
116 | | if (p_hash_table->p_lookup_table[hash_value] == NULL) { |
117 | | EB_MALLOC_OBJECT_NO_CHECK(p_hash_table->p_lookup_table[hash_value]); |
118 | | if (p_hash_table->p_lookup_table[hash_value] == NULL) { |
119 | | return false; |
120 | | } |
121 | | if (svt_aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, sizeof(*curr_block_hash)) == |
122 | | VECTOR_ERROR) { |
123 | | return false; |
124 | | } |
125 | | } |
126 | | // Place an upper bound each hash table bucket to up to 256 intrabc |
127 | | // block candidates, and ignore subsequent ones. Considering more can |
128 | | // unnecessarily slow down encoding for virtually no efficiency gain. |
129 | | if (svt_aom_vector_byte_size(p_hash_table->p_lookup_table[hash_value]) < |
130 | | max_cand_per_bucket * sizeof(*curr_block_hash)) { |
131 | | if (svt_aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], (void*)curr_block_hash) == |
132 | | VECTOR_ERROR) { |
133 | | return false; |
134 | | } |
135 | | } |
136 | | return true; |
137 | | } |
138 | | |
139 | 0 | int32_t svt_av1_hash_table_count(const HashTable* p_hash_table, uint32_t hash_value) { |
140 | 0 | if (p_hash_table->p_lookup_table[hash_value] == NULL) { |
141 | 0 | return 0; |
142 | 0 | } else { |
143 | 0 | return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); |
144 | 0 | } |
145 | 0 | } |
146 | | |
147 | 0 | Iterator svt_av1_hash_get_first_iterator(HashTable* p_hash_table, uint32_t hash_value) { |
148 | 0 | assert(svt_av1_hash_table_count(p_hash_table, hash_value) > 0); |
149 | 0 | return svt_aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); |
150 | 0 | } |
151 | | |
152 | | void svt_av1_generate_block_2x2_hash_value(const Yv12BufferConfig* picture, uint32_t* pic_block_hash, |
153 | 0 | PictureControlSet* pcs) { |
154 | 0 | const int width = 2; |
155 | 0 | const int height = 2; |
156 | 0 | const int x_end = picture->y_crop_width - width + 1; |
157 | 0 | const int y_end = picture->y_crop_height - height + 1; |
158 | 0 | (void)pcs; |
159 | 0 | if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { |
160 | 0 | uint16_t p[4]; |
161 | 0 | int pos = 0; |
162 | 0 | for (int y_pos = 0; y_pos < y_end; y_pos++) { |
163 | 0 | for (int x_pos = 0; x_pos < x_end; x_pos++) { |
164 | 0 | get_pixels_in_1d_short_array_by_block_2x2( |
165 | 0 | CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + x_pos, picture->y_stride, p); |
166 | | // For HBD, we either have 40 or 48 bits of input data that the xor hash |
167 | | // reduce to 32 bits. We intentionally don't want to "discard" bits to |
168 | | // avoid any kind of biasing. |
169 | 0 | pic_block_hash[pos] = get_xor_hash_value_hbd(p[0], p[1], p[2], p[3]); |
170 | 0 | pos++; |
171 | 0 | } |
172 | 0 | pos += width - 1; |
173 | 0 | } |
174 | 0 | } else { |
175 | 0 | uint8_t p[4]; |
176 | 0 | int pos = 0; |
177 | 0 | for (int y_pos = 0; y_pos < y_end; y_pos++) { |
178 | 0 | for (int x_pos = 0; x_pos < x_end; x_pos++) { |
179 | 0 | get_pixels_in_1d_char_array_by_block_2x2( |
180 | 0 | picture->y_buffer + y_pos * picture->y_stride + x_pos, picture->y_stride, p); |
181 | | // This 2x2 hash isn't used directly as a "key" for the hash table, so |
182 | | // we can afford to just copy the 4 8-bit pixel values as a single |
183 | | // 32-bit value directly. (i.e. there are no concerns of a lack of |
184 | | // uniform distribution) |
185 | 0 | pic_block_hash[pos] = get_identity_hash_value(p[0], p[1], p[2], p[3]); |
186 | 0 | pos++; |
187 | 0 | } |
188 | 0 | pos += width - 1; |
189 | 0 | } |
190 | 0 | } |
191 | 0 | } |
192 | | |
193 | | void svt_av1_generate_block_hash_value(const Yv12BufferConfig* picture, int block_size, uint32_t* src_pic_block_hash, |
194 | 0 | uint32_t* dst_pic_block_hash, PictureControlSet* pcs) { |
195 | 0 | const int pic_width = picture->y_crop_width; |
196 | 0 | const int x_end = picture->y_crop_width - block_size + 1; |
197 | 0 | const int y_end = picture->y_crop_height - block_size + 1; |
198 | |
|
199 | 0 | const int src_size = block_size >> 1; |
200 | |
|
201 | 0 | uint32_t p[4]; |
202 | 0 | const int length = sizeof(p); |
203 | |
|
204 | 0 | int pos = 0; |
205 | 0 | for (int y_pos = 0; y_pos < y_end; y_pos++) { |
206 | 0 | for (int x_pos = 0; x_pos < x_end; x_pos++) { |
207 | 0 | p[0] = src_pic_block_hash[pos]; |
208 | 0 | p[1] = src_pic_block_hash[pos + src_size]; |
209 | 0 | p[2] = src_pic_block_hash[pos + src_size * pic_width]; |
210 | 0 | p[3] = src_pic_block_hash[pos + src_size * pic_width + src_size]; |
211 | 0 | dst_pic_block_hash[pos] = svt_av1_get_crc32c_value(&pcs->crc_calculator, (uint8_t*)p, length); |
212 | |
|
213 | 0 | pos++; |
214 | 0 | } |
215 | 0 | pos += block_size - 1; |
216 | 0 | } |
217 | 0 | } |
218 | | |
219 | | bool svt_aom_rtime_alloc_svt_av1_add_to_hash_map_by_row_with_precal_data(HashTable* p_hash_table, uint32_t* pic_hash, |
220 | | int pic_width, int pic_height, int block_size, |
221 | 0 | uint16_t max_cand_per_bucket) { |
222 | 0 | const int x_end = pic_width - block_size + 1; |
223 | 0 | const int y_end = pic_height - block_size + 1; |
224 | |
|
225 | 0 | int add_value = hash_block_size_to_index(block_size); |
226 | 0 | assert(add_value >= 0); |
227 | 0 | add_value <<= crc_bits; |
228 | 0 | const int crc_mask = (1 << crc_bits) - 1; |
229 | 0 | int step = block_size; |
230 | 0 | int x_offset = 0; |
231 | 0 | int y_offset = 0; |
232 | | |
233 | | // Explore the entire frame hierarchically to add intrabc candidate blocks to |
234 | | // the hash table, by starting with coarser steps (the block size), towards |
235 | | // finer-grained steps until every candidate block has been considered. |
236 | | // The nested for loop goes through the pic_hash array column by column. |
237 | | |
238 | | // Doing a hierarchical block exploration helps maximize spatial dispersion |
239 | | // of the first and foremost candidate blocks while minimizing overlap between |
240 | | // them. This is helpful because we only keep up to 256 entries of the |
241 | | // same candidate block (located in different places), so we want those |
242 | | // entries to cover the biggest area of the image to encode to maximize coding |
243 | | // efficiency. |
244 | | |
245 | | // This is the coordinate exploration order example for an 8x8 region, with |
246 | | // block_size = 4. The top-left corner (x, y) coordinates of each candidate |
247 | | // block are shown below. There are 5 * 5 (25) candidate blocks. |
248 | | // x 0 1 2 3 4 5 6 7 |
249 | | // y +------------------------ |
250 | | // 0 | 1 10 5 13 3 |
251 | | // 1 | 16 22 18 24 20 |
252 | | // 2 | 7 11 9 14 8 |
253 | | // 3 | 17 23 19 25 21 |
254 | | // 4 | 2 12 6 15 4--------+ |
255 | | // 5 | | 4 x 4 | |
256 | | // 6 | | block | |
257 | | // 7 | +--------+ |
258 | | |
259 | | // Please note that due to the way block exploration works, the smallest step |
260 | | // used is 2 (i.e. no two adjacent blocks will be explored consecutively). |
261 | | // Also, the exploration is designed to visit each block candidate only once. |
262 | 0 | while (step > 1) { |
263 | 0 | for (int x_pos = x_offset; x_pos < x_end; x_pos += step) { |
264 | 0 | for (int y_pos = y_offset; y_pos < y_end; y_pos += step) { |
265 | 0 | const int pos = y_pos * pic_width + x_pos; |
266 | 0 | BlockHash curr_block_hash; |
267 | |
|
268 | 0 | curr_block_hash.x = x_pos; |
269 | 0 | curr_block_hash.y = y_pos; |
270 | |
|
271 | 0 | const uint32_t hash_value1 = (pic_hash[pos] & crc_mask) + add_value; |
272 | 0 | curr_block_hash.hash_value2 = pic_hash[pos]; |
273 | 0 | if (!hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash, max_cand_per_bucket)) { |
274 | 0 | return false; |
275 | 0 | } |
276 | 0 | } |
277 | 0 | } |
278 | | |
279 | | // Adjust offsets and step sizes with this state machine. |
280 | | // State 0 is needed because no blocks in pic_hash have been explored, |
281 | | // so exploration requires a way to account for blocks with both zero |
282 | | // x_offset and zero y_offset. |
283 | | // State 0 is always meant to be executed first, but the relative order of |
284 | | // states 1, 2 and 3 can be arbitrary, as long as no two adjacent blocks |
285 | | // are explored consecutively. |
286 | 0 | if (x_offset == 0 && y_offset == 0) { |
287 | | // State 0 -> State 1: special case |
288 | | // This state transition will only execute when step == block_size |
289 | 0 | x_offset = step / 2; |
290 | 0 | } else if (x_offset == step / 2 && y_offset == 0) { |
291 | | // State 1 -> State 2 |
292 | 0 | x_offset = 0; |
293 | 0 | y_offset = step / 2; |
294 | 0 | } else if (x_offset == 0 && y_offset == step / 2) { |
295 | | // State 2 -> State 3 |
296 | 0 | x_offset = step / 2; |
297 | 0 | } else { |
298 | 0 | assert(x_offset == step / 2 && y_offset == step / 2); |
299 | | // State 3 -> State 1: We've fully explored all the coordinates for the |
300 | | // current step size, continue by halving the step size |
301 | 0 | step /= 2; |
302 | 0 | x_offset = step / 2; |
303 | 0 | y_offset = 0; |
304 | 0 | } |
305 | 0 | } |
306 | | |
307 | 0 | return true; |
308 | 0 | } |
309 | | |
310 | | void svt_av1_get_block_hash_value(uint8_t* y_src, int stride, int block_size, uint32_t* hash_value1, |
311 | | uint32_t* hash_value2, int use_highbitdepth, struct PictureControlSet* pcs, |
312 | 0 | IntraBcContext* x) { |
313 | 0 | UNUSED(pcs); |
314 | 0 | const int add_value = hash_block_size_to_index(block_size) << crc_bits; |
315 | 0 | assert(add_value >= 0); |
316 | 0 | const int crc_mask = (1 << crc_bits) - 1; |
317 | | |
318 | | // 2x2 subblock hash values in current CU |
319 | 0 | int sub_block_in_width = (block_size >> 1); |
320 | 0 | if (use_highbitdepth) { |
321 | 0 | uint16_t pixel_to_hash[4]; |
322 | 0 | uint16_t* y16_src = CONVERT_TO_SHORTPTR(y_src); |
323 | 0 | for (int y_pos = 0; y_pos < block_size; y_pos += 2) { |
324 | 0 | for (int x_pos = 0; x_pos < block_size; x_pos += 2) { |
325 | 0 | int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); |
326 | 0 | get_pixels_in_1d_short_array_by_block_2x2(y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); |
327 | 0 | assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); |
328 | | // For HBD, we either have 40 or 48 bits of input data that the xor hash |
329 | | // reduce to 32 bits. We intentionally don't want to "discard" bits to |
330 | | // avoid any kind of biasing. |
331 | 0 | x->hash_value_buffer[0][pos] = get_xor_hash_value_hbd( |
332 | 0 | pixel_to_hash[0], pixel_to_hash[1], pixel_to_hash[2], pixel_to_hash[3]); |
333 | 0 | } |
334 | 0 | } |
335 | 0 | } else { |
336 | 0 | uint8_t pixel_to_hash[4]; |
337 | 0 | for (int y_pos = 0; y_pos < block_size; y_pos += 2) { |
338 | 0 | for (int x_pos = 0; x_pos < block_size; x_pos += 2) { |
339 | 0 | int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); |
340 | 0 | get_pixels_in_1d_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, stride, pixel_to_hash); |
341 | 0 | assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); |
342 | | // This 2x2 hash isn't used directly as a "key" for the hash table, so |
343 | | // we can afford to just copy the 4 8-bit pixel values as a single |
344 | | // 32-bit value directly. (i.e. there are no concerns of a lack of |
345 | | // uniform distribution) |
346 | 0 | x->hash_value_buffer[0][pos] = get_identity_hash_value( |
347 | 0 | pixel_to_hash[0], pixel_to_hash[1], pixel_to_hash[2], pixel_to_hash[3]); |
348 | 0 | } |
349 | 0 | } |
350 | 0 | } |
351 | |
|
352 | 0 | int src_sub_block_in_width = sub_block_in_width; |
353 | 0 | sub_block_in_width >>= 1; |
354 | |
|
355 | 0 | int src_idx = 0; |
356 | 0 | int dst_idx = 1 - src_idx; |
357 | | |
358 | | // 4x4 subblock hash values to current block hash values |
359 | 0 | uint32_t to_hash[4]; |
360 | 0 | for (int sub_width = 4; sub_width <= block_size; sub_width *= 2, src_idx = 1 - src_idx) { |
361 | 0 | dst_idx = 1 - src_idx; |
362 | |
|
363 | 0 | int dst_pos = 0; |
364 | 0 | for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { |
365 | 0 | for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { |
366 | 0 | int src_pos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); |
367 | |
|
368 | 0 | assert(src_pos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); |
369 | 0 | assert(src_pos + src_sub_block_in_width + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); |
370 | 0 | assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); |
371 | |
|
372 | 0 | to_hash[0] = x->hash_value_buffer[src_idx][src_pos]; |
373 | 0 | to_hash[1] = x->hash_value_buffer[src_idx][src_pos + 1]; |
374 | 0 | to_hash[2] = x->hash_value_buffer[src_idx][src_pos + src_sub_block_in_width]; |
375 | 0 | to_hash[3] = x->hash_value_buffer[src_idx][src_pos + src_sub_block_in_width + 1]; |
376 | |
|
377 | 0 | x->hash_value_buffer[dst_idx][dst_pos] = svt_av1_get_crc32c_value( |
378 | 0 | &x->crc_calculator, (uint8_t*)to_hash, sizeof(to_hash)); |
379 | 0 | dst_pos++; |
380 | 0 | } |
381 | 0 | } |
382 | |
|
383 | 0 | src_sub_block_in_width = sub_block_in_width; |
384 | 0 | sub_block_in_width >>= 1; |
385 | 0 | } |
386 | |
|
387 | 0 | *hash_value1 = (x->hash_value_buffer[dst_idx][0] & crc_mask) + add_value; |
388 | 0 | *hash_value2 = x->hash_value_buffer[dst_idx][0]; |
389 | 0 | } |