Coverage Report

Created: 2025-08-26 06:38

/src/libavc/common/ih264_resi_trans_quant.c
Line
Count
Source (jump to first uncovered line)
1
/******************************************************************************
2
 *
3
 * Copyright (C) 2015 The Android Open Source Project
4
 *
5
 * Licensed under the Apache License, Version 2.0 (the "License");
6
 * you may not use this file except in compliance with the License.
7
 * You may obtain a copy of the License at:
8
 *
9
 * http://www.apache.org/licenses/LICENSE-2.0
10
 *
11
 * Unless required by applicable law or agreed to in writing, software
12
 * distributed under the License is distributed on an "AS IS" BASIS,
13
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
 * See the License for the specific language governing permissions and
15
 * limitations under the License.
16
 *
17
 *****************************************************************************
18
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19
*/
20
21
/**
22
*******************************************************************************
23
* @file
24
*  ih264_resi_trans_quant.c
25
*
26
* @brief
27
*  Contains function definitions for single stage forward transform for H.264
28
*  It will calculate the residue, do the cf and then do quantization
29
*
30
* @author
31
*  ittiam
32
*
33
* @par List of Functions:
34
*  - ih264_resi_trans_quant_4x4
35
*  - ih264_resi_trans_quant_chroma_4x4
36
*  - ih264_hadamard_quant_4x4
37
*  - ih264_hadamard_quant_2x2_uv
38
*  - ih264_resi_trans_quant_8x8
39
*
40
* @remarks
41
*  none
42
*
43
*******************************************************************************
44
*/
45
46
47
/*****************************************************************************/
48
/* File Includes                                                             */
49
/*****************************************************************************/
50
51
/* System Include Files */
52
#include <stddef.h>
53
54
/* User Include Files */
55
#include "ih264_typedefs.h"
56
#include "ih264_defs.h"
57
#include "ih264_macros.h"
58
#include "ih264_size_defs.h"
59
#include "ih264_trans_macros.h"
60
#include "ih264_trans_data.h"
61
#include "ih264_structs.h"
62
#include "ih264_trans_quant_itrans_iquant.h"
63
64
65
/*****************************************************************************/
66
/* Function Definitions                                                      */
67
/*****************************************************************************/
68
69
/**
70
*******************************************************************************
71
*
72
* @brief
73
*  This function performs forward transform and quantization on a 4x4 block
74
*
75
* @par Description:
76
*  The function accepts source buffer and estimation buffer. From these, it
77
*  computes the residue. This is residue is then transformed and quantized.
78
*  The transform and quantization are in placed computed. They use the residue
79
*  buffer for this.
80
*
81
* @param[in] pu1_src
82
*  Pointer to source sub-block
83
*
84
* @param[in] pu1_pred
85
*  Pointer to prediction sub-block
86
*
87
* @param[in] pi2_out
88
*  Pointer to residual sub-block
89
*
90
* @param[in] src_strd
91
*  Source stride
92
*
93
* @param[in] pred_strd
94
*  Prediction stride
95
*
96
* @param[in] pu2_scale_matrix
97
*  Pointer to Forward Quant Scale Matrix
98
*
99
* @param[in] pu2_threshold_matrix
100
*  Pointer to Forward Quant Threshold Matrix
101
*
102
* @param[in] u4_qbits
103
*  QP_BITS_h264_4x4 + floor(QP/6)
104
*
105
* @param[in] u4_round_factor
106
*  Quantization Round factor
107
*
108
* @param[out] pu1_nnz
109
*  Total non-zero coefficients in the current sub-block
110
*
111
* @param[in] pi2_alt_dc_addr
112
*  DC Coefficient of the block
113
*
114
* @remarks none
115
*
116
*******************************************************************************
117
*/
118
void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
119
                                UWORD8 *pu1_pred,
120
                                WORD16 *pi2_out,
121
                                WORD32 src_strd,
122
                                WORD32 pred_strd,
123
                                const UWORD16 *pu2_scale_matrix,
124
                                const UWORD16 *pu2_threshold_matrix,
125
                                UWORD32 u4_qbits,
126
                                UWORD32 u4_round_factor,
127
                                UWORD8 *pu1_nnz,
128
                                WORD16 *pi2_alt_dc_addr)
129
0
{
130
0
    UWORD32 i;
131
0
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
132
0
    WORD32 i4_value;
133
0
    WORD16 *pi2_out_tmp = pi2_out;
134
0
    UWORD32 u4_nonzero_coeff = 0;
135
136
0
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
137
0
    {
138
        /* computing prediction error (residue) */
139
0
        x4 = pu1_src[0] - pu1_pred[0];
140
0
        x5 = pu1_src[1] - pu1_pred[1];
141
0
        x6 = pu1_src[2] - pu1_pred[2];
142
0
        x7 = pu1_src[3] - pu1_pred[3];
143
144
        /* Horizontal transform */
145
0
        x0 = x4 + x7;
146
0
        x1 = x5 + x6;
147
0
        x2 = x5 - x6;
148
0
        x3 = x4 - x7;
149
150
0
        pi2_out_tmp[0] = x0 + x1;
151
0
        pi2_out_tmp[1] = (x3 << 1) + x2;
152
0
        pi2_out_tmp[2] = x0 - x1;
153
0
        pi2_out_tmp[3] = x3 - (x2 << 1);
154
155
        /* pointing to next row; */
156
0
        pu1_src += src_strd;
157
0
        pu1_pred += pred_strd;
158
0
        pi2_out_tmp += 4;
159
0
    }
160
161
0
    pi2_out_tmp = pi2_out;
162
0
    for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
163
0
    {
164
        /* Vertical transform and quantization */
165
0
        x4 = pi2_out_tmp[0];
166
0
        x5 = pi2_out_tmp[4];
167
0
        x6 = pi2_out_tmp[8];
168
0
        x7 = pi2_out_tmp[12];
169
170
0
        x0 = x4 + x7;
171
0
        x1 = x5 + x6;
172
0
        x2 = x5 - x6;
173
0
        x3 = x4 - x7;
174
175
        /* quantization is done in place */
176
0
        i4_value = x0 + x1;
177
0
        if(i == 0)
178
0
        {
179
0
            (*pi2_alt_dc_addr) = i4_value;
180
0
        }
181
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
182
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
183
0
                  u4_nonzero_coeff);
184
0
        pi2_out_tmp[0] = i4_value;
185
186
0
        i4_value = (x3 << 1) + x2;
187
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[4],
188
0
                  pu2_scale_matrix[4], u4_round_factor, u4_qbits,
189
0
                  u4_nonzero_coeff);
190
0
        pi2_out_tmp[4] = i4_value;
191
192
0
        i4_value = x0 - x1;
193
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[8],
194
0
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
195
0
                  u4_nonzero_coeff);
196
0
        pi2_out_tmp[8] = i4_value;
197
198
0
        i4_value = x3 - (x2 << 1);
199
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[12],
200
0
                  pu2_scale_matrix[12], u4_round_factor, u4_qbits,
201
0
                  u4_nonzero_coeff);
202
0
        pi2_out_tmp[12] = i4_value;
203
204
0
        pi2_out_tmp++;
205
0
        pu2_scale_matrix++;
206
0
        pu2_threshold_matrix++;
207
0
    }
208
209
    /* Return total nonzero coefficients in the current sub block */
210
0
    *pu1_nnz =  u4_nonzero_coeff;
211
0
}
212
213
/**
214
*******************************************************************************
215
*
216
* @brief
217
*  This function performs forward transform and quantization on a 4x4
218
*  chroma block with interleaved values
219
*
220
* @par Description:
221
*  The function accepts source buffer and estimation buffer. From these, it
222
*  computes the residue. This is residue is then transformed and quantized.
223
*  The transform and quantization are in placed computed. They use the residue
224
*  buffer for this.
225
*
226
* @param[in] pu1_src
227
*  Pointer to source sub-block
228
*
229
* @param[in] pu1_pred
230
*  Pointer to prediction sub-block
231
*
232
* @param[in] pi2_out
233
*  Pointer to residual sub-block
234
*
235
* @param[in] src_strd
236
*  Source stride
237
*
238
* @param[in] pred_strd
239
*  Prediction stride
240
*
241
* @param[in] pu2_scale_matrix
242
*  Pointer to Forward Quant Scale Matrix
243
*
244
* @param[in] pu2_threshold_matrix
245
*  Pointer to Forward Quant Threshold Matrix
246
*
247
* @param[in] u4_qbits
248
*  QP_BITS_h264_4x4 + floor(QP/6)
249
*
250
* @param[in] u4_round_factor
251
*  Quantization Round factor
252
*
253
* @param[out] pu1_nnz
254
*  Total non-zero coefficients in the current sub-block
255
*
256
* @param[in] pi2_alt_dc_addr
257
*  DC Coefficient of the block
258
*
259
* @remarks none
260
*
261
*******************************************************************************
262
*/
263
void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
264
                                       UWORD8 *pu1_pred,
265
                                       WORD16 *pi2_out,
266
                                       WORD32 src_strd,
267
                                       WORD32 pred_strd,
268
                                       const UWORD16 *pu2_scale_matrix,
269
                                       const UWORD16 *pu2_threshold_matrix,
270
                                       UWORD32 u4_qbits,
271
                                       UWORD32 u4_round_factor,
272
                                       UWORD8 *pu1_nnz,
273
                                       WORD16 *pu1_dc_alt_addr)
274
0
{
275
0
    UWORD32 i;
276
0
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
277
0
    WORD32 i4_value;
278
0
    WORD16 *pi2_out_tmp = pi2_out;
279
0
    UWORD32 u4_nonzero_coeff = 0;
280
281
0
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
282
0
    {
283
        /* computing prediction error (residue) */
284
0
        x4 = pu1_src[0] - pu1_pred[0];
285
0
        x5 = pu1_src[2] - pu1_pred[2];
286
0
        x6 = pu1_src[4] - pu1_pred[4];
287
0
        x7 = pu1_src[6] - pu1_pred[6];
288
289
        /* Horizontal transform */
290
0
        x0 = x4 + x7;
291
0
        x1 = x5 + x6;
292
0
        x2 = x5 - x6;
293
0
        x3 = x4 - x7;
294
295
0
        pi2_out_tmp[0] = x0 + x1;
296
0
        pi2_out_tmp[1] = (x3 << 1) + x2;
297
0
        pi2_out_tmp[2] = x0 - x1;
298
0
        pi2_out_tmp[3] = x3 - (x2 << 1);
299
300
        /* pointing to next row; */
301
0
        pu1_src += src_strd;
302
0
        pu1_pred += pred_strd;
303
0
        pi2_out_tmp += 4;
304
0
    }
305
306
0
    pi2_out_tmp = pi2_out;
307
0
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
308
0
    {
309
        /* Vertical transform and quantization */
310
0
        x4 = pi2_out_tmp[0];
311
0
        x5 = pi2_out_tmp[4];
312
0
        x6 = pi2_out_tmp[8];
313
0
        x7 = pi2_out_tmp[12];
314
315
0
        x0 = x4 + x7;
316
0
        x1 = x5 + x6;
317
0
        x2 = x5 - x6;
318
0
        x3 = x4 - x7;
319
320
        /* quantization is done in place */
321
0
        i4_value = x0 + x1;
322
0
        if(i == 0)
323
0
        {
324
0
            *pu1_dc_alt_addr = i4_value;
325
0
        }
326
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
327
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
328
0
                  u4_nonzero_coeff);
329
0
        pi2_out_tmp[0] = i4_value;
330
331
0
        i4_value = (x3 << 1) + x2;
332
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[4],
333
0
                  pu2_scale_matrix[4], u4_round_factor, u4_qbits,
334
0
                  u4_nonzero_coeff);
335
0
        pi2_out_tmp[4] = i4_value;
336
337
0
        i4_value = x0 - x1;
338
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[8],
339
0
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
340
0
                  u4_nonzero_coeff);
341
0
        pi2_out_tmp[8] = i4_value;
342
343
0
        i4_value = x3 - (x2 << 1);
344
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[12],
345
0
                  pu2_scale_matrix[12], u4_round_factor, u4_qbits,
346
0
                  u4_nonzero_coeff);
347
0
        pi2_out_tmp[12] = i4_value;
348
349
0
        pi2_out_tmp++;
350
0
        pu2_scale_matrix++;
351
0
        pu2_threshold_matrix++;
352
0
    }
353
354
    /* Return total nonzero coefficients in the current sub block */
355
0
    *pu1_nnz =  u4_nonzero_coeff;
356
0
}
357
358
/**
359
*******************************************************************************
360
*
361
* @brief
362
*  This function performs forward hadamard transform and quantization on a
363
*  4x4 block
364
*
365
* @par Description:
366
*  The function accepts source buffer and estimation buffer. From these, it
367
*  computes the residue. This is residue is then transformed and quantized.
368
*  The transform and quantization are in placed computed. They use the residue
369
*  buffer for this.
370
*
371
* @param[in] pu1_src
372
*  Pointer to source sub-block
373
*
374
* @param[in] pi2_dst
375
*  Pointer to destination sub-block
376
*
377
* @param[in] pu2_threshold_matrix
378
*  Pointer to Forward Quant Threshold Matrix
379
*
380
* @param[in] pu2_scale_matrix
381
*  Pointer to Forward Quant Scale Matrix
382
*
383
* @param[in] u4_qbits
384
*  QP_BITS_h264_4x4 + floor(QP/6)
385
*
386
* @param[in] u4_round_factor
387
*  Quantization Round factor
388
*
389
* @param[out] pu1_nnz
390
*  Total non-zero coefficients in the current sub-block
391
*
392
* @remarks none
393
*
394
********************************************************************************
395
*/
396
void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
397
                              WORD16 *pi2_dst,
398
                              const UWORD16 *pu2_scale_matrix,
399
                              const UWORD16 *pu2_threshold_matrix,
400
                              UWORD32 u4_qbits,
401
                              UWORD32 u4_round_factor,
402
                              UWORD8 *pu1_nnz)
403
0
{
404
0
    WORD32 i;
405
0
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7, i4_value;
406
407
0
    *pu1_nnz = 0;
408
409
0
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
410
0
    {
411
0
        x4 = pi2_src[0];
412
0
        x5 = pi2_src[1];
413
0
        x6 = pi2_src[2];
414
0
        x7 = pi2_src[3];
415
416
0
        x0 = x4 + x7;
417
0
        x1 = x5 + x6;
418
0
        x2 = x5 - x6;
419
0
        x3 = x4 - x7;
420
421
0
        pi2_dst[0] = x0 + x1;
422
0
        pi2_dst[1] = x3 + x2;
423
0
        pi2_dst[2] = x0 - x1;
424
0
        pi2_dst[3] = x3 - x2;
425
426
0
        pi2_src += 4;
427
0
        pi2_dst += 4;
428
0
    }
429
430
    /* Vertical transform and quantization */
431
0
    pi2_dst -= SUB_BLK_WIDTH_4x4 << 2;
432
433
0
    for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
434
0
    {
435
0
        x4 = pi2_dst[0];
436
0
        x5 = pi2_dst[4];
437
0
        x6 = pi2_dst[8];
438
0
        x7 = pi2_dst[12];
439
440
0
        x0 = x4 + x7;
441
0
        x1 = x5 + x6;
442
0
        x2 = x5 - x6;
443
0
        x3 = x4 - x7;
444
445
0
        i4_value = (x0 + x1) >> 1;
446
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
447
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
448
0
        pi2_dst[0] = i4_value;
449
450
0
        i4_value = (x3 + x2) >> 1;
451
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
452
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
453
0
        pi2_dst[4] = i4_value;
454
455
0
        i4_value = (x0 - x1) >> 1;
456
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
457
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
458
0
        pi2_dst[8] = i4_value;
459
460
0
        i4_value = (x3 - x2) >> 1;
461
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
462
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
463
0
        pi2_dst[12] = i4_value;
464
465
0
        pi2_dst++;
466
0
    }
467
0
}
468
469
/**
470
*******************************************************************************
471
*
472
* @brief
473
*   This function performs forward hadamard transform and quantization on a
474
*   2x2 block for both U and V planes
475
*
476
* @par Description:
477
*  The function accepts source buffer and estimation buffer. From these, it
478
*  computes the residue. This is residue is then transformed and quantized.
479
*  The transform and quantization are in placed computed. They use the residue
480
*  buffer for this.
481
*
482
* @param[in] pu1_src
483
*  Pointer to source sub-block
484
*
485
* @param[in] pi2_dst
486
*  Pointer to destination sub-block
487
*
488
* @param[in] pu2_threshold_matrix
489
*  Pointer to Forward Quant Threshold Matrix
490
*
491
* @param[in] pu2_scale_matrix
492
*  Pointer to Forward Quant Scale Matrix
493
*
494
* @param[in] u4_qbits
495
*  QP_BITS_h264_4x4 + floor(QP/6)
496
*
497
* @param[in] u4_round_factor
498
*  Quantization Round factor
499
*
500
* @param[out] pu1_nnz
501
*  Total non-zero coefficients in the current sub-block
502
*
503
* @remarks
504
*  NNZ for dc is populated at 0 and 5th position of pu1_nnz
505
*
506
*******************************************************************************
507
*/
508
void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
509
                                 WORD16 *pi2_dst,
510
                                 const UWORD16 *pu2_scale_matrix,
511
                                 const UWORD16 *pu2_threshold_matrix,
512
                                 UWORD32 u4_qbits,
513
                                 UWORD32 u4_round_factor,
514
                                 UWORD8 *pu1_nnz)
515
0
{
516
0
    WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
517
0
    WORD32 i4_value, plane;
518
519
0
    for(plane = 0; plane < 2; plane++)
520
0
    {
521
0
        pu1_nnz[plane] = 0;
522
523
        /* Horizontal transform */
524
0
        x4 = pi2_src[0];
525
0
        x5 = pi2_src[1];
526
0
        x6 = pi2_src[2];
527
0
        x7 = pi2_src[3];
528
529
0
        x0 = x4 + x5;
530
0
        x1 = x4 - x5;
531
0
        x2 = x6 + x7;
532
0
        x3 = x6 - x7;
533
534
        /* Vertical transform and quantization */
535
0
        i4_value = (x0 + x2);
536
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
537
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
538
0
                  pu1_nnz[plane]);
539
0
        pi2_dst[0] = i4_value;
540
541
0
        i4_value = (x0 - x2);
542
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
543
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
544
0
                  pu1_nnz[plane]);
545
0
        pi2_dst[2] = i4_value;
546
547
0
        i4_value = (x1 - x3);
548
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
549
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
550
0
                  pu1_nnz[plane]);
551
0
        pi2_dst[3] = i4_value;
552
553
0
        i4_value = (x1 + x3);
554
0
        FWD_QUANT(i4_value, pu2_threshold_matrix[0],
555
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
556
0
                  pu1_nnz[plane]);
557
0
        pi2_dst[1] = i4_value;
558
559
0
        pi2_dst += 4;
560
0
        pi2_src += 4;
561
0
    }
562
0
}
563
564
/**
565
*******************************************************************************
566
*
567
* @brief
568
*  This function performs Single stage forward transform CF8 and quantization
569
*  on 8x8 blocks
570
*
571
* @par Description:
572
*  Performs single stage 8x8 forward transform CF8 after calculating the residue
573
*  The result is then quantized
574
*
575
* @param[in] pu1_src
576
*  Pointer to source sub-block
577
*
578
* @param[in] pu1_pred
579
*  Pointer to prediction sub-block
580
*
581
* @param[in] pi2_out
582
*  Pointer to residual sub-block
583
*
584
* @param[in] src_strd
585
*  Source stride
586
*
587
* @param[in] pred_strd
588
*  Prediction stride
589
*
590
* @param[in] pu2_scale_matrix
591
*  Pointer to Forward Quant Scale Matrix
592
*
593
* @param[in] pu2_threshold_matrix
594
*  Pointer to Forward Quant Threshold Matrix
595
*
596
* @param[in] u4_qbits
597
*  QP_BITS_h264_8x8 + floor(QP/6)
598
*
599
* @param[in] u4_round_factor
600
*  Quantization Round factor
601
*
602
* @param[out] pu1_nnz
603
*  Total non-zero coefficients in the current sub-block
604
*
605
* @param[in] pi2_alt_dc_addr
606
*  UNUSED
607
*
608
* @returns none
609
*
610
* @remarks:
611
*  TODO: This function needs to be tested before integration
612
*
613
*******************************************************************************
614
*/
615
void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
616
                                UWORD8 *pu1_pred,
617
                                WORD16 *pi2_out,
618
                                WORD32 src_strd,
619
                                WORD32 pred_strd,
620
                                const UWORD16 *pu2_scale_matrix,
621
                                const UWORD16 *pu2_threshold_matrix,
622
                                UWORD32 u4_qbits,
623
                                UWORD32 u4_round_factor,
624
                                UWORD8 *pu1_nnz,
625
                                WORD16 *pu1_dc_alt_addr)
626
0
{
627
0
    WORD16 *pi2_out_tmp = pi2_out;
628
0
    WORD32 i;
629
0
    WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
630
0
    WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
631
0
    UWORD32 u4_nonzero_coeff = 0;
632
633
0
    UNUSED(pu1_dc_alt_addr);
634
635
    /* Horizontal transform */
636
0
    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
637
0
    {
638
0
        r0 = pu1_src[0];
639
0
        r0 -= pu1_pred[0];
640
0
        r1 = pu1_src[1];
641
0
        r1 -= pu1_pred[1];
642
0
        r2 = pu1_src[2]; r2 -= pu1_pred[2];
643
0
        r3 = pu1_src[3]; r3 -= pu1_pred[3];
644
0
        r4 = pu1_src[4]; r4 -= pu1_pred[4];
645
0
        r5 = pu1_src[5]; r5 -= pu1_pred[5];
646
0
        r6 = pu1_src[6]; r6 -= pu1_pred[6];
647
0
        r7 = pu1_src[7]; r7 -= pu1_pred[7];
648
649
0
        a0 = r0 + r7;
650
0
        a1 = r1 + r6;
651
0
        a2 = r2 + r5;
652
0
        a3 = r3 + r4;
653
654
0
        a4 = a0 + a3;
655
0
        a5 = a1 + a2;
656
0
        a6 = a0 - a3;
657
0
        a7 = a1 - a2;
658
659
0
        pi2_out_tmp[0] = a4 + a5;
660
0
        pi2_out_tmp[2] = a6 + (a7 >> 1);
661
0
        pi2_out_tmp[4] = a4 - a5;
662
0
        pi2_out_tmp[6] = (a6 >> 1) - a7;
663
664
0
        a0 = r0 - r7;
665
0
        a1 = r1 - r6;
666
0
        a2 = r2 - r5;
667
0
        a3 = r3 - r4;
668
669
0
        a4 = a1 + a2 + ((a0 >> 1) + a0);
670
0
        a5 = a0 - a3 - ((a2 >> 1) + a2);
671
0
        a6 = a0 + a3 - ((a1 >> 1) + a1);
672
0
        a7 = a1 - a2 + ((a3 >> 1) + a3);
673
674
0
        pi2_out_tmp[1] = a4 + (a7 >> 2);
675
0
        pi2_out_tmp[3] = a5 + (a6 >> 2);
676
0
        pi2_out_tmp[5] = a6 - (a5 >> 2);
677
0
        pi2_out_tmp[7] = (a4 >> 2) - a7;
678
679
0
        pu1_src += src_strd;
680
0
        pu1_pred += pred_strd;
681
0
        pi2_out_tmp += 8;
682
0
    }
683
684
    /* vertical transform and quant */
685
0
    pi2_out_tmp = pi2_out;
686
0
    for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
687
0
    {
688
0
        r0 = pi2_out_tmp[0];
689
0
        r1 = pi2_out_tmp[8];
690
0
        r2 = pi2_out_tmp[16];
691
0
        r3 = pi2_out_tmp[24];
692
0
        r4 = pi2_out_tmp[32];
693
0
        r5 = pi2_out_tmp[40];
694
0
        r6 = pi2_out_tmp[48];
695
0
        r7 = pi2_out_tmp[56];
696
697
0
        a0 = r0 + r7;
698
0
        a1 = r1 + r6;
699
0
        a2 = r2 + r5;
700
0
        a3 = r3 + r4;
701
702
0
        a4 = a0 + a3;
703
0
        a5 = a1 + a2;
704
0
        a6 = a0 - a3;
705
0
        a7 = a1 - a2;
706
707
0
        a0 = r0 - r7;
708
0
        a1 = r1 - r6;
709
0
        a2 = r2 - r5;
710
0
        a3 = r3 - r4;
711
712
0
        r0 = a4 + a5;
713
0
        r2 = a6 + (a7 >> 1);
714
0
        r4 = a4 - a5;
715
0
        r6 = (a6 >> 1) - a7;
716
717
0
        a4 = a1 + a2 + ((a0 >> 1) + a0);
718
0
        a5 = a0 - a3 - ((a2 >> 1) + a2);
719
0
        a6 = a0 + a3 - ((a1 >> 1) + a1);
720
0
        a7 = a1 - a2 + ((a3 >> 1) + a3);
721
722
0
        r1 = a4 + (a7 >> 2);
723
0
        r3 = a5 + (a6 >> 2);
724
0
        r5 = a6 - (a5 >> 2);
725
0
        r7 = (a4 >> 2) - a7;
726
727
0
        FWD_QUANT(r0, pu2_threshold_matrix[0],
728
0
                  pu2_scale_matrix[0], u4_round_factor, u4_qbits,
729
0
                  u4_nonzero_coeff);
730
0
        pi2_out_tmp[0] = r0;
731
732
0
        FWD_QUANT(r1, pu2_threshold_matrix[8],
733
0
                  pu2_scale_matrix[8], u4_round_factor, u4_qbits,
734
0
                  u4_nonzero_coeff);
735
0
        pi2_out_tmp[8] = r1;
736
737
0
        FWD_QUANT(r2, pu2_threshold_matrix[16],
738
0
                  pu2_scale_matrix[16], u4_round_factor, u4_qbits,
739
0
                  u4_nonzero_coeff);
740
0
        pi2_out_tmp[16] = r2;
741
742
0
        FWD_QUANT(r3, pu2_threshold_matrix[24],
743
0
                  pu2_scale_matrix[24], u4_round_factor, u4_qbits,
744
0
                  u4_nonzero_coeff);
745
0
        pi2_out_tmp[24] = r3;
746
747
0
        FWD_QUANT(r4, pu2_threshold_matrix[32],
748
0
                  pu2_scale_matrix[32], u4_round_factor, u4_qbits,
749
0
                  u4_nonzero_coeff);
750
0
        pi2_out_tmp[32] = r4;
751
752
0
        FWD_QUANT(r5, pu2_threshold_matrix[40],
753
0
                  pu2_scale_matrix[40], u4_round_factor, u4_qbits,
754
0
                  u4_nonzero_coeff);
755
0
        pi2_out_tmp[40] = r5;
756
757
0
        FWD_QUANT(r6, pu2_threshold_matrix[48],
758
0
                  pu2_scale_matrix[48], u4_round_factor, u4_qbits,
759
0
                  u4_nonzero_coeff);
760
0
        pi2_out_tmp[48] = r6;
761
762
0
        FWD_QUANT(r7, pu2_threshold_matrix[56],
763
0
                  pu2_scale_matrix[56], u4_round_factor, u4_qbits,
764
0
                  u4_nonzero_coeff);
765
0
        pi2_out_tmp[56] = r7;
766
767
0
        pi2_out_tmp++;
768
0
        pu2_scale_matrix++;
769
0
        pu2_threshold_matrix++;
770
0
    }
771
    /* Return total nonzero coefficients in the current sub block */
772
0
    *pu1_nnz =  u4_nonzero_coeff;
773
0
}