/src/vlc/modules/packetizer/startcode_helper.h
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * startcode_helper.h: Startcodes helpers |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2016 VideoLAN Authors |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify it |
7 | | * under the terms of the GNU Lesser General Public License as published by |
8 | | * the Free Software Foundation; either version 2.1 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public License |
17 | | * along with this program; if not, write to the Free Software Foundation, |
18 | | * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. |
19 | | *****************************************************************************/ |
20 | | #ifndef VLC_STARTCODE_HELPER_H_ |
21 | | #define VLC_STARTCODE_HELPER_H_ |
22 | | |
23 | | #include <vlc_cpu.h> |
24 | | |
25 | | #ifdef CAN_COMPILE_SSE2 |
26 | | # if defined __has_attribute |
27 | | # if __has_attribute(__vector_size__) |
28 | | # define HAS_ATTRIBUTE_VECTORSIZE |
29 | | # endif |
30 | | # endif |
31 | | |
32 | | # ifdef HAS_ATTRIBUTE_VECTORSIZE |
33 | | typedef unsigned char v16qu __attribute__((__vector_size__(16))); |
34 | | # endif |
35 | | #endif |
36 | | |
37 | | /* Efficiently looks up an AnnexB startcode 0x00 0x00 0x01 |
38 | | * using a trick roughly 4 times faster than single-byte lookup. */ |
39 | | |
40 | 14.2M | #define TRY_MATCH(p,a) {\ |
41 | 14.2M | if (p[a+1] == 0) {\ |
42 | 11.3M | if (p[a+0] == 0 && p[a+2] == 1)\ |
43 | 11.3M | return a+p;\ |
44 | 11.3M | if (p[a+2] == 0 && p[a+3] == 1)\ |
45 | 10.9M | return a+p+1;\ |
46 | 10.9M | }\ |
47 | 14.2M | if (p[a+3] == 0) {\ |
48 | 11.0M | if (p[a+2] == 0 && p[a+4] == 1)\ |
49 | 11.0M | return a+p+2;\ |
50 | 11.0M | if (p[a+4] == 0 && p[a+5] == 1)\ |
51 | 10.7M | return a+p+3;\ |
52 | 10.7M | }\ |
53 | 13.6M | } |
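
[Annotation, not part of the original header: TRY_MATCH inspects one 4-byte window at offset a but only probes p[a+1] and p[a+3], because a startcode beginning at a, a+1, a+2 or a+3 necessarily puts one of its two leading zero bytes on one of those two pivots; each pivot hit then distinguishes the two candidate positions that share it. A minimal brute-force check of that claim, as a hypothetical standalone program:]

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Place 00 00 01 at every offset of a 4-byte window and confirm
         * that a zero byte always lands on pivot index 1 or 3. */
        for (int off = 0; off < 4; off++) {
            uint8_t buf[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
            buf[off] = 0x00; buf[off+1] = 0x00; buf[off+2] = 0x01;
            printf("startcode at %d: pivot1=%d pivot3=%d\n",
                   off, buf[1] == 0x00, buf[3] == 0x00);
        }
        return 0;
    }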
54 | | |
55 | | #ifdef CAN_COMPILE_SSE2 |
56 | | |
57 | | __attribute__ ((__target__ ("sse2"))) |
58 | | static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end ) |
59 | 4.10M | { |
60 | | /* First align to 16 */ |
61 | | /* Skipping this step and doing unaligned loads isn't faster */ |
62 | 4.10M | const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15); |
63 | 17.9M | for (end -= 3; p < alignedend && p <= end; p++) { |
64 | 16.6M | if (p[0] == 0 && p[1] == 0 && p[2] == 1) |
65 | 2.82M | return p; |
66 | 16.6M | } |
67 | | |
68 | 1.27M | if( p == end ) |
69 | 1.15k | return NULL; |
70 | | |
71 | 1.27M | alignedend = end - ((intptr_t) end & 15); |
72 | 1.27M | if( alignedend > p ) |
73 | 1.24M | { |
74 | 1.24M | # ifdef HAS_ATTRIBUTE_VECTORSIZE |
75 | 1.24M | const v16qu zeros = { 0 }; |
76 | 1.24M | # endif |
77 | | |
78 | 12.3M | for( ; p < alignedend; p += 16) |
79 | 12.2M | { |
80 | 12.2M | uint32_t match; |
81 | 12.2M | # ifdef HAS_ATTRIBUTE_VECTORSIZE |
82 | 12.2M | asm volatile( |
83 | 12.2M | "movdqa 0(%[v]), %%xmm0\n" |
84 | 12.2M | "pcmpeqb %[czero], %%xmm0\n" |
85 | 12.2M | "pmovmskb %%xmm0, %[match]\n" /* mask will be in reversed match order */ |
86 | 12.2M | : [match]"=r"(match) |
87 | 12.2M | : [v]"r"(p), [czero]"x"(zeros) |
88 | 12.2M | : "xmm0" |
89 | 12.2M | ); |
90 | | # else |
91 | | asm volatile( |
92 | | "movdqa 0(%[v]), %%xmm0\n" |
93 | | "pxor %%xmm1, %%xmm1\n" |
94 | | "pcmpeqb %%xmm1, %%xmm0\n" |
95 | | "pmovmskb %%xmm0, %[match]\n" /* mask will be in reversed match order */ |
96 | | : [match]"=r"(match) |
97 | | : [v]"r"(p) |
98 | | : "xmm0", "xmm1" |
99 | | ); |
100 | | # endif |
101 | 12.2M | if( match & 0x000F ) |
102 | 11.8M | TRY_MATCH(p, 0); |
103 | 11.8M | if( match & 0x00F0 ) |
104 | 11.4M | TRY_MATCH(p, 4); |
105 | 11.4M | if( match & 0x0F00 ) |
106 | 11.2M | TRY_MATCH(p, 8); |
107 | 11.2M | if( match & 0xF000 ) |
108 | 11.1M | TRY_MATCH(p, 12); |
109 | 11.1M | } |
110 | 1.24M | } |
111 | | |
112 | 1.32M | for (; p <= end; p++) { |
113 | 1.21M | if (p[0] == 0 && p[1] == 0 && p[2] == 1) |
114 | 13.8k | return p; |
115 | 1.21M | } |
116 | | |
117 | 104k | return NULL; |
118 | 118k | } |
[startcode_FindAnnexB_SSE2 instantiations: executed in h264.c, hevc.c, vc1.c; unexecuted in hxxx_sei.c, h264_nal.c, h264_slice.c, hevc_nal.c, mpeg4video.c, mpegvideo.c]
119 | | |
120 | | #endif |
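
[Annotation: pcmpeqb against zero sets a result byte to 0xFF wherever the input byte is 0x00, and pmovmskb packs the 16 sign bits into `match`, so each nibble of `match` reports the zero bytes of one aligned 4-byte group: `match & 0x000F` covers p[0..3], 0x00F0 covers p[4..7], and so on, which is exactly how the four TRY_MATCH calls are gated. The scalar loops before and after the vector loop handle the unaligned head and tail. A sketch of the same mask computed with SSE2 intrinsics, assuming it mirrors the inline asm above; it is not part of this header:]

    #include <emmintrin.h>
    #include <stdint.h>

    /* Returns a 16-bit mask in which bit i is set iff p[i] == 0x00.
     * p must be 16-byte aligned, matching the movdqa in the asm. */
    static inline uint32_t zero_byte_mask(const uint8_t *p)
    {
        __m128i v   = _mm_load_si128((const __m128i *)p);      /* movdqa   */
        __m128i eqz = _mm_cmpeq_epi8(v, _mm_setzero_si128());  /* pcmpeqb  */
        return (uint32_t)_mm_movemask_epi8(eqz);               /* pmovmskb */
    }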
121 | | |
122 | | /* This code is adapted from libav's ff_avc_find_startcode_internal |
123 | | * and I believe the trick originated from |
124 | | * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord |
125 | | */ |
126 | | static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const uint8_t *end ) |
127 | 0 | { |
128 | 0 | const uint8_t *a = p + 4 - ((intptr_t)p & 3); |
129 | 0 | |
130 | 0 | for (end -= 3; p < a && p <= end; p++) { |
131 | 0 | if (p[0] == 0 && p[1] == 0 && p[2] == 1) |
132 | 0 | return p; |
133 | 0 | } |
134 | 0 | |
135 | 0 | for (end -= 3; p < end; p += 4) { |
136 | 0 | uint32_t x; |
137 | 0 | memcpy(&x, p, sizeof(x)); |
138 | 0 | if ((x - 0x01010101) & (~x) & 0x80808080) |
139 | 0 | { |
140 | 0 | /* matching DW isn't faster */ |
141 | 0 | TRY_MATCH(p, 0); |
142 | 0 | } |
143 | 0 | } |
144 | 0 | |
145 | 0 | for (end += 3; p <= end; p++) { |
146 | 0 | if (p[0] == 0 && p[1] == 0 && p[2] == 1) |
147 | 0 | return p; |
148 | 0 | } |
149 | 0 | |
150 | 0 | return NULL; |
151 | 0 | } |
[startcode_FindAnnexB_Bits instantiations: unexecuted in all nine users (h264.c, hxxx_sei.c, h264_nal.c, h264_slice.c, hevc.c, hevc_nal.c, mpeg4video.c, mpegvideo.c, vc1.c)]
152 | | #undef TRY_MATCH |
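
[Annotation: the `(x - 0x01010101) & (~x) & 0x80808080` test above is the classic zero-byte-in-word bithack: subtracting 1 from every byte borrows through bit 7 exactly when that byte is 0x00, and `& ~x` discards bytes whose top bit was set to begin with. A worked example wrapped in a hypothetical helper, not part of this header:]

    #include <stdbool.h>
    #include <stdint.h>

    static inline bool word_has_zero_byte(uint32_t x)
    {
        /* e.g. x = 0x12003456:
         *   x - 0x01010101        = 0x10FF3355
         *   ~x                    = 0xEDFFCBA9
         *   (x - ...) & ~x        = 0x00FF0301
         *   ... & 0x80808080      = 0x00800000  -> the 0x00 byte is flagged */
        return ((x - 0x01010101) & ~x & 0x80808080) != 0;
    }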
153 | | |
154 | | #ifdef CAN_COMPILE_SSE2 |
155 | | static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end ) |
156 | 4.10M | { |
157 | 4.10M | if (vlc_CPU_SSE2()) |
158 | 4.10M | return startcode_FindAnnexB_SSE2(p, end); |
159 | 0 | else |
160 | 0 | return startcode_FindAnnexB_Bits(p, end); |
161 | 4.10M | } |
[startcode_FindAnnexB instantiations: executed in h264.c, hevc.c, vc1.c; unexecuted in hxxx_sei.c, h264_nal.c, h264_slice.c, hevc_nal.c, mpeg4video.c, mpegvideo.c]
162 | | #else |
163 | | #define startcode_FindAnnexB startcode_FindAnnexB_Bits |
164 | | #endif |
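
[Annotation: vlc_CPU_SSE2() picks the implementation at run time, so one binary serves CPUs both with and without SSE2. A hypothetical caller sketch; the loop shape is assumed for illustration, not taken from VLC:]

    #include <stddef.h>
    #include <stdint.h>

    static void walk_nals(const uint8_t *buf, size_t len)
    {
        const uint8_t *end = buf + len;   /* one past the last byte */
        const uint8_t *sc = startcode_FindAnnexB(buf, end);
        while (sc != NULL) {
            const uint8_t *next = startcode_FindAnnexB(sc + 3, end);
            /* One NAL unit spans [sc + 3, next ? next : end) */
            sc = next;
        }
    }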
165 | | |
166 | | #endif |