/src/vlc/modules/packetizer/startcode_helper.h
Line | Count | Source |
1 | | /***************************************************************************** |
2 | | * startcode_helper.h: Startcodes helpers |
3 | | ***************************************************************************** |
4 | | * Copyright (C) 2016 VideoLAN Authors |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or modify it |
7 | | * under the terms of the GNU Lesser General Public License as published by |
8 | | * the Free Software Foundation; either version 2.1 of the License, or |
9 | | * (at your option) any later version. |
10 | | * |
11 | | * This program is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | | * GNU Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public License |
17 | | * along with this program; if not, write to the Free Software Foundation, |
18 | | * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. |
19 | | *****************************************************************************/ |
20 | | #ifndef VLC_STARTCODE_HELPER_H_ |
21 | | #define VLC_STARTCODE_HELPER_H_ |
22 | | |
23 | | #include <vlc_cpu.h> |
24 | | |
25 | | #ifdef CAN_COMPILE_SSE2 |
26 | | # if defined __has_attribute |
27 | | # if __has_attribute(__vector_size__) |
28 | | # define HAS_ATTRIBUTE_VECTORSIZE |
29 | | # endif |
30 | | # endif |
31 | | |
32 | | # ifdef HAS_ATTRIBUTE_VECTORSIZE |
33 | | typedef unsigned char v16qu __attribute__((__vector_size__(16))); |
34 | | # endif |
35 | | #endif |
36 | | |
37 | | /* Looks up efficiently for an AnnexB startcode 0x00 0x00 0x01 |
38 | | * by using a 4 times faster trick than single byte lookup. */ |
39 | | |
/* Probes the 6 bytes (p)[a] .. (p)[a+5] for an AnnexB startcode
 * (0x00 0x00 0x01) beginning at offset a, a+1, a+2 or a+3, and returns
 * its address from the ENCLOSING function when one is found.
 *
 * Callers only invoke this after establishing that at least one of the
 * 4 bytes (p)[a]..(p)[a+3] is zero (SSE2 mask nibble / zero-byte bithack),
 * so testing (p)[a+1] and (p)[a+3] first covers all four candidate
 * positions with just two leading branches.
 *
 * NOTE: because it expands to `return` statements, this macro may only be
 * used inside a function returning a type compatible with `a+p`
 * (const uint8_t *).  Wrapped in do { } while(0) so it acts as a single
 * statement and stays safe in unbraced if/else bodies. */
#define TRY_MATCH(p,a) do {\
        if ((p)[(a)+1] == 0) {\
            if ((p)[(a)+0] == 0 && (p)[(a)+2] == 1)\
                return (a)+(p);\
            if ((p)[(a)+2] == 0 && (p)[(a)+3] == 1)\
                return (a)+(p)+1;\
        }\
        if ((p)[(a)+3] == 0) {\
            if ((p)[(a)+2] == 0 && (p)[(a)+4] == 1)\
                return (a)+(p)+2;\
            if ((p)[(a)+4] == 0 && (p)[(a)+5] == 1)\
                return (a)+(p)+3;\
        }\
    } while(0)
54 | | |
55 | | #ifdef CAN_COMPILE_SSE2 |
56 | | |
/* Scans [p, end) for an AnnexB startcode (0x00 0x00 0x01) testing 16
 * aligned bytes per iteration with SSE2, and returns a pointer to the
 * first startcode, or NULL when there is none.  `end` points one past
 * the last byte.  Must only be called when vlc_CPU_SSE2() is true
 * (movdqa/pcmpeqb/pmovmskb are SSE2 instructions). */
__attribute__ ((__target__ ("sse2")))
static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
{
    /* First align to 16 */
    /* Skipping this step and doing unaligned loads isn't faster */
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
    /* A startcode needs 3 readable bytes, so the last valid match
     * position is the original end minus 3; scan byte-per-byte until p
     * reaches the 16-byte boundary. */
    for (end -= 3; p < alignedend && p <= end; p++) {
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
            return p;
    }

    if( p == end )
        return NULL;

    /* Round the last match position down to a 16-byte boundary: the
     * vector loop below only performs aligned 16-byte loads. */
    alignedend = end - ((intptr_t) end & 15);
    if( alignedend > p )
    {
# ifdef HAS_ATTRIBUTE_VECTORSIZE
        const v16qu zeros = { 0 };
# endif

        for( ; p < alignedend; p += 16)
        {
            uint32_t match;
            /* Compare the 16 bytes at p against zero and gather the
             * per-byte results into a 16-bit mask: bit i of `match` is
             * set iff p[i] == 0. */
# ifdef HAS_ATTRIBUTE_VECTORSIZE
            asm volatile(
                "movdqa 0(%[v]), %%xmm0\n"
                "pcmpeqb %[czero], %%xmm0\n"
                "pmovmskb %%xmm0, %[match]\n" /* mask will be in reversed match order */
                : [match]"=r"(match)
                : [v]"r"(p), [czero]"x"(zeros)
                : "xmm0"
            );
# else
            /* Same comparison, materializing the zero vector with pxor
             * when the vector_size attribute is unavailable. */
            asm volatile(
                "movdqa 0(%[v]), %%xmm0\n"
                "pxor %%xmm1, %%xmm1\n"
                "pcmpeqb %%xmm1, %%xmm0\n"
                "pmovmskb %%xmm0, %[match]\n" /* mask will be in reversed match order */
                : [match]"=r"(match)
                : [v]"r"(p)
                : "xmm0", "xmm1"
            );
# endif
            /* Each nibble of the mask covers 4 bytes; a startcode can
             * only begin near a zero byte, so only probe the 4-byte
             * groups that actually contain one. */
            if( match & 0x000F )
                TRY_MATCH(p, 0);
            if( match & 0x00F0 )
                TRY_MATCH(p, 4);
            if( match & 0x0F00 )
                TRY_MATCH(p, 8);
            if( match & 0xF000 )
                TRY_MATCH(p, 12);
        }
    }

    /* Tail: byte-per-byte over the remaining unaligned positions. */
    for (; p <= end; p++) {
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
            return p;
    }

    return NULL;
}
119 | | |
120 | | #endif |
121 | | |
122 | | /* That code is adapted from libav's ff_avc_find_startcode_internal |
123 | | * and i believe the trick originated from |
124 | | * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord |
125 | | */ |
/* Portable fallback: scans [p, end) for an AnnexB startcode
 * (0x00 0x00 0x01) four bytes at a time and returns a pointer to the
 * first one, or NULL when there is none.  `end` points one past the
 * last byte. */
static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const uint8_t *end )
{
    /* Scan byte-per-byte until p reaches a 4-byte boundary. */
    const uint8_t *a = p + 4 - ((intptr_t)p & 3);

    /* end -= 3: a startcode needs 3 readable bytes, so the last valid
     * match position is the original end minus 3. */
    for (end -= 3; p < a && p <= end; p++) {
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
            return p;
    }

    /* end -= 3 again (end is now original end - 6) so that TRY_MATCH,
     * which reads up to p[5], never reads past the buffer. */
    for (end -= 3; p < end; p += 4) {
        uint32_t x;
        memcpy(&x, p, sizeof(x));
        /* Zero-byte bithack: non-zero iff at least one of the 4 bytes
         * of x is 0x00 — only then can a startcode begin here. */
        if ((x - 0x01010101) & (~x) & 0x80808080)
        {
            /* matching DW isn't faster */
            TRY_MATCH(p, 0);
        }
    }

    /* Restore end to original end - 3 and finish byte-per-byte. */
    for (end += 3; p <= end; p++) {
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
            return p;
    }

    return NULL;
}
152 | | #undef TRY_MATCH |
153 | | |
154 | | #ifdef CAN_COMPILE_SSE2 |
/* Runtime dispatch: use the SSE2 scanner on capable CPUs, otherwise the
 * portable bit-twiddling fallback.  Both return a pointer to the first
 * AnnexB startcode (0x00 0x00 0x01) in [p, end), or NULL. */
static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
{
    return vlc_CPU_SSE2() ? startcode_FindAnnexB_SSE2(p, end)
                          : startcode_FindAnnexB_Bits(p, end);
}
162 | | #else |
163 | | #define startcode_FindAnnexB startcode_FindAnnexB_Bits |
164 | | #endif |
165 | | |
166 | | #endif |