Coverage Report

Created: 2025-08-25 07:17

/src/vlc/modules/packetizer/startcode_helper.h
Line
Count
Source (jump to first uncovered line)
1
/*****************************************************************************
2
 * startcode_helper.h: Startcodes helpers
3
 *****************************************************************************
4
 * Copyright (C) 2016 VideoLAN Authors
5
 *
6
 * This program is free software; you can redistribute it and/or modify it
7
 * under the terms of the GNU Lesser General Public License as published by
8
 * the Free Software Foundation; either version 2.1 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public License
17
 * along with this program; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
19
 *****************************************************************************/
20
#ifndef VLC_STARTCODE_HELPER_H_
21
#define VLC_STARTCODE_HELPER_H_
22
23
#include <vlc_cpu.h>
24
25
#ifdef CAN_COMPILE_SSE2
26
#  if defined __has_attribute
27
#    if __has_attribute(__vector_size__)
28
#      define HAS_ATTRIBUTE_VECTORSIZE
29
#    endif
30
#  endif
31
32
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
33
    typedef unsigned char v16qu __attribute__((__vector_size__(16)));
34
#  endif
35
#endif
36
37
/* Efficiently searches for an AnnexB startcode 0x00 0x00 0x01
38
 * using a trick that is 4 times faster than a byte-by-byte lookup. */
39
40
14.2M
#define TRY_MATCH(p,a) {\
41
14.2M
     if (p[a+1] == 0) {\
42
11.3M
            if (p[a+0] == 0 && p[a+2] == 1)\
43
11.3M
                return a+p;\
44
11.3M
            if (p[a+2] == 0 && p[a+3] == 1)\
45
10.9M
                return a+p+1;\
46
10.9M
        }\
47
14.2M
        if (p[a+3] == 0) {\
48
11.0M
            if (p[a+2] == 0 && p[a+4] == 1)\
49
11.0M
                return a+p+2;\
50
11.0M
            if (p[a+4] == 0 && p[a+5] == 1)\
51
10.7M
                return a+p+3;\
52
10.7M
        }\
53
13.6M
    }
54
55
#ifdef CAN_COMPILE_SSE2
56
57
__attribute__ ((__target__ ("sse2")))
58
static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
59
4.10M
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
4.10M
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
17.9M
    for (end -= 3; p < alignedend && p <= end; p++) {
64
16.6M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
2.82M
            return p;
66
16.6M
    }
67
68
1.27M
    if( p == end )
69
1.15k
        return NULL;
70
71
1.27M
    alignedend = end - ((intptr_t) end & 15);
72
1.27M
    if( alignedend > p )
73
1.24M
    {
74
1.24M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
1.24M
        const v16qu zeros = { 0 };
76
1.24M
#  endif
77
78
12.3M
        for( ; p < alignedend; p += 16)
79
12.2M
        {
80
12.2M
            uint32_t match;
81
12.2M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
12.2M
            asm volatile(
83
12.2M
                "movdqa   0(%[v]),   %%xmm0\n"
84
12.2M
                "pcmpeqb %[czero],   %%xmm0\n"
85
12.2M
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
12.2M
                : [match]"=r"(match)
87
12.2M
                : [v]"r"(p), [czero]"x"(zeros)
88
12.2M
                : "xmm0"
89
12.2M
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
12.2M
            if( match & 0x000F )
102
11.8M
                TRY_MATCH(p, 0);
103
11.8M
            if( match & 0x00F0 )
104
11.4M
                TRY_MATCH(p, 4);
105
11.4M
            if( match & 0x0F00 )
106
11.2M
                TRY_MATCH(p, 8);
107
11.2M
            if( match & 0xF000 )
108
11.1M
                TRY_MATCH(p, 12);
109
11.1M
        }
110
1.24M
    }
111
112
1.32M
    for (; p <= end; p++) {
113
1.21M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
13.8k
            return p;
115
1.21M
    }
116
117
104k
    return NULL;
118
118k
}
h264.c:startcode_FindAnnexB_SSE2
Line
Count
Source
59
1.98M
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
1.98M
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
8.42M
    for (end -= 3; p < alignedend && p <= end; p++) {
64
7.82M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
1.39M
            return p;
66
7.82M
    }
67
68
593k
    if( p == end )
69
796
        return NULL;
70
71
592k
    alignedend = end - ((intptr_t) end & 15);
72
592k
    if( alignedend > p )
73
573k
    {
74
573k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
573k
        const v16qu zeros = { 0 };
76
573k
#  endif
77
78
5.14M
        for( ; p < alignedend; p += 16)
79
5.10M
        {
80
5.10M
            uint32_t match;
81
5.10M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
5.10M
            asm volatile(
83
5.10M
                "movdqa   0(%[v]),   %%xmm0\n"
84
5.10M
                "pcmpeqb %[czero],   %%xmm0\n"
85
5.10M
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
5.10M
                : [match]"=r"(match)
87
5.10M
                : [v]"r"(p), [czero]"x"(zeros)
88
5.10M
                : "xmm0"
89
5.10M
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
5.10M
            if( match & 0x000F )
102
4.88M
                TRY_MATCH(p, 0);
103
4.88M
            if( match & 0x00F0 )
104
4.73M
                TRY_MATCH(p, 4);
105
4.73M
            if( match & 0x0F00 )
106
4.63M
                TRY_MATCH(p, 8);
107
4.63M
            if( match & 0xF000 )
108
4.56M
                TRY_MATCH(p, 12);
109
4.56M
        }
110
573k
    }
111
112
556k
    for (; p <= end; p++) {
113
511k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
7.95k
            return p;
115
511k
    }
116
117
45.0k
    return NULL;
118
53.0k
}
Unexecuted instantiation: hxxx_sei.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: h264_nal.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: h264_slice.c:startcode_FindAnnexB_SSE2
hevc.c:startcode_FindAnnexB_SSE2
Line
Count
Source
59
2.09M
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
2.09M
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
9.39M
    for (end -= 3; p < alignedend && p <= end; p++) {
64
8.72M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
1.41M
            return p;
66
8.72M
    }
67
68
674k
    if( p == end )
69
357
        return NULL;
70
71
674k
    alignedend = end - ((intptr_t) end & 15);
72
674k
    if( alignedend > p )
73
658k
    {
74
658k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
658k
        const v16qu zeros = { 0 };
76
658k
#  endif
77
78
7.18M
        for( ; p < alignedend; p += 16)
79
7.14M
        {
80
7.14M
            uint32_t match;
81
7.14M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
7.14M
            asm volatile(
83
7.14M
                "movdqa   0(%[v]),   %%xmm0\n"
84
7.14M
                "pcmpeqb %[czero],   %%xmm0\n"
85
7.14M
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
7.14M
                : [match]"=r"(match)
87
7.14M
                : [v]"r"(p), [czero]"x"(zeros)
88
7.14M
                : "xmm0"
89
7.14M
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
7.14M
            if( match & 0x000F )
102
6.90M
                TRY_MATCH(p, 0);
103
6.90M
            if( match & 0x00F0 )
104
6.72M
                TRY_MATCH(p, 4);
105
6.72M
            if( match & 0x0F00 )
106
6.61M
                TRY_MATCH(p, 8);
107
6.61M
            if( match & 0xF000 )
108
6.53M
                TRY_MATCH(p, 12);
109
6.53M
        }
110
658k
    }
111
112
761k
    for (; p <= end; p++) {
113
702k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
5.80k
            return p;
115
702k
    }
116
117
58.5k
    return NULL;
118
64.3k
}
Unexecuted instantiation: hevc_nal.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: mpeg4video.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: mpegvideo.c:startcode_FindAnnexB_SSE2
vc1.c:startcode_FindAnnexB_SSE2
Line
Count
Source
59
21.2k
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
21.2k
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
106k
    for (end -= 3; p < alignedend && p <= end; p++) {
64
97.6k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
12.1k
            return p;
66
97.6k
    }
67
68
9.10k
    if( p == end )
69
1
        return NULL;
70
71
9.10k
    alignedend = end - ((intptr_t) end & 15);
72
9.10k
    if( alignedend > p )
73
8.84k
    {
74
8.84k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
8.84k
        const v16qu zeros = { 0 };
76
8.84k
#  endif
77
78
23.7k
        for( ; p < alignedend; p += 16)
79
23.2k
        {
80
23.2k
            uint32_t match;
81
23.2k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
23.2k
            asm volatile(
83
23.2k
                "movdqa   0(%[v]),   %%xmm0\n"
84
23.2k
                "pcmpeqb %[czero],   %%xmm0\n"
85
23.2k
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
23.2k
                : [match]"=r"(match)
87
23.2k
                : [v]"r"(p), [czero]"x"(zeros)
88
23.2k
                : "xmm0"
89
23.2k
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
23.2k
            if( match & 0x000F )
102
20.8k
                TRY_MATCH(p, 0);
103
20.8k
            if( match & 0x00F0 )
104
18.6k
                TRY_MATCH(p, 4);
105
18.6k
            if( match & 0x0F00 )
106
16.6k
                TRY_MATCH(p, 8);
107
16.6k
            if( match & 0xF000 )
108
14.8k
                TRY_MATCH(p, 12);
109
14.8k
        }
110
8.84k
    }
111
112
4.27k
    for (; p <= end; p++) {
113
3.69k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
134
            return p;
115
3.69k
    }
116
117
574
    return NULL;
118
708
}
119
120
#endif
121
122
/* This code is adapted from libav's ff_avc_find_startcode_internal
123
 * and I believe the trick originated from
124
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
125
 */
126
static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const uint8_t *end )
127
0
{
128
0
    const uint8_t *a = p + 4 - ((intptr_t)p & 3);
129
0
130
0
    for (end -= 3; p < a && p <= end; p++) {
131
0
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
132
0
            return p;
133
0
    }
134
0
135
0
    for (end -= 3; p < end; p += 4) {
136
0
        uint32_t x;
137
0
        memcpy(&x, p, sizeof(x));
138
0
        if ((x - 0x01010101) & (~x) & 0x80808080)
139
0
        {
140
0
            /* matching DW isn't faster */
141
0
            TRY_MATCH(p, 0);
142
0
        }
143
0
    }
144
0
145
0
    for (end += 3; p <= end; p++) {
146
0
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
147
0
            return p;
148
0
    }
149
0
150
0
    return NULL;
151
0
}
Unexecuted instantiation: h264.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: hxxx_sei.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: h264_nal.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: h264_slice.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: hevc.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: hevc_nal.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: mpeg4video.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: mpegvideo.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: vc1.c:startcode_FindAnnexB_Bits
152
#undef TRY_MATCH
153
154
#ifdef CAN_COMPILE_SSE2
155
static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
156
4.10M
{
157
4.10M
    if (vlc_CPU_SSE2())
158
4.10M
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
4.10M
}
h264.c:startcode_FindAnnexB
Line
Count
Source
156
1.98M
{
157
1.98M
    if (vlc_CPU_SSE2())
158
1.98M
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
1.98M
}
Unexecuted instantiation: hxxx_sei.c:startcode_FindAnnexB
Unexecuted instantiation: h264_nal.c:startcode_FindAnnexB
Unexecuted instantiation: h264_slice.c:startcode_FindAnnexB
hevc.c:startcode_FindAnnexB
Line
Count
Source
156
2.09M
{
157
2.09M
    if (vlc_CPU_SSE2())
158
2.09M
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
2.09M
}
Unexecuted instantiation: hevc_nal.c:startcode_FindAnnexB
Unexecuted instantiation: mpeg4video.c:startcode_FindAnnexB
Unexecuted instantiation: mpegvideo.c:startcode_FindAnnexB
vc1.c:startcode_FindAnnexB
Line
Count
Source
156
21.2k
{
157
21.2k
    if (vlc_CPU_SSE2())
158
21.2k
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
21.2k
}
162
#else
163
    #define startcode_FindAnnexB startcode_FindAnnexB_Bits
164
#endif
165
166
#endif