Coverage Report

Created: 2025-10-10 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/vlc/modules/packetizer/startcode_helper.h
Line
Count
Source
1
/*****************************************************************************
2
 * startcode_helper.h: Startcodes helpers
3
 *****************************************************************************
4
 * Copyright (C) 2016 VideoLAN Authors
5
 *
6
 * This program is free software; you can redistribute it and/or modify it
7
 * under the terms of the GNU Lesser General Public License as published by
8
 * the Free Software Foundation; either version 2.1 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * This program is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
 * GNU Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public License
17
 * along with this program; if not, write to the Free Software Foundation,
18
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
19
 *****************************************************************************/
20
#ifndef VLC_STARTCODE_HELPER_H_
21
#define VLC_STARTCODE_HELPER_H_
22
23
#include <vlc_cpu.h>
24
25
#ifdef CAN_COMPILE_SSE2
26
#  if defined __has_attribute
27
#    if __has_attribute(__vector_size__)
28
#      define HAS_ATTRIBUTE_VECTORSIZE
29
#    endif
30
#  endif
31
32
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
33
    typedef unsigned char v16qu __attribute__((__vector_size__(16)));
34
#  endif
35
#endif
36
37
/* Efficiently searches for an AnnexB startcode (0x00 0x00 0x01)
38
 * using a trick that is 4 times faster than a single-byte lookup. */
39
40
16.2M
#define TRY_MATCH(p,a) {\
41
16.2M
     if (p[a+1] == 0) {\
42
12.9M
            if (p[a+0] == 0 && p[a+2] == 1)\
43
12.9M
                return a+p;\
44
12.9M
            if (p[a+2] == 0 && p[a+3] == 1)\
45
12.5M
                return a+p+1;\
46
12.5M
        }\
47
16.2M
        if (p[a+3] == 0) {\
48
12.6M
            if (p[a+2] == 0 && p[a+4] == 1)\
49
12.6M
                return a+p+2;\
50
12.6M
            if (p[a+4] == 0 && p[a+5] == 1)\
51
12.3M
                return a+p+3;\
52
12.3M
        }\
53
15.5M
    }
54
55
#ifdef CAN_COMPILE_SSE2
56
57
__attribute__ ((__target__ ("sse2")))
58
static inline const uint8_t * startcode_FindAnnexB_SSE2( const uint8_t *p, const uint8_t *end )
59
5.11M
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
5.11M
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
20.8M
    for (end -= 3; p < alignedend && p <= end; p++) {
64
19.4M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
3.74M
            return p;
66
19.4M
    }
67
68
1.36M
    if( p == end )
69
1.25k
        return NULL;
70
71
1.36M
    alignedend = end - ((intptr_t) end & 15);
72
1.36M
    if( alignedend > p )
73
1.32M
    {
74
1.32M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
1.32M
        const v16qu zeros = { 0 };
76
1.32M
#  endif
77
78
11.6M
        for( ; p < alignedend; p += 16)
79
11.5M
        {
80
11.5M
            uint32_t match;
81
11.5M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
11.5M
            asm volatile(
83
11.5M
                "movdqa   0(%[v]),   %%xmm0\n"
84
11.5M
                "pcmpeqb %[czero],   %%xmm0\n"
85
11.5M
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
11.5M
                : [match]"=r"(match)
87
11.5M
                : [v]"r"(p), [czero]"x"(zeros)
88
11.5M
                : "xmm0"
89
11.5M
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
11.5M
            if( match & 0x000F )
102
11.0M
                TRY_MATCH(p, 0);
103
11.0M
            if( match & 0x00F0 )
104
10.6M
                TRY_MATCH(p, 4);
105
10.6M
            if( match & 0x0F00 )
106
10.4M
                TRY_MATCH(p, 8);
107
10.4M
            if( match & 0xF000 )
108
10.3M
                TRY_MATCH(p, 12);
109
10.3M
        }
110
1.32M
    }
111
112
1.22M
    for (; p <= end; p++) {
113
1.12M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
14.9k
            return p;
115
1.12M
    }
116
117
99.4k
    return NULL;
118
114k
}
h264.c:startcode_FindAnnexB_SSE2
Line
Count
Source
59
2.12M
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
2.12M
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
8.53M
    for (end -= 3; p < alignedend && p <= end; p++) {
64
7.95M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
1.54M
            return p;
66
7.95M
    }
67
68
574k
    if( p == end )
69
854
        return NULL;
70
71
573k
    alignedend = end - ((intptr_t) end & 15);
72
573k
    if( alignedend > p )
73
554k
    {
74
554k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
554k
        const v16qu zeros = { 0 };
76
554k
#  endif
77
78
4.82M
        for( ; p < alignedend; p += 16)
79
4.79M
        {
80
4.79M
            uint32_t match;
81
4.79M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
4.79M
            asm volatile(
83
4.79M
                "movdqa   0(%[v]),   %%xmm0\n"
84
4.79M
                "pcmpeqb %[czero],   %%xmm0\n"
85
4.79M
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
4.79M
                : [match]"=r"(match)
87
4.79M
                : [v]"r"(p), [czero]"x"(zeros)
88
4.79M
                : "xmm0"
89
4.79M
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
4.79M
            if( match & 0x000F )
102
4.57M
                TRY_MATCH(p, 0);
103
4.57M
            if( match & 0x00F0 )
104
4.43M
                TRY_MATCH(p, 4);
105
4.43M
            if( match & 0x0F00 )
106
4.33M
                TRY_MATCH(p, 8);
107
4.33M
            if( match & 0xF000 )
108
4.27M
                TRY_MATCH(p, 12);
109
4.27M
        }
110
554k
    }
111
112
513k
    for (; p <= end; p++) {
113
470k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
7.97k
            return p;
115
470k
    }
116
117
43.0k
    return NULL;
118
50.9k
}
Unexecuted instantiation: hxxx_sei.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: h264_nal.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: h264_slice.c:startcode_FindAnnexB_SSE2
hevc.c:startcode_FindAnnexB_SSE2
Line
Count
Source
59
2.96M
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
2.96M
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
12.1M
    for (end -= 3; p < alignedend && p <= end; p++) {
64
11.3M
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
2.18M
            return p;
66
11.3M
    }
67
68
781k
    if( p == end )
69
401
        return NULL;
70
71
781k
    alignedend = end - ((intptr_t) end & 15);
72
781k
    if( alignedend > p )
73
763k
    {
74
763k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
763k
        const v16qu zeros = { 0 };
76
763k
#  endif
77
78
6.80M
        for( ; p < alignedend; p += 16)
79
6.75M
        {
80
6.75M
            uint32_t match;
81
6.75M
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
6.75M
            asm volatile(
83
6.75M
                "movdqa   0(%[v]),   %%xmm0\n"
84
6.75M
                "pcmpeqb %[czero],   %%xmm0\n"
85
6.75M
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
6.75M
                : [match]"=r"(match)
87
6.75M
                : [v]"r"(p), [czero]"x"(zeros)
88
6.75M
                : "xmm0"
89
6.75M
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
6.75M
            if( match & 0x000F )
102
6.40M
                TRY_MATCH(p, 0);
103
6.40M
            if( match & 0x00F0 )
104
6.19M
                TRY_MATCH(p, 4);
105
6.19M
            if( match & 0x0F00 )
106
6.10M
                TRY_MATCH(p, 8);
107
6.10M
            if( match & 0xF000 )
108
6.03M
                TRY_MATCH(p, 12);
109
6.03M
        }
110
763k
    }
111
112
702k
    for (; p <= end; p++) {
113
647k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
6.82k
            return p;
115
647k
    }
116
117
55.6k
    return NULL;
118
62.5k
}
Unexecuted instantiation: hevc_nal.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: mpeg4video.c:startcode_FindAnnexB_SSE2
Unexecuted instantiation: mpegvideo.c:startcode_FindAnnexB_SSE2
vc1.c:startcode_FindAnnexB_SSE2
Line
Count
Source
59
28.5k
{
60
    /* First align to 16 */
61
    /* Skipping this step and doing unaligned loads isn't faster */
62
28.5k
    const uint8_t *alignedend = p + 16 - ((intptr_t)p & 15);
63
135k
    for (end -= 3; p < alignedend && p <= end; p++) {
64
124k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
65
17.3k
            return p;
66
124k
    }
67
68
11.2k
    if( p == end )
69
2
        return NULL;
70
71
11.2k
    alignedend = end - ((intptr_t) end & 15);
72
11.2k
    if( alignedend > p )
73
10.9k
    {
74
10.9k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
75
10.9k
        const v16qu zeros = { 0 };
76
10.9k
#  endif
77
78
28.3k
        for( ; p < alignedend; p += 16)
79
27.7k
        {
80
27.7k
            uint32_t match;
81
27.7k
#  ifdef HAS_ATTRIBUTE_VECTORSIZE
82
27.7k
            asm volatile(
83
27.7k
                "movdqa   0(%[v]),   %%xmm0\n"
84
27.7k
                "pcmpeqb %[czero],   %%xmm0\n"
85
27.7k
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
86
27.7k
                : [match]"=r"(match)
87
27.7k
                : [v]"r"(p), [czero]"x"(zeros)
88
27.7k
                : "xmm0"
89
27.7k
            );
90
#  else
91
            asm volatile(
92
                "movdqa   0(%[v]),   %%xmm0\n"
93
                "pxor      %%xmm1,   %%xmm1\n"
94
                "pcmpeqb   %%xmm1,   %%xmm0\n"
95
                "pmovmskb  %%xmm0,   %[match]\n" /* mask will be in reversed match order */
96
                : [match]"=r"(match)
97
                : [v]"r"(p)
98
                : "xmm0", "xmm1"
99
            );
100
#  endif
101
27.7k
            if( match & 0x000F )
102
24.5k
                TRY_MATCH(p, 0);
103
24.5k
            if( match & 0x00F0 )
104
21.7k
                TRY_MATCH(p, 4);
105
21.7k
            if( match & 0x0F00 )
106
19.3k
                TRY_MATCH(p, 8);
107
19.3k
            if( match & 0xF000 )
108
17.3k
                TRY_MATCH(p, 12);
109
17.3k
        }
110
10.9k
    }
111
112
4.80k
    for (; p <= end; p++) {
113
4.09k
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
114
142
            return p;
115
4.09k
    }
116
117
715
    return NULL;
118
857
}
119
120
#endif
121
122
/* This code is adapted from libav's ff_avc_find_startcode_internal
123
 * and I believe the trick originated from
124
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
125
 */
126
static inline const uint8_t * startcode_FindAnnexB_Bits( const uint8_t *p, const uint8_t *end )
127
0
{
128
0
    const uint8_t *a = p + 4 - ((intptr_t)p & 3);
129
0
130
0
    for (end -= 3; p < a && p <= end; p++) {
131
0
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
132
0
            return p;
133
0
    }
134
0
135
0
    for (end -= 3; p < end; p += 4) {
136
0
        uint32_t x;
137
0
        memcpy(&x, p, sizeof(x));
138
0
        if ((x - 0x01010101) & (~x) & 0x80808080)
139
0
        {
140
0
            /* matching DW isn't faster */
141
0
            TRY_MATCH(p, 0);
142
0
        }
143
0
    }
144
0
145
0
    for (end += 3; p <= end; p++) {
146
0
        if (p[0] == 0 && p[1] == 0 && p[2] == 1)
147
0
            return p;
148
0
    }
149
0
150
0
    return NULL;
151
0
}
Unexecuted instantiation: h264.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: hxxx_sei.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: h264_nal.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: h264_slice.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: hevc.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: hevc_nal.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: mpeg4video.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: mpegvideo.c:startcode_FindAnnexB_Bits
Unexecuted instantiation: vc1.c:startcode_FindAnnexB_Bits
152
#undef TRY_MATCH
153
154
#ifdef CAN_COMPILE_SSE2
155
static inline const uint8_t * startcode_FindAnnexB( const uint8_t *p, const uint8_t *end )
156
5.11M
{
157
5.11M
    if (vlc_CPU_SSE2())
158
5.11M
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
5.11M
}
h264.c:startcode_FindAnnexB
Line
Count
Source
156
2.12M
{
157
2.12M
    if (vlc_CPU_SSE2())
158
2.12M
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
2.12M
}
Unexecuted instantiation: hxxx_sei.c:startcode_FindAnnexB
Unexecuted instantiation: h264_nal.c:startcode_FindAnnexB
Unexecuted instantiation: h264_slice.c:startcode_FindAnnexB
hevc.c:startcode_FindAnnexB
Line
Count
Source
156
2.96M
{
157
2.96M
    if (vlc_CPU_SSE2())
158
2.96M
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
2.96M
}
Unexecuted instantiation: hevc_nal.c:startcode_FindAnnexB
Unexecuted instantiation: mpeg4video.c:startcode_FindAnnexB
Unexecuted instantiation: mpegvideo.c:startcode_FindAnnexB
vc1.c:startcode_FindAnnexB
Line
Count
Source
156
28.5k
{
157
28.5k
    if (vlc_CPU_SSE2())
158
28.5k
        return startcode_FindAnnexB_SSE2(p, end);
159
0
    else
160
0
        return startcode_FindAnnexB_Bits(p, end);
161
28.5k
}
162
#else
163
    #define startcode_FindAnnexB startcode_FindAnnexB_Bits
164
#endif
165
166
#endif