/src/ffmpeg/libavcodec/x86/mpeg4videodsp.c

Source
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpeg4videodsp.h"
#include "libavcodec/videodsp.h"

#if HAVE_INLINE_ASM

/**
 * MMX inline-asm variant of the gmc routine; ff_mpeg4videodsp_init_x86()
 * installs it as the gmc callback when INLINE_MMX() reports support.
 *
 * Writes an 8-pixel-wide block of h rows to dst. Every output byte is a
 * weighted sum of the four source bytes src[0], src[1], src[stride] and
 * src[stride + 1], plus r, shifted right by 2 * shift and packed to a byte
 * with unsigned saturation. All arithmetic is done in 16-bit lanes.
 * Inputs the fast path cannot handle are forwarded unchanged to ff_gmc_c().
 *
 * @param dst     destination; consecutive rows are stride bytes apart
 * @param src     source samples; consecutive rows are stride bytes apart
 * @param stride  row pitch, applied to both dst and src
 * @param h       number of rows to produce
 * @param ox      fixed-point x start; ox >> (16 + shift) is the whole-sample
 *                column added to src
 * @param oy      fixed-point y start; oy >> (16 + shift) is the whole-sample
 *                row added to src
 * @param dxx     x-position increment per output column
 * @param dxy     x-position increment per output row
 * @param dyx     y-position increment per output column
 * @param dyy     y-position increment per output row
 * @param shift   1 << shift is the weight scale "s" used in the blend
 * @param r       constant added to the weighted sum before the final shift
 * @param width   horizontal bound for the edge test, also passed on to
 *                ff_emulated_edge_mc_8() / ff_gmc_c() (presumably the source
 *                plane width -- confirm against the caller)
 * @param height  vertical counterpart of width
 *
 * No emms is executed here. NOTE(review): callers presumably reset the
 * MMX/x87 state themselves -- confirm.
 */
static void gmc_mmx(uint8_t *dst, const uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    // The block width is fixed at 8; it is processed as two 4-pixel strips.
    const int w    = 8;
    // Whole-sample part of the start position.
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    // Drop the low 4 bits so that position and increments can be tracked in
    // 16-bit lanes (the fallback test below rejects increments whose low
    // 4 bits are non-zero).
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    // Four-lane copies of the values that the asm reads as memory operands.
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    // Final right shift; 64 bits wide because psrlw takes its count from a
    // 64-bit memory operand.
    const uint64_t shift2  = 2 * shift;
    // Limits of the on-stack edge buffer: (8 + 1) rows of up to 4096 bytes.
#define MAX_STRIDE 4096U
#define MAX_H 8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
    int x, y;

    // Position drift accumulated over the block. For dxw/dyh the nominal
    // one-sample-per-step advance (1 << (16 + shift)) is removed first, so
    // only the deviation from the regular pixel grid remains.
    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    // The comparisons are carried out in unsigned arithmetic, so a negative
    // ix/iy also compares as "too large"; the explicit width < w and
    // height < h tests cover the case where the right-hand side would wrap.
    int need_emu  =  (unsigned) ix >= width  - w || width < w ||
                     (unsigned) iy >= height - h || height< h
                     ;

    // Fall back to the C implementation when
    //  - the position at any block corner differs from (ox, oy) in a bit at
    //    or above 16 + shift,
    //  - any increment has one of its low 4 bits set, or
    //  - edge emulation is needed but the block would not fit in edge_buf.
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) ||
        // uses more than 16 bits of subpel mv (only at huge resolution)
        (dxx | dxy | dyx | dyy) & 15 ||
        (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    // Move to the whole-sample start position.
    src += ix + iy * stride;
    if (need_emu) {
        // Request a (w + 1) x (h + 1) area in edge_buf and read from there
        // instead. NOTE(review): presumably samples outside the
        // width x height area are replicated from the border -- see
        // ff_emulated_edge_mc_8().
        ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    // Set up two registers that the asm statements below use without
    // reloading: mm6 = low 16 bits of (1 << shift) in all four lanes ("s"),
    // mm7 = 0 (used to zero-extend bytes to words).
    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r" (1 << shift));

    // Two passes, each covering a 4-pixel-wide column strip.
    for (x = 0; x < w; x += 4) {
        // Per-lane positions for the four columns of this strip. One row
        // increment is subtracted up front because the loop body adds it
        // before the values are first used.
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            // Advance dx4/dy4 by one row (16-bit wrapping add), store them
            // back, and leave the top 4 bits of every lane in mm4 (dx) and
            // mm5 (dy) for the next asm statement.
            // NOTE(review): the ">> 12" always keeps 4 bits, independent of
            // shift, while the blend below scales by s = 1 << shift --
            // confirm which shift values callers use on this path.
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m" (*dx4), "+m" (*dy4)
                : "m" (*dxy4), "m" (*dyy4));

            // Blend four neighbouring source samples for four output pixels.
            // Consumes mm4/mm5 from the statement above and mm6/mm7 from the
            // setup statement; each movd load/store moves 4 bytes.
            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                // Add r and sum the four weighted terms.
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                // Scale down by 2 * shift, pack to bytes, store 4 pixels.
                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m" (dst[x + y * stride])
                : "m" (src[0]), "m" (src[1]),
                  "m" (src[stride]), "m" (src[stride + 1]),
                  "m" (*r4), "m" (shift2));
            // Next source row.
            src += stride;
        }
        // Rewind to the first row and step 4 samples right for the next strip.
        src += 4 - h * stride;
    }
}

#endif /* HAVE_INLINE_ASM */

/**
 * Install the x86-specific gmc implementation into the DSP context.
 *
 * When the build has inline assembly enabled and INLINE_MMX() accepts the
 * flags returned by av_get_cpu_flags(), c->gmc is set to gmc_mmx.
 * Otherwise the context is left untouched.
 *
 * @param c  DSP context whose gmc pointer may be replaced
 */
av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c)
{
#if HAVE_INLINE_ASM
    const int flags = av_get_cpu_flags();

    /* Nothing to install unless the inline MMX path is usable. */
    if (!INLINE_MMX(flags))
        return;

    c->gmc = gmc_mmx;
#endif /* HAVE_INLINE_ASM */
}

Line	Count	Source
1		/*
2		* This file is part of FFmpeg.
3		*
4		* FFmpeg is free software; you can redistribute it and/or
5		* modify it under the terms of the GNU Lesser General Public
6		* License as published by the Free Software Foundation; either
7		* version 2.1 of the License, or (at your option) any later version.
8		*
9		* FFmpeg is distributed in the hope that it will be useful,
10		* but WITHOUT ANY WARRANTY; without even the implied warranty of
11		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12		* Lesser General Public License for more details.
13		*
14		* You should have received a copy of the GNU Lesser General Public
15		* License along with FFmpeg; if not, write to the Free Software
16		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17		*/
18
19		#include "config.h"
20		#include "libavutil/attributes.h"
21		#include "libavutil/cpu.h"
22		#include "libavutil/x86/cpu.h"
23		#include "libavcodec/mpeg4videodsp.h"
24		#include "libavcodec/videodsp.h"
25
26		#if HAVE_INLINE_ASM
27
28		static void gmc_mmx(uint8_t *dst, const uint8_t *src,
29		int stride, int h, int ox, int oy,
30		int dxx, int dxy, int dyx, int dyy,
31		int shift, int r, int width, int height)
32	63.7k	{
33	63.7k	const int w = 8;
34	63.7k	const int ix = ox >> (16 + shift);
35	63.7k	const int iy = oy >> (16 + shift);
36	63.7k	const int oxs = ox >> 4;
37	63.7k	const int oys = oy >> 4;
38	63.7k	const int dxxs = dxx >> 4;
39	63.7k	const int dxys = dxy >> 4;
40	63.7k	const int dyxs = dyx >> 4;
41	63.7k	const int dyys = dyy >> 4;
42	63.7k	const uint16_t r4[4] = { r, r, r, r };
43	63.7k	const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
44	63.7k	const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
45	63.7k	const uint64_t shift2 = 2 * shift;
46	63.7k	#define MAX_STRIDE 4096U
47	63.7k	#define MAX_H 8U
48	63.7k	uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
49	63.7k	int x, y;
50
51	63.7k	const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
52	63.7k	const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
53	63.7k	const int dxh = dxy * (h - 1);
54	63.7k	const int dyw = dyx * (w - 1);
55	63.7k	int need_emu = (unsigned) ix >= width - w \|\| width < w \|\|
56	63.7k	(unsigned) iy >= height - h \|\| height< h
57	63.7k	;
58
59	63.7k	if ( // non-constant fullpel offset (3% of blocks)
60	63.7k	((ox ^ (ox + dxw)) \| (ox ^ (ox + dxh)) \| (ox ^ (ox + dxw + dxh)) \|
61	63.7k	(oy ^ (oy + dyw)) \| (oy ^ (oy + dyh)) \| (oy ^ (oy + dyw + dyh))) >> (16 + shift) \|\|
62		// uses more than 16 bits of subpel mv (only at huge resolution)
63	63.7k	(dxx \| dxy \| dyx \| dyy) & 15 \|\|
64	63.7k	(need_emu && (h > MAX_H \|\| stride > MAX_STRIDE))) {
65		// FIXME could still use mmx for some of the rows
66	46.9k	ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
67	46.9k	shift, r, width, height);
68	46.9k	return;
69	46.9k	}
70
71	16.7k	src += ix + iy * stride;
72	16.7k	if (need_emu) {
73	6.78k	ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
74	6.78k	src = edge_buf;
75	6.78k	}
76
77	16.7k	__asm__ volatile (
78	16.7k	"movd %0, %%mm6 \n\t"
79	16.7k	"pxor %%mm7, %%mm7 \n\t"
80	16.7k	"punpcklwd %%mm6, %%mm6 \n\t"
81	16.7k	"punpcklwd %%mm6, %%mm6 \n\t"
82	16.7k	:: "r" (1 << shift));
83
84	50.3k	for (x = 0; x < w; x += 4) {
85	33.5k	uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
86	33.5k	oxs - dxys + dxxs * (x + 1),
87	33.5k	oxs - dxys + dxxs * (x + 2),
88	33.5k	oxs - dxys + dxxs * (x + 3) };
89	33.5k	uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
90	33.5k	oys - dyys + dyxs * (x + 1),
91	33.5k	oys - dyys + dyxs * (x + 2),
92	33.5k	oys - dyys + dyxs * (x + 3) };
93
94	378k	for (y = 0; y < h; y++) {
95	344k	__asm__ volatile (
96	344k	"movq %0, %%mm4 \n\t"
97	344k	"movq %1, %%mm5 \n\t"
98	344k	"paddw %2, %%mm4 \n\t"
99	344k	"paddw %3, %%mm5 \n\t"
100	344k	"movq %%mm4, %0 \n\t"
101	344k	"movq %%mm5, %1 \n\t"
102	344k	"psrlw $12, %%mm4 \n\t"
103	344k	"psrlw $12, %%mm5 \n\t"
104	344k	: "+m" (*dx4), "+m" (*dy4)
105	344k	: "m" (*dxy4), "m" (*dyy4));
106
107	344k	__asm__ volatile (
108	344k	"movq %%mm6, %%mm2 \n\t"
109	344k	"movq %%mm6, %%mm1 \n\t"
110	344k	"psubw %%mm4, %%mm2 \n\t"
111	344k	"psubw %%mm5, %%mm1 \n\t"
112	344k	"movq %%mm2, %%mm0 \n\t"
113	344k	"movq %%mm4, %%mm3 \n\t"
114	344k	"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
115	344k	"pmullw %%mm5, %%mm3 \n\t" // dx * dy
116	344k	"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
117	344k	"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
118
119	344k	"movd %4, %%mm5 \n\t"
120	344k	"movd %3, %%mm4 \n\t"
121	344k	"punpcklbw %%mm7, %%mm5 \n\t"
122	344k	"punpcklbw %%mm7, %%mm4 \n\t"
123	344k	"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
124	344k	"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
125
126	344k	"movd %2, %%mm5 \n\t"
127	344k	"movd %1, %%mm4 \n\t"
128	344k	"punpcklbw %%mm7, %%mm5 \n\t"
129	344k	"punpcklbw %%mm7, %%mm4 \n\t"
130	344k	"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
131	344k	"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
132	344k	"paddw %5, %%mm1 \n\t"
133	344k	"paddw %%mm3, %%mm2 \n\t"
134	344k	"paddw %%mm1, %%mm0 \n\t"
135	344k	"paddw %%mm2, %%mm0 \n\t"
136
137	344k	"psrlw %6, %%mm0 \n\t"
138	344k	"packuswb %%mm0, %%mm0 \n\t"
139	344k	"movd %%mm0, %0 \n\t"
140
141	344k	: "=m" (dst[x + y * stride])
142	344k	: "m" (src[0]), "m" (src[1]),
143	344k	"m" (src[stride]), "m" (src[stride + 1]),
144	344k	"m" (*r4), "m" (shift2));
145	344k	src += stride;
146	344k	}
147	33.5k	src += 4 - h * stride;
148	33.5k	}
149	16.7k	}
150
151		#endif /* HAVE_INLINE_ASM */
152
153		av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c)
154	10.5k	{
155	10.5k	#if HAVE_INLINE_ASM
156	10.5k	int cpu_flags = av_get_cpu_flags();
157
158	10.5k	if (INLINE_MMX(cpu_flags))
159	6.08k	c->gmc = gmc_mmx;
160	10.5k	#endif /* HAVE_INLINE_ASM */
161	10.5k	}

Coverage Report

Created: 2025-08-28 07:12