/src/mozilla-central/gfx/2d/ssse3-scaler.c

Source (jump to first uncovered line)
/*
 * Copyright © 2013 Soren Sandmann Pedersen
 * Copyright © 2013 Red Hat, Inc.
 * Copyright © 2016 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Soren Sandmann (soren.sandmann@gmail.com)
 *         Jeff Muizelaar (jmuizelaar@mozilla.com)
 */

/* This has been adapted from the ssse3 code from pixman. It's currently
 * a mess as I want to try it out in practice before finalizing the details.
 */

#include <stdlib.h>
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <stdint.h>
#include <assert.h>
#include "ssse3-scaler.h"

/* 16.16 signed fixed-point: the upper 16 bits hold the integer part and the
 * lower 16 bits hold the fraction.
 */
typedef int32_t                 pixman_fixed_16_16_t;
typedef pixman_fixed_16_16_t    pixman_fixed_t;
/* The value 1.0 in 16.16 fixed point. */
#define pixman_fixed_1                  (pixman_int_to_fixed(1))
/* Integer part of a fixed-point value (shifts the fraction away). */
#define pixman_fixed_to_int(f)          ((int) ((f) >> 16))
/* Integer -> fixed.  The shift is done on an unsigned value because
 * left-shifting a negative signed integer is undefined behaviour in C;
 * the unsigned shift yields the same bit pattern for every input.
 */
#define pixman_int_to_fixed(i)          ((pixman_fixed_t) ((uint32_t) (i) << 16))
/* Double -> fixed, truncating toward zero. */
#define pixman_double_to_fixed(d)       ((pixman_fixed_t) ((d) * 65536.0))
/* Largest / smallest integers that pixman_int_to_fixed can represent. */
#define PIXMAN_FIXED_INT_MAX 32767
#define PIXMAN_FIXED_INT_MIN (-32768)
/* Forward typedef; struct pixman_vector is defined below. */
typedef struct pixman_vector pixman_vector_t;

typedef int pixman_bool_t;
/* 64-bit types used for intermediate results of the transform code. */
typedef int64_t                 pixman_fixed_32_32_t;
typedef pixman_fixed_32_32_t    pixman_fixed_48_16_t;
/* Three-component vector of 64-bit values with a 16-bit fraction. */
typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;

/* Three-component vector of 16.16 fixed-point values. */
struct pixman_vector
{
    pixman_fixed_t      vector[3];
};
typedef struct pixman_transform pixman_transform_t;

/* 3x3 matrix of 16.16 fixed-point coefficients, indexed [row][column]. */
struct pixman_transform
{
    pixman_fixed_t      matrix[3][3];
};

#ifdef _MSC_VER
#define force_inline __forceinline
#else
#define force_inline __inline__ __attribute__((always_inline))
#endif

#define BILINEAR_INTERPOLATION_BITS 6

/* Extract the interpolation weight from a 16.16 fixed-point coordinate:
 * the top BILINEAR_INTERPOLATION_BITS bits of the fractional part, giving
 * a value in [0, (1 << BILINEAR_INTERPOLATION_BITS) - 1].
 */
static force_inline int
pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
{
    const int frac_shift  = 16 - BILINEAR_INTERPOLATION_BITS;
    const int weight_mask = (1 << BILINEAR_INTERPOLATION_BITS) - 1;

    return (x >> frac_shift) & weight_mask;
}

/* Multiply the 3x3 fixed-point matrix `t` by the 48.16 vector `v` and store
 * the product in `result`.
 *
 * Each input component is split into its integer part (v >> 16) and its
 * 16-bit fraction (v & 0xFFFF).  The two partial products are accumulated
 * separately; the fractional accumulator is then rounded (+ 0x8000) and
 * shifted back down before being added to the integer accumulator.
 *
 * `result` may alias `v`: every row is fully accumulated before anything
 * is written to `result`.
 */
static void
pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
                                 const pixman_vector_48_16_t *v,
                                 pixman_vector_48_16_t       *result)
{
    int64_t whole_acc[3];
    int64_t frac_acc[3];
    int row, col;

    /* input vector values must have no more than 31 bits (including sign)
     * in the integer part */
    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));

    for (row = 0; row < 3; row++)
    {
        whole_acc[row] = 0;
        frac_acc[row] = 0;

        for (col = 0; col < 3; col++)
        {
            const int64_t coeff = (int64_t)t->matrix[row][col];

            whole_acc[row] += coeff * (v->v[col] >> 16);
            frac_acc[row]  += coeff * (v->v[col] & 0xFFFF);
        }
    }

    /* Only now write the output, so that result == v is safe. */
    for (row = 0; row < 3; row++)
        result->v[row] = whole_acc[row] + ((frac_acc[row] + 0x8000) >> 16);
}

/* Transform `vector` in place by `transform`.
 *
 * The computation is carried out in 48.16 precision and the result is
 * narrowed back to 16.16.  All three components are always written back;
 * the return value is non-zero only when every component survived the
 * narrowing unchanged (i.e. the result fits in pixman_fixed_t).
 */
static pixman_bool_t
pixman_transform_point_3d (const struct pixman_transform *transform,
                           struct pixman_vector *         vector)
{
    pixman_vector_48_16_t wide;
    pixman_bool_t fits = 1;
    int k;

    for (k = 0; k < 3; k++)
        wide.v[k] = vector->vector[k];

    pixman_transform_point_31_16_3d (transform, &wide, &wide);

    for (k = 0; k < 3; k++)
    {
        vector->vector[k] = wide.v[k];

        if (vector->vector[k] != wide.v[k])
            fits = 0;
    }

    return fits;
}


/* Minimal description of the source image read by the scaler. */
struct bits_image_t
{
    uint32_t *                 bits;       /* first pixel; row y starts at bits + y * rowstride */
    int                        rowstride;  /* distance between rows, in uint32_t units */
    pixman_transform_t *transform;         /* destination -> source mapping */
};

typedef struct bits_image_t bits_image_t;
/* Placeholder type; not referenced by the code in this file section. */
typedef struct {
    int unused;
} pixman_iter_info_t;

typedef struct pixman_iter_t pixman_iter_t;
/* Cleanup callback installed by the iterator's init function. */
typedef void      (* pixman_iter_fini_t)         (pixman_iter_t *iter);

/* State for producing scaled output one destination row at a time. */
struct pixman_iter_t
{
    int x, y;                           /* destination coordinates of the first output pixel */
    pixman_iter_fini_t          fini;   /* set by init; NULL signals that init failed */
    bits_image_t *image;                /* source image */
    uint32_t *                  buffer; /* where the next output row is written */
    int width;                          /* output pixels per row */
    int height;                         /* row count used to clamp the second source row
                                         * (ssse3_scale_data stores the source height here) */
    void *                      data;   /* bilinear_info_t allocated by init */
};

/* One cached, horizontally interpolated source row. */
typedef struct
{
    int   y;            /* source row currently held in buffer, or -1 for none */
    uint64_t *  buffer; /* 16-byte aligned; one uint64_t (four 16-bit channels) per pixel */
} line_t;

/* Per-iterator scratch state; allocated with extra room after `data`. */
typedef struct
{
    line_t    lines[2];     /* row cache, selected by the low bit of the source row */
    pixman_fixed_t  y;      /* source y for the next output row (16.16) */
    pixman_fixed_t  x;      /* source x of the first output pixel (16.16) */
    uint64_t    data[1];    /* start of the storage that the line buffers point into */
} bilinear_info_t;

/* Horizontally interpolate `n` pixels of source row `y` into line->buffer.
 *
 * image: source image; the row starts at image->bits + y * image->rowstride
 * line:  destination cache line; line->buffer must be 16-byte aligned
 *        (it is written with _mm_store_si128) and line->y is set to `y`
 *        once the row has been produced
 * x:     16.16 source coordinate of the first output pixel
 * ux:    16.16 source step per output pixel
 * n:     number of output pixels
 *
 * Two output pixels are produced per loop iteration, each stored as four
 * 16-bit channel values.  An odd trailing pixel is handled by jumping back
 * into the loop body at `final_pixel` with the second pixel's input zeroed;
 * that pass still stores a full 16 bytes.
 *
 * Every source fetch is a 64-bit load, i.e. it reads the pixel at the
 * integer part of the coordinate and the one after it.  Unless
 * HACKY_PADDING is defined no bounds checking is done here.
 * NOTE(review): the HACKY_PADDING variant calls printf, but <stdio.h> is
 * not among the includes visible in this file.
 */
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
      int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    /* 16-bit lanes holding the low bits of x and of -(x + 1) for the two
     * pixels of an iteration; the weights are derived from these below. */
    __m128i vx = _mm_set_epi16 (
  - (x + 1), x, - (x + 1), x,
  - (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
    /* Per-iteration increment for vx: two pixels' worth of ux. */
    __m128i vux = _mm_set_epi16 (
  - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
  - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    /* Adds 1 to the inverse-weight lanes so that each weight pair sums to
     * 1 << BILINEAR_INTERPOLATION_BITS. */
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    while ((n -= 2) >= 0)
    {
        __m128i vw, vr, s;
#ifdef HACKY_PADDING
        if (pixman_fixed_to_int(x + ux) >= image->rowstride) {
            vrl1 = _mm_setzero_si128();
            printf("overread 2loop\n");
         } else {
                 if (pixman_fixed_to_int(x + ux) < 0)
                         printf("underflow\n");
        vrl1 = _mm_loadl_epi64(
            (__m128i *)(bits + (pixman_fixed_to_int(x + ux) < 0 ? 0 : pixman_fixed_to_int(x + ux))));
        }
#else
        vrl1 = _mm_loadl_epi64(
            (__m128i *)(bits + pixman_fixed_to_int(x + ux)));
#endif
  /* vrl1: R1, L1 */

    /* Entry point for the odd trailing pixel (see the goto after the loop). */
    final_pixel:
#ifdef HACKY_PADDING
  vrl0 = _mm_loadl_epi64 (
      (__m128i *)(bits + (pixman_fixed_to_int (x) < 0 ? 0 : pixman_fixed_to_int (x))));
#else
        vrl0 = _mm_loadl_epi64 (
      (__m128i *)(bits + pixman_fixed_to_int (x)));
#endif
        /* vrl0: R0, L0 */

  /* The weights are based on vx which is a vector of 
   *
   *    - (x + 1), x, - (x + 1), x,
   *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
   *
   * so the 16 bit weights end up like this:
   *
   *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
   *
   * and after shifting and packing, we get these bytes:
   *
   *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
   *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
   *
   * which means the first and the second input pixel 
   * have to be interleaved like this:
   *
   *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
   *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
   *
   * before maddubsw can be used.
   */

  vw = _mm_add_epi16 (
      vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
  /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
   */

  vw = _mm_packus_epi16 (vw, vw);
  /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
   *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
   */
  vx = _mm_add_epi16 (vx, vux);

  x += 2 * ux;

  vr = _mm_unpacklo_epi16 (vrl1, vrl0);
  /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

  s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
  /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

  vr = _mm_unpackhi_epi8 (vr, s);
  /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
   *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
   */

  vr = _mm_maddubs_epi16 (vr, vw);

  /* When the weight is 0, the inverse weight is
   * 128 which can't be represented in a signed byte.
   * As a result maddubsw computes the following:
   *
   *     r = l * -128 + r * 0
   *
   * rather than the desired
   *
   *     r = l * 128 + r * 0
   *
   * We fix this by taking the absolute value of the
   * result.
   */
        // we can drop this if we use lower precision
  /* NOTE: the comment above describes a 7-bit weight.  No absolute value
   * is taken in this file: with BILINEAR_INTERPOLATION_BITS == 6 the
   * largest weight produced above is 64, which fits in a signed byte.
   */

  vr = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (2, 0, 3, 1));
  /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
  _mm_store_si128 (b++, vr);
    }

    /* n is -1 here exactly when one pixel is left over: run the loop body
     * once more from final_pixel with a zeroed second pixel.  The loop
     * condition then drops n to -3, so this branch is not taken again.
     */
    if (n == -1)
    {
  vrl1 = _mm_setzero_si128();
  goto final_pixel;
    }

    line->y = y;
}

// scale a line of destination pixels
/* Produce one row of iter->width output pixels in iter->buffer.
 *
 * The two source rows surrounding the current source y (info->y) are
 * horizontally interpolated by ssse3_fetch_horizontal (results are cached
 * per row in info->lines) and then blended vertically here.  On return
 * info->y has been advanced by the transform's y scale (matrix[1][1]).
 *
 * iter: iterator prepared by ssse3_bilinear_cover_iter_init
 * mask: not used by this function
 *
 * Returns iter->buffer.
 */
static uint32_t *
ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
{
    pixman_fixed_t fx, ux;
    bilinear_info_t *info = iter->data;
    line_t *line0, *line1;
    int y0, y1;
    int32_t dist_y;
    __m128i vw, uvw;
    int i;

    fx = info->x;
    ux = iter->image->transform->matrix[0][0];

    y0 = pixman_fixed_to_int (info->y);
    /* A negative source row is treated as fatal: the write through a null
     * pointer deliberately faults. */
    if (y0 < 0)
        *(volatile char*)0 = 9;
    y1 = y0 + 1;

    // clamping in y direction
    if (y1 >= iter->height) {
        y1 = iter->height - 1;
    }

    /* The cache slot is chosen by row parity.  When y1 has been clamped
     * down to y0, both pointers refer to the same slot. */
    line0 = &info->lines[y0 & 0x01];
    line1 = &info->lines[y1 & 0x01];

    /* Only refetch a row when the slot holds a different one. */
    if (line0->y != y0)
    {
  ssse3_fetch_horizontal (
      iter->image, line0, y0, fx, ux, iter->width);
    }

    if (line1->y != y1)
    {
  ssse3_fetch_horizontal (
      iter->image, line1, y1, fx, ux, iter->width);
    }

#ifdef PIXMAN_STYLE_INTERPOLATION
    /* In this variant only vw is set up; uvw is left unused. */
    dist_y = pixman_fixed_to_bilinear_weight (info->y);
    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);

    vw = _mm_set_epi16 (
  dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);

#else
    // setup the weights for the top (vw) and bottom (uvw) lines
    /* NOTE: as used below, vw multiplies the bottom line (bot*) and uvw
     * multiplies the top line (top*). */
    dist_y = pixman_fixed_to_bilinear_weight (info->y);
    // we use 15 instead of 16 because we need an extra bit to handle when the weights are 0 and 1
    dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);

    vw = _mm_set_epi16 (
  dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);


    dist_y = (1 << BILINEAR_INTERPOLATION_BITS) - pixman_fixed_to_bilinear_weight (info->y);
    dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
    uvw = _mm_set_epi16 (
  dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
#endif

    /* Main loop: four output pixels per iteration.  Each 128-bit load
     * covers two pixels (four 16-bit channels each). */
    for (i = 0; i + 3 < iter->width; i += 4)
    {
  __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
  __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
  __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
  __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
#ifdef PIXMAN_STYLE_INTERPOLATION
  __m128i r0, r1, tmp, p;

        r0 = _mm_mulhi_epu16 (
      _mm_sub_epi16 (bot0, top0), vw);
  tmp = _mm_cmplt_epi16 (bot0, top0);
  tmp = _mm_and_si128 (tmp, vw);
  r0 = _mm_sub_epi16 (r0, tmp);
  r0 = _mm_add_epi16 (r0, top0);
  r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
  /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
        //r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
  /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

        // tmp = bot1 < top1 ? vw : 0;
        // r1 = (bot1 - top1)*vw + top1 - tmp
        // r1 = bot1*vw - vw*top1 + top1 - tmp
        // r1 = bot1*vw + top1 - vw*top1 - tmp
        // r1 = bot1*vw + top1*(1 - vw) - tmp
  r1 = _mm_mulhi_epu16 (
      _mm_sub_epi16 (bot1, top1), vw);
  tmp = _mm_cmplt_epi16 (bot1, top1);
  tmp = _mm_and_si128 (tmp, vw);
  r1 = _mm_sub_epi16 (r1, tmp);
  r1 = _mm_add_epi16 (r1, top1);
  r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
  //r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
  /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
#else
  __m128i r0, r1, p;
        /* Weighted sum of the two lines: each product keeps its high 16
         * bits, and the final shift removes the remaining scale. */
        top0 = _mm_mulhi_epu16 (top0, uvw);
        bot0 = _mm_mulhi_epu16 (bot0, vw);
        r0 = _mm_add_epi16(top0, bot0);
        r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1);

        top1 = _mm_mulhi_epu16 (top1, uvw);
        bot1 = _mm_mulhi_epu16 (bot1, vw);
        r1 = _mm_add_epi16(top1, bot1);
        r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS-1);
#endif

  /* Narrow the 16-bit channels to bytes (with unsigned saturation) and
   * store four pixels; the destination does not need to be aligned. */
  p = _mm_packus_epi16 (r0, r1);
  _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
    }

    /* Tail: the remaining one to three pixels, two (or one) at a time.
     * The loads still read a full 16 bytes from each line buffer even
     * when a single pixel is left.
     * NOTE(review): this relies on the line buffers having slack beyond
     * `width` entries, and on i being even so the aligned loads are valid.
     */
    while (i < iter->width)
    {
  __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
  __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));

#ifdef PIXMAN_STYLE_INTERPOLATION
  __m128i r0, tmp, p;
  r0 = _mm_mulhi_epu16 (
      _mm_sub_epi16 (bot0, top0), vw);
  tmp = _mm_cmplt_epi16 (bot0, top0);
  tmp = _mm_and_si128 (tmp, vw);
  r0 = _mm_sub_epi16 (r0, tmp);
  r0 = _mm_add_epi16 (r0, top0);
  r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
  /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
  r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
  /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
#else
  __m128i r0, p;
        top0 = _mm_mulhi_epu16 (top0, uvw);
        bot0 = _mm_mulhi_epu16 (bot0, vw);
        r0 = _mm_add_epi16(top0, bot0);
        r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1);
#endif

  p = _mm_packus_epi16 (r0, r0);

  if (iter->width - i == 1)
  {
      /* Last single pixel: store only the low 32 bits. */
      *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
      i++;
  }
  else
  {
      /* Two pixels: store the low 64 bits. */
      _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
      i += 2;
  }
    }

    /* Step to the source position of the next output row. */
    info->y += iter->image->transform->matrix[1][1];

    return iter->buffer;
}

/* Release the scratch state that ssse3_bilinear_cover_iter_init attached
 * to the iterator.  The pointer stored in iter->data is left unchanged.
 */
static void
ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
{
    void *scratch = iter->data;

    free (scratch);
}

/* Prepare `iter` for row-by-row bilinear scaling.
 *
 * Reads iter->x, iter->y, iter->width and iter->image->transform.  The
 * centre of the first destination pixel is mapped through the transform
 * to find the starting source position, and a bilinear_info_t with two
 * 16-byte aligned line buffers is allocated and stored in iter->data.
 *
 * On success iter->fini is set to ssse3_bilinear_cover_iter_fini and
 * iter->data to the new state.  On failure (coordinates out of range,
 * transformed point not representable, negative or oversized width, or
 * out of memory) iter->fini is set to NULL and iter->data is not touched.
 */
static void
ssse3_bilinear_cover_iter_init (pixman_iter_t *iter)
{
    int width = iter->width;
    size_t line_pixels;
    bilinear_info_t *info;
    pixman_vector_t v;

    /* A negative width used to wrap the size computation below and could
     * make malloc succeed with fewer bytes than the header needs.  Reject
     * it, and make sure the size cannot overflow size_t either.
     */
    if (width < 0)
        goto fail;

    line_pixels = (size_t) width;
    if (line_pixels >
        (SIZE_MAX - sizeof (*info) - 64) / (2 * sizeof (uint64_t)))
        goto fail;

    if (iter->x > PIXMAN_FIXED_INT_MAX ||
        iter->x < PIXMAN_FIXED_INT_MIN ||
        iter->y > PIXMAN_FIXED_INT_MAX ||
        iter->y < PIXMAN_FIXED_INT_MIN)
        goto fail;

    /* Reference point is the center of the pixel */
    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
    v.vector[2] = pixman_fixed_1;

    if (!pixman_transform_point_3d (iter->image->transform, &v))
        goto fail;

    /* Two line buffers of one uint64_t per output pixel.  The extra 64
     * bytes absorb the two 16-byte alignment round-ups below and the full
     * 16-byte store that is made for an odd trailing pixel.
     */
    info = malloc (sizeof (*info) + 2 * line_pixels * sizeof (uint64_t) + 64);
    if (!info)
        goto fail;

    /* Convert from pixel-centre back to pixel-origin coordinates. */
    info->x = v.vector[0] - pixman_fixed_1 / 2;
    info->y = v.vector[1] - pixman_fixed_1 / 2;

#define ALIGN(addr)             \
    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))

    /* It is safe to set the y coordinates to -1 initially
     * because COVER_CLIP_BILINEAR ensures that we will only
     * be asked to fetch lines in the [0, height) interval
     */
    info->lines[0].y = -1;
    info->lines[0].buffer = ALIGN (&(info->data[0]));
    info->lines[1].y = -1;
    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);

    iter->fini = ssse3_bilinear_cover_iter_fini;

    iter->data = info;
    return;

fail:
    /* Something went wrong, either a bad matrix or OOM; in such cases,
     * we don't guarantee any particular rendering.
     */
    iter->fini = NULL;
}

/* scale the src from src_width/height to dest_width/height drawn
 * into the rectangle x,y width,height
 * src_stride and dst_stride are 4 byte units
 *
 * src:          first pixel of the source image
 * dest:         where the first output pixel is written; output row iy
 *               starts at dest + iy * dest_stride
 * x, y:         position of the first output pixel inside the full scaled
 *               (dest_width x dest_height) image
 * width/height: number of output pixels per row / number of output rows
 *
 * Returns false when any source or destination dimension is not positive
 * or when the iterator cannot be set up (coordinates out of range,
 * allocation failure); returns true otherwise.
 */
bool ssse3_scale_data(uint32_t *src, int src_width, int src_height, int src_stride,
                      uint32_t *dest, int dest_width, int dest_height,
                      int dest_stride,
                      int x, int y,
                      int width, int height)
{
    //XXX: assert(src_width > 1)

    /* The scale factors divide by the destination size, so a zero there
     * would produce inf/NaN and an undefined conversion to fixed point.
     * Non-positive source sizes produce negative scales or a negative row
     * clamp.  None of these can be rendered, so refuse them up front.
     */
    if (src_width <= 0 || src_height <= 0 ||
        dest_width <= 0 || dest_height <= 0)
        return false;

    pixman_transform_t transform = {
        { { pixman_fixed_1, 0, 0 },
            { 0, pixman_fixed_1, 0 },
            { 0, 0, pixman_fixed_1 } }
    };
    double width_scale = ((double)src_width)/dest_width;
    double height_scale = ((double)src_height)/dest_height;
#define AVOID_PADDING
#ifdef AVOID_PADDING
    // scale up by enough that we don't read outside of the bounds of the source surface
    // currently this is required to avoid reading out of bounds.
    if (width_scale < 1) {
        width_scale = (double)(src_width-1)/dest_width;
        transform.matrix[0][2] = pixman_fixed_1/2;
    }
    if (height_scale < 1) {
        height_scale = (double)(src_height-1)/dest_height;
        transform.matrix[1][2] = pixman_fixed_1/2;
    }
#endif
    transform.matrix[0][0] = pixman_double_to_fixed(width_scale);
    transform.matrix[1][1] = pixman_double_to_fixed(height_scale);
    transform.matrix[2][2] = pixman_fixed_1;

    bits_image_t image;
    image.bits = src;
    image.transform = &transform;
    image.rowstride = src_stride;

    pixman_iter_t iter;
    iter.image = &image;
    iter.x = x;
    iter.y = y;
    iter.width = width;
    /* The iterator's height is the source height: it is what the second
     * source row is clamped against. */
    iter.height = src_height;
    iter.buffer = dest;
    iter.data = NULL;

    ssse3_bilinear_cover_iter_init(&iter);

    /* init reports failure by leaving fini NULL. */
    if (!iter.fini)
      return false;

    if (iter.data) {
        for (int iy = 0; iy < height; iy++) {
            ssse3_fetch_bilinear_cover(&iter, NULL);
            iter.buffer += dest_stride;
        }
        ssse3_bilinear_cover_iter_fini(&iter);
    }
    return true;
}

Coverage Report

Created: 2018-09-25 14:53

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright © 2013 Soren Sandmann Pedersen
3		* Copyright © 2013 Red Hat, Inc.
4		* Copyright © 2016 Mozilla Foundation
5		*
6		* Permission is hereby granted, free of charge, to any person obtaining a
7		* copy of this software and associated documentation files (the "Software"),
8		* to deal in the Software without restriction, including without limitation
9		* the rights to use, copy, modify, merge, publish, distribute, sublicense,
10		* and/or sell copies of the Software, and to permit persons to whom the
11		* Software is furnished to do so, subject to the following conditions:
12		*
13		* The above copyright notice and this permission notice (including the next
14		* paragraph) shall be included in all copies or substantial portions of the
15		* Software.
16		*
17		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18		* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19		* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20		* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21		* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22		* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23		* DEALINGS IN THE SOFTWARE.
24		*
25		* Author: Soren Sandmann (soren.sandmann@gmail.com)
26		* Jeff Muizelaar (jmuizelaar@mozilla.com)
27		*/
28
29		/* This has been adapted from the ssse3 code from pixman. It's currently
30		* a mess as I want to try it out in practice before finalizing the details.
31		*/
32
33		#include <stdlib.h>
34		#include <mmintrin.h>
35		#include <xmmintrin.h>
36		#include <emmintrin.h>
37		#include <tmmintrin.h>
38		#include <stdint.h>
39		#include <assert.h>
40		#include "ssse3-scaler.h"
41
42		typedef int32_t pixman_fixed_16_16_t;
43		typedef pixman_fixed_16_16_t pixman_fixed_t;
44	0	#define pixman_fixed_1 (pixman_int_to_fixed(1))
45	0	#define pixman_fixed_to_int(f) ((int) ((f) >> 16))
46	0	#define pixman_int_to_fixed(i) ((pixman_fixed_t) ((i) << 16))
47	0	#define pixman_double_to_fixed(d) ((pixman_fixed_t) ((d) * 65536.0))
48	0	#define PIXMAN_FIXED_INT_MAX 32767
49	0	#define PIXMAN_FIXED_INT_MIN -32768
50		typedef struct pixman_vector pixman_vector_t;
51
52		typedef int pixman_bool_t;
53		typedef int64_t pixman_fixed_32_32_t;
54		typedef pixman_fixed_32_32_t pixman_fixed_48_16_t;
55		typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
56
57		struct pixman_vector
58		{
59		pixman_fixed_t vector[3];
60		};
61		typedef struct pixman_transform pixman_transform_t;
62
63		struct pixman_transform
64		{
65		pixman_fixed_t matrix[3][3];
66		};
67
68		#ifdef _MSC_VER
69		#define force_inline __forceinline
70		#else
71		#define force_inline __inline__ __attribute__((always_inline))
72		#endif
73
74	0	#define BILINEAR_INTERPOLATION_BITS 6
75
76		static force_inline int
77		pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
78	0	{
79	0	return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
80	0	((1 << BILINEAR_INTERPOLATION_BITS) - 1);
81	0	}
82
83		static void
84		pixman_transform_point_31_16_3d (const pixman_transform_t *t,
85		const pixman_vector_48_16_t *v,
86		pixman_vector_48_16_t *result)
87	0	{
88	0	int i;
89	0	int64_t tmp[3][2];
90	0
91	0	/* input vector values must have no more than 31 bits (including sign)
92	0	* in the integer part */
93	0	assert (v->v[0] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
94	0	assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
95	0	assert (v->v[1] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
96	0	assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
97	0	assert (v->v[2] < ((pixman_fixed_48_16_t)1 << (30 + 16)));
98	0	assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
99	0
100	0	for (i = 0; i < 3; i++)
101	0	{
102	0	tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
103	0	tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
104	0	tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
105	0	tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
106	0	tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
107	0	tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
108	0	}
109	0
110	0	result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
111	0	result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
112	0	result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
113	0	}
114
115		static pixman_bool_t
116		pixman_transform_point_3d (const struct pixman_transform *transform,
117		struct pixman_vector * vector)
118	0	{
119	0	pixman_vector_48_16_t tmp;
120	0	tmp.v[0] = vector->vector[0];
121	0	tmp.v[1] = vector->vector[1];
122	0	tmp.v[2] = vector->vector[2];
123	0
124	0	pixman_transform_point_31_16_3d (transform, &tmp, &tmp);
125	0
126	0	vector->vector[0] = tmp.v[0];
127	0	vector->vector[1] = tmp.v[1];
128	0	vector->vector[2] = tmp.v[2];
129	0
130	0	return vector->vector[0] == tmp.v[0] &&
131	0	vector->vector[1] == tmp.v[1] &&
132	0	vector->vector[2] == tmp.v[2];
133	0	}
134
135
136		struct bits_image_t
137		{
138		uint32_t * bits;
139		int rowstride;
140		pixman_transform_t *transform;
141		};
142
143		typedef struct bits_image_t bits_image_t;
144		typedef struct {
145		int unused;
146		} pixman_iter_info_t;
147
148		typedef struct pixman_iter_t pixman_iter_t;
149		typedef void (* pixman_iter_fini_t) (pixman_iter_t *iter);
150
151		struct pixman_iter_t
152		{
153		int x, y;
154		pixman_iter_fini_t fini;
155		bits_image_t *image;
156		uint32_t * buffer;
157		int width;
158		int height;
159		void * data;
160		};
161
162		typedef struct
163		{
164		int y;
165		uint64_t * buffer;
166		} line_t;
167
168		typedef struct
169		{
170		line_t lines[2];
171		pixman_fixed_t y;
172		pixman_fixed_t x;
173		uint64_t data[1];
174		} bilinear_info_t;
175
176		static void
177		ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
178		int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
179	0	{
180	0	uint32_t *bits = image->bits + y * image->rowstride;
181	0	__m128i vx = _mm_set_epi16 (
182	0	- (x + 1), x, - (x + 1), x,
183	0	- (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
184	0	__m128i vux = _mm_set_epi16 (
185	0	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
186	0	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
187	0	__m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
188	0	__m128i *b = (__m128i *)line->buffer;
189	0	__m128i vrl0, vrl1;
190	0
191	0	while ((n -= 2) >= 0)
192	0	{
193	0	__m128i vw, vr, s;
194		#ifdef HACKY_PADDING
195		if (pixman_fixed_to_int(x + ux) >= image->rowstride) {
196		vrl1 = _mm_setzero_si128();
197		printf("overread 2loop\n");
198		} else {
199		if (pixman_fixed_to_int(x + ux) < 0)
200		printf("underflow\n");
201		vrl1 = _mm_loadl_epi64(
202		(__m128i *)(bits + (pixman_fixed_to_int(x + ux) < 0 ? 0 : pixman_fixed_to_int(x + ux))));
203		}
204		#else
205		vrl1 = _mm_loadl_epi64(
206	0	(__m128i *)(bits + pixman_fixed_to_int(x + ux)));
207	0	#endif
208	0	/* vrl1: R1, L1 */
209	0
210	0	final_pixel:
211		#ifdef HACKY_PADDING
212		vrl0 = _mm_loadl_epi64 (
213		(__m128i *)(bits + (pixman_fixed_to_int (x) < 0 ? 0 : pixman_fixed_to_int (x))));
214		#else
215		vrl0 = _mm_loadl_epi64 (
216	0	(__m128i *)(bits + pixman_fixed_to_int (x)));
217	0	#endif
218	0	/* vrl0: R0, L0 */
219	0
220	0	/* The weights are based on vx which is a vector of
221	0	*
222	0	* - (x + 1), x, - (x + 1), x,
223	0	* - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
224	0	*
225	0	* so the 16 bit weights end up like this:
226	0	*
227	0	* iw0, w0, iw0, w0, iw1, w1, iw1, w1
228	0	*
229	0	* and after shifting and packing, we get these bytes:
230	0	*
231	0	* iw0, w0, iw0, w0, iw1, w1, iw1, w1,
232	0	* iw0, w0, iw0, w0, iw1, w1, iw1, w1,
233	0	*
234	0	* which means the first and the second input pixel
235	0	* have to be interleaved like this:
236	0	*
237	0	* la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
238	0	* lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
239	0	*
240	0	* before maddubsw can be used.
241	0	*/
242	0
243	0	vw = _mm_add_epi16 (
244	0	vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
245	0	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
246	0	*/
247	0
248	0	vw = _mm_packus_epi16 (vw, vw);
249	0	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
250	0	* iw0, w0, iw0, w0, iw1, w1, iw1, w1
251	0	*/
252	0	vx = _mm_add_epi16 (vx, vux);
253	0
254	0	x += 2 * ux;
255	0
256	0	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
257	0	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
258	0
259	0	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
260	0	/* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
261	0
262	0	vr = _mm_unpackhi_epi8 (vr, s);
263	0	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
264	0	* lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
265	0	*/
266	0
267	0	vr = _mm_maddubs_epi16 (vr, vw);
268	0
269	0	/* When the weight is 0, the inverse weight is
270	0	* 128 which can't be represented in a signed byte.
271	0	* As a result maddubsw computes the following:
272	0	*
273	0	* r = l * -128 + r * 0
274	0	*
275	0	* rather than the desired
276	0	*
277	0	* r = l * 128 + r * 0
278	0	*
279	0	* We fix this by taking the absolute value of the
280	0	* result.
281	0	*/
282	0	// we can drop this if we use lower precision
283	0
284	0	vr = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (2, 0, 3, 1));
285	0	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
286	0	_mm_store_si128 (b++, vr);
287	0	}
288	0
289	0	if (n == -1)
290	0	{
291	0	vrl1 = _mm_setzero_si128();
292	0	goto final_pixel;
293	0	}
294	0
295	0	line->y = y;
296	0	}
297
298		// scale a line of destination pixels
299		static uint32_t *
300		ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
301	0	{
302	0	pixman_fixed_t fx, ux;
303	0	bilinear_info_t *info = iter->data;
304	0	line_t *line0, *line1;
305	0	int y0, y1;
306	0	int32_t dist_y;
307	0	__m128i vw, uvw;
308	0	int i;
309	0
310	0	fx = info->x;
311	0	ux = iter->image->transform->matrix[0][0];
312	0
313	0	y0 = pixman_fixed_to_int (info->y);
314	0	if (y0 < 0)
315	0	*(volatile char*)0 = 9;
316	0	y1 = y0 + 1;
317	0
318	0	// clamping in y direction
319	0	if (y1 >= iter->height) {
320	0	y1 = iter->height - 1;
321	0	}
322	0
323	0	line0 = &info->lines[y0 & 0x01];
324	0	line1 = &info->lines[y1 & 0x01];
325	0
326	0	if (line0->y != y0)
327	0	{
328	0	ssse3_fetch_horizontal (
329	0	iter->image, line0, y0, fx, ux, iter->width);
330	0	}
331	0
332	0	if (line1->y != y1)
333	0	{
334	0	ssse3_fetch_horizontal (
335	0	iter->image, line1, y1, fx, ux, iter->width);
336	0	}
337	0
338		#ifdef PIXMAN_STYLE_INTERPOLATION
339		dist_y = pixman_fixed_to_bilinear_weight (info->y);
340		dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
341
342		vw = _mm_set_epi16 (
343		dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
344
345		#else
346		// setup the weights for the top (vw) and bottom (uvw) lines
347	0	dist_y = pixman_fixed_to_bilinear_weight (info->y);
348	0	// we use 15 instead of 16 because we need an extra bit to handle when the weights are 0 and 1
349	0	dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
350	0
351	0	vw = _mm_set_epi16 (
352	0	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
353	0
354	0
355	0	dist_y = (1 << BILINEAR_INTERPOLATION_BITS) - pixman_fixed_to_bilinear_weight (info->y);
356	0	dist_y <<= (15 - BILINEAR_INTERPOLATION_BITS);
357	0	uvw = _mm_set_epi16 (
358	0	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
359	0	#endif
360	0
361	0	for (i = 0; i + 3 < iter->width; i += 4)
362	0	{
363	0	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
364	0	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
365	0	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
366	0	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
367		#ifdef PIXMAN_STYLE_INTERPOLATION
368		__m128i r0, r1, tmp, p;
369
370		r0 = _mm_mulhi_epu16 (
371		_mm_sub_epi16 (bot0, top0), vw);
372		tmp = _mm_cmplt_epi16 (bot0, top0);
373		tmp = _mm_and_si128 (tmp, vw);
374		r0 = _mm_sub_epi16 (r0, tmp);
375		r0 = _mm_add_epi16 (r0, top0);
376		r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
377		/* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
378		//r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
379		/* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
380
381		// tmp = bot1 < top1 ? vw : 0;
382		// r1 = (bot1 - top1)*vw + top1 - tmp
383		// r1 = bot1*vw - vw*top1 + top1 - tmp
384		// r1 = bot1*vw + top1 - vw*top1 - tmp
385		// r1 = bot1*vw + top1*(1 - vw) - tmp
386		r1 = _mm_mulhi_epu16 (
387		_mm_sub_epi16 (bot1, top1), vw);
388		tmp = _mm_cmplt_epi16 (bot1, top1);
389		tmp = _mm_and_si128 (tmp, vw);
390		r1 = _mm_sub_epi16 (r1, tmp);
391		r1 = _mm_add_epi16 (r1, top1);
392		r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
393		//r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
394		/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
395		#else
396		__m128i r0, r1, p;
397	0	top0 = _mm_mulhi_epu16 (top0, uvw);
398	0	bot0 = _mm_mulhi_epu16 (bot0, vw);
399	0	r0 = _mm_add_epi16(top0, bot0);
400	0	r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1);
401	0
402	0	top1 = _mm_mulhi_epu16 (top1, uvw);
403	0	bot1 = _mm_mulhi_epu16 (bot1, vw);
404	0	r1 = _mm_add_epi16(top1, bot1);
405	0	r1 = _mm_srli_epi16(r1, BILINEAR_INTERPOLATION_BITS-1);
406	0	#endif
407	0
408	0	p = _mm_packus_epi16 (r0, r1);
409	0	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
410	0	}
411	0
412	0	while (i < iter->width)
413	0	{
414	0	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
415	0	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
416	0
417		#ifdef PIXMAN_STYLE_INTERPOLATION
418		__m128i r0, tmp, p;
419		r0 = _mm_mulhi_epu16 (
420		_mm_sub_epi16 (bot0, top0), vw);
421		tmp = _mm_cmplt_epi16 (bot0, top0);
422		tmp = _mm_and_si128 (tmp, vw);
423		r0 = _mm_sub_epi16 (r0, tmp);
424		r0 = _mm_add_epi16 (r0, top0);
425		r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
426		/* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
427		r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
428		/* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
429		#else
430		__m128i r0, p;
431	0	top0 = _mm_mulhi_epu16 (top0, uvw);
432	0	bot0 = _mm_mulhi_epu16 (bot0, vw);
433	0	r0 = _mm_add_epi16(top0, bot0);
434	0	r0 = _mm_srli_epi16(r0, BILINEAR_INTERPOLATION_BITS-1);
435	0	#endif
436	0
437	0	p = _mm_packus_epi16 (r0, r0);
438	0
439	0	if (iter->width - i == 1)
440	0	{
441	0	*(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
442	0	i++;
443	0	}
444	0	else
445	0	{
446	0	_mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
447	0	i += 2;
448	0	}
449	0	}
450	0
451	0	info->y += iter->image->transform->matrix[1][1];
452	0
453	0	return iter->buffer;
454	0	}
455
456		static void
457		ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
458	0	{
459	0	free (iter->data);
460	0	}
461
462		static void
463		ssse3_bilinear_cover_iter_init (pixman_iter_t *iter)
464	0	{
465	0	int width = iter->width;
466	0	bilinear_info_t *info;
467	0	pixman_vector_t v;
468	0
469	0	if (iter->x > PIXMAN_FIXED_INT_MAX \|\|
470	0	iter->x < PIXMAN_FIXED_INT_MIN \|\|
471	0	iter->y > PIXMAN_FIXED_INT_MAX \|\|
472	0	iter->y < PIXMAN_FIXED_INT_MIN)
473	0	goto fail;
474	0
475	0	/* Reference point is the center of the pixel */
476	0	v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
477	0	v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
478	0	v.vector[2] = pixman_fixed_1;
479	0
480	0	if (!pixman_transform_point_3d (iter->image->transform, &v))
481	0	goto fail;
482	0
483	0	info = malloc (sizeof (info) + (2 width - 1) * sizeof (uint64_t) + 64);
484	0	if (!info)
485	0	goto fail;
486	0
487	0	info->x = v.vector[0] - pixman_fixed_1 / 2;
488	0	info->y = v.vector[1] - pixman_fixed_1 / 2;
489	0
490	0	#define ALIGN(addr) \
491	0	((void *)((((uintptr_t)(addr)) + 15) & (~15)))
492	0
493	0	/* It is safe to set the y coordinates to -1 initially
494	0	* because COVER_CLIP_BILINEAR ensures that we will only
495	0	* be asked to fetch lines in the [0, height) interval
496	0	*/
497	0	info->lines[0].y = -1;
498	0	info->lines[0].buffer = ALIGN (&(info->data[0]));
499	0	info->lines[1].y = -1;
500	0	info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
501	0
502	0	iter->fini = ssse3_bilinear_cover_iter_fini;
503	0
504	0	iter->data = info;
505	0	return;
506	0
507	0	fail:
508	0	/* Something went wrong, either a bad matrix or OOM; in such cases,
509	0	* we don't guarantee any particular rendering.
510	0	*/
511	0	iter->fini = NULL;
512	0	}
513
514		/* scale the src from src_width/height to dest_width/height drawn
515		* into the rectangle x,y width,height
516		* src_stride and dst_stride are 4 byte units */
517		bool ssse3_scale_data(uint32_t *src, int src_width, int src_height, int src_stride,
518		uint32_t *dest, int dest_width, int dest_height,
519		int dest_stride,
520		int x, int y,
521		int width, int height)
522	0	{
523	0	//XXX: assert(src_width > 1)
524	0	pixman_transform_t transform = {
525	0	{ { pixman_fixed_1, 0, 0 },
526	0	{ 0, pixman_fixed_1, 0 },
527	0	{ 0, 0, pixman_fixed_1 } }
528	0	};
529	0	double width_scale = ((double)src_width)/dest_width;
530	0	double height_scale = ((double)src_height)/dest_height;
531	0	#define AVOID_PADDING
532	0	#ifdef AVOID_PADDING
533	0	// scale up by enough that we don't read outside of the bounds of the source surface
534	0	// currently this is required to avoid reading out of bounds.
535	0	if (width_scale < 1) {
536	0	width_scale = (double)(src_width-1)/dest_width;
537	0	transform.matrix[0][2] = pixman_fixed_1/2;
538	0	}
539	0	if (height_scale < 1) {
540	0	height_scale = (double)(src_height-1)/dest_height;
541	0	transform.matrix[1][2] = pixman_fixed_1/2;
542	0	}
543	0	#endif
544	0	transform.matrix[0][0] = pixman_double_to_fixed(width_scale);
545	0	transform.matrix[1][1] = pixman_double_to_fixed(height_scale);
546	0	transform.matrix[2][2] = pixman_fixed_1;
547	0
548	0	bits_image_t image;
549	0	image.bits = src;
550	0	image.transform = &transform;
551	0	image.rowstride = src_stride;
552	0
553	0	pixman_iter_t iter;
554	0	iter.image = &image;
555	0	iter.x = x;
556	0	iter.y = y;
557	0	iter.width = width;
558	0	iter.height = src_height;
559	0	iter.buffer = dest;
560	0	iter.data = NULL;
561	0
562	0	ssse3_bilinear_cover_iter_init(&iter);
563	0
564	0	if (!iter.fini)
565	0	return false;
566	0
567	0	if (iter.data) {
568	0	for (int iy = 0; iy < height; iy++) {
569	0	ssse3_fetch_bilinear_cover(&iter, NULL);
570	0	iter.buffer += dest_stride;
571	0	}
572	0	ssse3_bilinear_cover_iter_fini(&iter);
573	0	}
574	0	return true;
575	0	}