/work/libde265/libde265/sao.cc

Source
/*
 * H.265 video codec.
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "sao.h"
#include "util.h"

#include <stdlib.h>
#include <string.h>

#include <new>


/* Apply sample adaptive offset (SAO) to one colour component of one CTB.

   Pixels are read from 'in_img' and the modified pixels are written to
   'out_img'; pixels that receive no offset are not written at all.

   img         image providing the SAO parameters, SPS/PPS and per-pixel metadata
   xCtb,yCtb   CTB position, in CTB units
   shdr        slice header of the CTB (not used by this function)
   cIdx        colour component index (0 selects BitDepth_Y, otherwise BitDepth_C)
   nSW,nSH     CTB width/height in samples of component 'cIdx'
   in_img,  in_stride    input plane and its stride, in units of pixel_t
   out_img, out_stride   output plane and its stride, in units of pixel_t

   The function returns early without finishing the CTB when a required
   slice header cannot be looked up.
*/
template <class pixel_t>
void apply_sao_internal(de265_image* img, int xCtb,int yCtb,
                        const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
                        const pixel_t* in_img,  int in_stride,
                        /* */ pixel_t* out_img, int out_stride)
{
  const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);

  // the SAO type of each component is stored as a 2-bit field
  int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;

  logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);

  if (SaoTypeIdx==0) {
    return;
  }

  const seq_parameter_set* sps = &img->get_sps();
  const pic_parameter_set* pps = &img->get_pps();
  const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
  const int maxPixelValue = (1<<bitDepth)-1;

  // top left position of CTB in pixels (of component cIdx)
  const int xC = xCtb*nSW;
  const int yC = yCtb*nSH;

  const int width  = img->get_width(cIdx);
  const int height = img->get_height(cIdx);

  const int picWidthInCtbs = sps->PicWidthInCtbsY;
  const int chromashiftW = sps->get_chroma_shift_W(cIdx);
  const int chromashiftH = sps->get_chroma_shift_H(cIdx);
  const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
  const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;

  // Like all other per-pixel metadata lookups in this function, the slice header
  // lookup needs the component position scaled by the chroma shift. Without the
  // shift, a subsampled chroma component would query the wrong CTB.
  const slice_segment_header* ctbSliceHeader = img->get_SliceHeader(xC<<chromashiftW,
                                                                    yC<<chromashiftH);
  if (ctbSliceHeader==NULL) { return; }

  const int ctbSliceAddrRS = ctbSliceHeader->SliceAddrRS;


  for (int i=0;i<5;i++)
    {
      logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
    }


  // actual size of CTB to be processed (can be smaller when partially outside of image)
  const int ctbW = (xC+nSW>width)  ? width -xC : nSW;
  const int ctbH = (yC+nSH>height) ? height-yC : nSH;


  const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);

  if (SaoTypeIdx==2) {
    // --- offset selected by comparing each pixel with two neighbours ---

    int hPos[2], vPos[2];  // offsets of the two neighbours relative to the current pixel
    int vPosStride[2]; // vPos[] multiplied by image stride
    int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;

    // SaoEoClass is masked to two bits, so these four cases are exhaustive
    switch (SaoEoClass) {
    case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
    case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
    case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
    case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
    }

    vPosStride[0] = vPos[0] * in_stride;
    vPosStride[1] = vPos[1] * in_stride;

    /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
       directly with the sum of the two pixel-difference signs (-2..2, biased by +2).
       Entry [2] belongs to a sign sum of 0 and holds a zero offset. */
    int8_t  saoOffsetVal[5];
    saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
    saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
    saoOffsetVal[2] = 0;
    saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
    saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];


    for (int j=0;j<ctbH;j++) {
      const pixel_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
      /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];

      for (int i=0;i<ctbW;i++) {
        int edgeIdx = -1;  // -1: not decided yet,  0: pixel must stay unmodified

        logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);

        // NOTE(review): 'extendedTests' only guards the PCM test here; the
        // transquant-bypass flag is queried for every pixel. Possibly the intention
        // was to guard both tests - confirm before changing.
        if ((extendedTests &&
             (sps->pcm_loop_filter_disable_flag &&
              img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
            img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
          continue;
        }

        // do the expensive test for boundaries only at the boundaries
        bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);

        if (testBoundary)
          for (int k=0;k<2;k++) {
            int xS = xC+i+hPos[k];
            int yS = yC+j+vPos[k];

            // neighbour outside of the picture
            if (xS<0 || yS<0 || xS>=width || yS>=height) {
              edgeIdx=0;
              break;
            }


            // This part seems inefficient with all the get_SliceHeaderIndex() calls,
            // but removing this part (because the input was known to have only a single
            // slice anyway) reduced computation time only by 1.3%.
            // TODO: however, this may still be a big part of SAO itself.

            const slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
                                                                           yS<<chromashiftH);
            if (sliceHeader==NULL) { return; }

            int sliceAddrRS = sliceHeader->SliceAddrRS;

            // Neighbour in a slice with a smaller address: the flag of the slice
            // containing the current pixel decides.
            if (sliceAddrRS <  ctbSliceAddrRS) {
              const slice_segment_header* curHeader = img->get_SliceHeader((xC+i)<<chromashiftW,
                                                                           (yC+j)<<chromashiftH);
              if (curHeader==NULL) { return; }

              if (curHeader->slice_loop_filter_across_slices_enabled_flag==0) {
                edgeIdx=0;
                break;
              }
            }

            // Neighbour in a slice with a larger address: the flag of the
            // neighbour's slice decides.
            if (sliceAddrRS >  ctbSliceAddrRS &&
                sliceHeader->slice_loop_filter_across_slices_enabled_flag==0) {
              edgeIdx=0;
              break;
            }


            // neighbour in a different tile while filtering across tiles is disabled
            if (pps->loop_filter_across_tiles_enabled_flag==0 &&
                pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
                pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
              edgeIdx=0;
              break;
            }
          }

        if (edgeIdx != 0) {

          edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
                      Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]])   );

          // No special case for edgeIdx==0: it seems to be faster without this
          // check, as the offset table holds a zero for it.
          int offset = saoOffsetVal[edgeIdx+2];

          out_ptr[i] = Clip3(0,maxPixelValue,
                             in_ptr[i] + offset);
        }
      }
    }
  }
  else {
    // --- offset selected by the value band the pixel falls into ---

    int bandShift = bitDepth-5;  // the five most significant bits select one of 32 bands
    int saoLeftClass = saoinfo->sao_band_position[cIdx];
    logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);

    // Maps band number -> offset index + 1 (0 means: no offset for this band).
    // The four bands starting at saoLeftClass (wrapping around at 32) get offsets.
    int bandTable[32];
    memset(bandTable, 0, sizeof(int)*32);

    for (int k=0;k<4;k++) {
      bandTable[ (k+saoLeftClass)&31 ] = k+1;
    }


    /* If PCM or transquant_bypass is used in this CTB, we have to
       run all checks (A).
       Otherwise, we run a simplified version of the code (B).

       NOTE: this whole part of SAO does not seem to be a significant part of the time spent
    */

    if (extendedTests) {

      // (A) full version with all checks

      for (int j=0;j<ctbH;j++)
        for (int i=0;i<ctbW;i++) {

          if ((sps->pcm_loop_filter_disable_flag &&
               img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
              img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
            continue;
          }

          // Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
          // But this should never happen, because the maximum bit-depth is 16.
          int pixel = in_img[xC + i + (yC + j) * in_stride];

          // Note: the input pixel value should never exceed the valid range, but it seems that it still does,
          // maybe when there was a decoding error and the pixels have not been filled in correctly.
          // Thus, we have to limit the pixel range to ensure that we have no illegal table access.
          pixel = Clip3(0, maxPixelValue, pixel);

          int bandIdx = bandTable[pixel >> bandShift];

          if (bandIdx>0) {
            int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];

            logtrace(LogSAO,"%d %d (%d) offset %d  %x -> %x\n",xC+i,yC+j,bandIdx,
                     offset,
                     in_img[xC+i+(yC+j)*in_stride],
                     in_img[xC+i+(yC+j)*in_stride]+offset);

            out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
                                                    in_img[xC+i+(yC+j)*in_stride] + offset);
          }
        }
    }
    else
      {
        // (B) simplified version (only works if no PCM and transquant_bypass is active)

        for (int j=0;j<ctbH;j++)
          for (int i=0;i<ctbW;i++) {

            int pixel = in_img[xC + i + (yC + j) * in_stride];

            // Note: the input pixel value should never exceed the valid range, but it seems that it still does,
            // maybe when there was a decoding error and the pixels have not been filled in correctly.
            // Thus, we have to limit the pixel range to ensure that we have no illegal table access.
            pixel = Clip3(0, maxPixelValue, pixel);

            int bandIdx = bandTable[pixel >> bandShift];

            if (bandIdx>0) {
              int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];

              out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
                                                      in_img[xC+i+(yC+j)*in_stride] + offset);
            }
          }
      }
  }
}


/* Dispatch SAO processing of one CTB component to the implementation that
   matches the sample storage of component 'cIdx': 16-bit samples when the
   image reports a high bit depth for it, 8-bit samples otherwise.
   All arguments are forwarded unchanged to apply_sao_internal(). */
template <class pixel_t>
void apply_sao(de265_image* img, int xCtb,int yCtb,
               const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
               const pixel_t* in_img,  int in_stride,
               /* */ pixel_t* out_img, int out_stride)
{
  if (!img->high_bit_depth(cIdx)) {
    // 8-bit planes: the pointers are passed on as they are
    apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
                                in_img, in_stride,
                                out_img,out_stride);
    return;
  }

  // high bit depth: reinterpret the planes as 16-bit samples
  const uint16_t* in16  = reinterpret_cast<const uint16_t*>(in_img);
  /* */ uint16_t* out16 = reinterpret_cast<uint16_t*>(out_img);

  apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
                               in16, in_stride,
                               out16,out_stride);
}


/* Apply SAO to the whole image in place.

   A copy of the complete image is taken first and used as the filter input,
   while the results are written back into 'img'.

   Nothing is done when SAO is disabled in the SPS. If the image copy cannot
   be created, a DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY warning is
   added to the decoder context and the image is left unfiltered.
   Processing stops at the first CTB without a slice header. */
void apply_sample_adaptive_offset(de265_image* img)
{
  const seq_parameter_set& sps = img->get_sps();

  if (sps.sample_adaptive_offset_enabled_flag==0) {
    return;
  }

  de265_image inputCopy;
  de265_error err = inputCopy.copy_image(img);
  if (err != DE265_OK) {
    img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
    return;
  }

  const int ctbSize = 1<<sps.Log2CtbSizeY;

  for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
    for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
      {
        const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);

        // Same guard as in the sequential and the multi-threaded variant:
        // without a slice header, the SAO flags cannot be read.
        if (shdr==NULL) {
          return;
        }

        if (shdr->slice_sao_luma_flag) {
          apply_sao(img, xCtb,yCtb, shdr, 0, ctbSize, ctbSize,
                    inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
                    img->get_image_plane(0), img->get_image_stride(0));
        }

        if (shdr->slice_sao_chroma_flag) {
          // CTB size in chroma samples
          int nSW = ctbSize / sps.SubWidthC;
          int nSH = ctbSize / sps.SubHeightC;

          apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH,
                    inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
                    img->get_image_plane(1), img->get_image_stride(1));

          apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH,
                    inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
                    img->get_image_plane(2), img->get_image_stride(2));
        }
      }
}


/* Apply SAO to the whole image in place, one colour plane after the other.

   Only a single temporary buffer, large enough for the biggest processed
   plane, is allocated. Each plane is copied into this buffer, which then
   serves as the filter input while the results are written into 'img'.

   Nothing is done when SAO is disabled in the SPS. If the buffer cannot be
   allocated, a DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY warning is added
   to the decoder context and the image is left unfiltered.
   Processing stops at the first CTB without a slice header. */
void apply_sample_adaptive_offset_sequential(de265_image* img)
{
  const seq_parameter_set& sps = img->get_sps();

  if (sps.sample_adaptive_offset_enabled_flag==0) {
    return;
  }

  int nChannels = 3;
  if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; }

  // Size of the largest plane that will be processed. Computed in size_t so
  // that the product cannot overflow 'int' for large images.
  size_t bufferSize = 0;
  for (int cIdx=0;cIdx<nChannels;cIdx++) {
    const size_t planeSize = (static_cast<size_t>(img->get_image_stride(cIdx)) *
                              img->get_height(cIdx) *
                              img->get_bytes_per_pixel(cIdx));
    if (planeSize > bufferSize) {
      bufferSize = planeSize;
    }
  }

  // A plain new[] throws on failure instead of returning NULL. Use the
  // non-throwing form so that the out-of-memory warning below can be reached.
  uint8_t* inputCopy = new (std::nothrow) uint8_t[bufferSize];
  if (inputCopy == NULL) {
    img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
    return;
  }

  const int ctbSize = 1<<sps.Log2CtbSizeY;

  for (int cIdx=0;cIdx<nChannels;cIdx++) {

    int stride = img->get_image_stride(cIdx);
    int height = img->get_height(cIdx);

    // snapshot of the unfiltered plane, used as SAO input
    memcpy(inputCopy, img->get_image_plane(cIdx),
           static_cast<size_t>(stride) * height * img->get_bytes_per_pixel(cIdx));

    for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
      for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
        {
          const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
          if (shdr==NULL) {
            delete[] inputCopy;
            return;
          }

          if (cIdx==0 && shdr->slice_sao_luma_flag) {
            apply_sao(img, xCtb,yCtb, shdr, 0, ctbSize, ctbSize,
                      inputCopy, stride,
                      img->get_image_plane(0), img->get_image_stride(0));
          }

          if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
            // CTB size in chroma samples
            int nSW = ctbSize / sps.SubWidthC;
            int nSH = ctbSize / sps.SubHeightC;

            apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH,
                      inputCopy, stride,
                      img->get_image_plane(cIdx), img->get_image_stride(cIdx));
          }
        }
  }

  delete[] inputCopy;
}




/* Thread task that applies SAO to a single row of CTBs. */
class thread_task_sao : public thread_task
{
public:
  int  ctb_y;       // CTB row processed by this task

  de265_image* img; /* this is where we get the SPS from
                       (either inputImg or outputImg can be a dummy image)
                    */

  de265_image* inputImg;   // image the unfiltered pixels are read from
  de265_image* outputImg;  // image receiving the filtered pixels
  int inputProgress;       // progress value waited for before the row is processed

  virtual void work();

  // Label of the task: "sao-" followed by the CTB row number.
  virtual std::string name() const {
    char label[100];
    sprintf(label,"sao-%d",ctb_y);
    return std::string(label);
  }
};


void thread_task_sao::work()
{
  state = Running;
  img->thread_run(this);

  const seq_parameter_set& sps = img->get_sps();

  const int rightCtb = sps.PicWidthInCtbsY-1;
  const int ctbSize  = (1<<sps.Log2CtbSizeY);


  // wait until also the CTB-rows below and above are ready

  img->wait_for_progress(this, rightCtb,ctb_y,  inputProgress);

  if (ctb_y>0) {
    img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
  }

  if (ctb_y+1<sps.PicHeightInCtbsY) {
    img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
  }


  // copy input image to output for this CTB-row

  outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize);


  // process SAO in the CTB-row

  for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
    {
      const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
      if (shdr==NULL) {
        break;
      }

      if (shdr->slice_sao_luma_flag) {
        apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize,
                  inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
                  outputImg->get_image_plane(0), outputImg->get_image_stride(0));
      }

      if (shdr->slice_sao_chroma_flag) {
        int nSW = ctbSize / sps.SubWidthC;
        int nSH = ctbSize / sps.SubHeightC;

        apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH,
                  inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
                  outputImg->get_image_plane(1), outputImg->get_image_stride(1));

        apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH,
                  inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
                  outputImg->get_image_plane(2), outputImg->get_image_stride(2));
      }
    }


  // mark SAO progress

  for (int x=0;x<=rightCtb;x++) {
    const int CtbWidth = sps.PicWidthInCtbsY;
    img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO);
  }


  state = Finished;
  img->thread_finishes(this);
}


/* Create one SAO task per CTB row of the image unit's picture, hand the tasks
   to the decoder's thread pool and wait until all of them have completed.
   The tasks write into imgunit->sao_output, whose pixel data is exchanged
   with the picture at the end.

   Returns false when SAO is disabled in the SPS or when the output image
   could not be allocated (a DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY
   warning is added in that case), true otherwise. */
bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
{
  de265_image* picture = imgunit->img;
  const seq_parameter_set& sps = picture->get_sps();

  if (sps.sample_adaptive_offset_enabled_flag==0) {
    return false;
  }

  decoder_context* ctx = picture->decctx;

  // Allocate the image that receives the filtered pixels.
  const de265_error allocResult =
    imgunit->sao_output.alloc_image(picture->get_width(), picture->get_height(),
                                    picture->get_chroma_format(),
                                    picture->get_shared_sps(),
                                    false,
                                    ctx, //img->encctx,
                                    picture->pts, picture->user_data, true);

  if (allocResult != DE265_OK) {
    ctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
    return false;
  }

  // one task for each CTB row
  const int rowCount = sps.PicHeightInCtbsY;

  picture->thread_start(rowCount);

  for (int row=0; row<rowCount; row++)
    {
      thread_task_sao* rowTask = new thread_task_sao;

      rowTask->ctb_y         = row;
      rowTask->img           = picture;
      rowTask->inputImg      = picture;
      rowTask->outputImg     = &imgunit->sao_output;
      rowTask->inputProgress = saoInputProgress;

      imgunit->tasks.push_back(rowTask);
      add_task(&ctx->thread_pool_, rowTask);
    }

  /* Currently need barrier here because when are finished, we have to swap the pixel
     data back into the main image. */
  picture->wait_for_completion();

  picture->exchange_pixel_data_with(imgunit->sao_output);

  return true;
}

Coverage Report

Created: 2026-03-08 06:41

Line	Count	Source
1		/*
2		* H.265 video codec.
3		* Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4		*
5		* This file is part of libde265.
6		*
7		* libde265 is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as
9		* published by the Free Software Foundation, either version 3 of
10		* the License, or (at your option) any later version.
11		*
12		* libde265 is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libde265. If not, see <http://www.gnu.org/licenses/>.
19		*/
20
21		#include "sao.h"
22		#include "util.h"
23
24		#include <stdlib.h>
25		#include <string.h>
26
27
28		template <class pixel_t>
29		void apply_sao_internal(de265_image* img, int xCtb,int yCtb,
30		const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
31		const pixel_t* in_img, int in_stride,
32		/* */ pixel_t* out_img, int out_stride)
33	0	{
34	0	const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
35
36	0	int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
37
38	0	logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
39
40	0	if (SaoTypeIdx==0) {
41	0	return;
42	0	}
43
44	0	const seq_parameter_set* sps = &img->get_sps();
45	0	const pic_parameter_set* pps = &img->get_pps();
46	0	const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
47	0	const int maxPixelValue = (1<<bitDepth)-1;
48
49		// top left position of CTB in pixels
50	0	const int xC = xCtb*nSW;
51	0	const int yC = yCtb*nSH;
52
53	0	const int width = img->get_width(cIdx);
54	0	const int height = img->get_height(cIdx);
55
56	0	const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
57
58	0	const int picWidthInCtbs = sps->PicWidthInCtbsY;
59	0	const int chromashiftW = sps->get_chroma_shift_W(cIdx);
60	0	const int chromashiftH = sps->get_chroma_shift_H(cIdx);
61	0	const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
62	0	const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
63
64
65	0	for (int i=0;i<5;i++)
66	0	{
67	0	logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
68	0	}
69
70
71		// actual size of CTB to be processed (can be smaller when partially outside of image)
72	0	const int ctbW = (xC+nSW>width) ? width -xC : nSW;
73	0	const int ctbH = (yC+nSH>height) ? height-yC : nSH;
74
75
76	0	const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
77
78	0	if (SaoTypeIdx==2) {
79	0	int hPos[2], vPos[2];
80	0	int vPosStride[2]; // vPos[] multiplied by image stride
81	0	int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
82
83	0	switch (SaoEoClass) {
84	0	case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
85	0	case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
86	0	case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
87	0	case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
88	0	}
89
90	0	vPosStride[0] = vPos[0] * in_stride;
91	0	vPosStride[1] = vPos[1] * in_stride;
92
93		/* Reorder sao_info.saoOffsetVal[] array, so that we can index it
94		directly with the sum of the two pixel-difference signs. */
95	0	int8_t saoOffsetVal[5]; // [2] unused
96	0	saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
97	0	saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
98	0	saoOffsetVal[2] = 0;
99	0	saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
100	0	saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
101
102
103	0	for (int j=0;j<ctbH;j++) {
104	0	const pixel_t* in_ptr = &in_img [xC+(yC+j)*in_stride];
105	0	/* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
106
107	0	for (int i=0;i<ctbW;i++) {
108	0	int edgeIdx = -1;
109
110	0	logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
111
112	0	if ((extendedTests &&
113	0	(sps->pcm_loop_filter_disable_flag &&
114	0	img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
115	0	img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
116	0	continue;
117	0	}
118
119		// do the expensive test for boundaries only at the boundaries
120	0	bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
121
122	0	if (testBoundary)
123	0	for (int k=0;k<2;k++) {
124	0	int xS = xC+i+hPos[k];
125	0	int yS = yC+j+vPos[k];
126
127	0	if (xS<0 || yS<0 || xS>=width || yS>=height) {
128	0	edgeIdx=0;
129	0	break;
130	0	}
131
132
133		// This part seems inefficient with all the get_SliceHeaderIndex() calls,
134		// but removing this part (because the input was known to have only a single
135		// slice anyway) reduced computation time only by 1.3%.
136		// TODO: however, this may still be a big part of SAO itself.
137
138	0	slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
139	0	yS<<chromashiftH);
140	0	if (sliceHeader==NULL) { return; }
141
142	0	int sliceAddrRS = sliceHeader->SliceAddrRS;
143	0	if (sliceAddrRS < ctbSliceAddrRS &&
144	0	img->get_SliceHeader((xC+i)<<chromashiftW,
145	0	(yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
146	0	edgeIdx=0;
147	0	break;
148	0	}
149
150	0	if (sliceAddrRS > ctbSliceAddrRS &&
151	0	img->get_SliceHeader(xS<<chromashiftW,
152	0	yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
153	0	edgeIdx=0;
154	0	break;
155	0	}
156
157
158	0	if (pps->loop_filter_across_tiles_enabled_flag==0 &&
159	0	pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
160	0	pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
161	0	edgeIdx=0;
162	0	break;
163	0	}
164	0	}
165
166	0	if (edgeIdx != 0) {
167
168	0	edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
169	0	Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) );
170
171	0	if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table)
172	0	int offset = saoOffsetVal[edgeIdx+2];
173
174	0	out_ptr[i] = Clip3(0,maxPixelValue,
175	0	in_ptr[i] + offset);
176	0	}
177	0	}
178	0	}
179	0	}
180	0	}
181	0	else {
182	0	int bandShift = bitDepth-5;
183	0	int saoLeftClass = saoinfo->sao_band_position[cIdx];
184	0	logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
185
186	0	int bandTable[32];
187	0	memset(bandTable, 0, sizeof(int)*32);
188
189	0	for (int k=0;k<4;k++) {
190	0	bandTable[ (k+saoLeftClass)&31 ] = k+1;
191	0	}
192
193
194		/* If PCM or transquant_bypass is used in this CTB, we have to
195		run all checks (A).
196		Otherwise, we run a simplified version of the code (B).
197
198		NOTE: this whole part of SAO does not seem to be a significant part of the time spent
199		*/
200
201	0	if (extendedTests) {
202
203		// (A) full version with all checks
204
205	0	for (int j=0;j<ctbH;j++)
206	0	for (int i=0;i<ctbW;i++) {
207
208	0	if ((sps->pcm_loop_filter_disable_flag &&
209	0	img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
210	0	img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
211	0	continue;
212	0	}
213
214		// Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
215		// But this should never happen, because the maximum bit-depth is 16.
216	0	int pixel = in_img[xC + i + (yC + j) * in_stride];
217
218		// Note: the input pixel value should never exceed the valid range, but it seems that it still does,
219		// maybe when there was a decoding error and the pixels have not been filled in correctly.
220		// Thus, we have to limit the pixel range to ensure that we have no illegal table access.
221	0	pixel = Clip3(0, maxPixelValue, pixel);
222
223	0	int bandIdx = bandTable[pixel >> bandShift];
224
225	0	if (bandIdx>0) {
226	0	int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
227
228	0	logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx,
229	0	offset,
230	0	in_img[xC+i+(yC+j)*in_stride],
231	0	in_img[xC+i+(yC+j)*in_stride]+offset);
232
233	0	out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
234	0	in_img[xC+i+(yC+j)*in_stride] + offset);
235	0	}
236	0	}
237	0	}
238	0	else
239	0	{
240		// (B) simplified version (only works if no PCM and transquant_bypass is active)
241
242	0	for (int j=0;j<ctbH;j++)
243	0	for (int i=0;i<ctbW;i++) {
244
245	0	int pixel = in_img[xC + i + (yC + j) * in_stride];
246
247		// Note: the input pixel value should never exceed the valid range, but it seems that it still does,
248		// maybe when there was a decoding error and the pixels have not been filled in correctly.
249		// Thus, we have to limit the pixel range to ensure that we have no illegal table access.
250	0	pixel = Clip3(0, maxPixelValue, pixel);
251
252	0	int bandIdx = bandTable[pixel >> bandShift];
253
254	0	if (bandIdx>0) {
255	0	int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
256
257	0	out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
258	0	in_img[xC+i+(yC+j)*in_stride] + offset);
259	0	}
260	0	}
261	0	}
262	0	}
263	0	} Unexecuted instantiation: void apply_sao_internal<unsigned short>(de265_image*, int, int, slice_segment_header const*, int, int, int, unsigned short const*, int, unsigned short*, int) Unexecuted instantiation: void apply_sao_internal<unsigned char>(de265_image*, int, int, slice_segment_header const*, int, int, int, unsigned char const*, int, unsigned char*, int)
264
265
266		template <class pixel_t>
267		void apply_sao(de265_image* img, int xCtb,int yCtb,
268		const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
269		const pixel_t* in_img, int in_stride,
270		/* */ pixel_t* out_img, int out_stride)
271	0	{
272	0	if (img->high_bit_depth(cIdx)) {
273	0	apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
274	0	(uint16_t*)in_img, in_stride,
275	0	(uint16_t*)out_img,out_stride);
276	0	}
277	0	else {
278	0	apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
279	0	in_img, in_stride,
280	0	out_img,out_stride);
281	0	}
282	0	}
283
284
285		void apply_sample_adaptive_offset(de265_image* img)
286	0	{
287	0	const seq_parameter_set& sps = img->get_sps();
288
289	0	if (sps.sample_adaptive_offset_enabled_flag==0) {
290	0	return;
291	0	}
292
293	0	de265_image inputCopy;
294	0	de265_error err = inputCopy.copy_image(img);
295	0	if (err != DE265_OK) {
296	0	img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
297	0	return;
298	0	}
299
300	0	for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
301	0	for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
302	0	{
303	0	const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
304
305	0	if (shdr->slice_sao_luma_flag) {
306	0	apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
307	0	inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
308	0	img->get_image_plane(0), img->get_image_stride(0));
309	0	}
310
311	0	if (shdr->slice_sao_chroma_flag) {
312	0	int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
313	0	int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
314
315	0	apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH,
316	0	inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
317	0	img->get_image_plane(1), img->get_image_stride(1));
318
319	0	apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH,
320	0	inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
321	0	img->get_image_plane(2), img->get_image_stride(2));
322	0	}
323	0	}
324	0	}
325
326
327		void apply_sample_adaptive_offset_sequential(de265_image* img)
328	0	{
329	0	const seq_parameter_set& sps = img->get_sps();
330
331	0	if (sps.sample_adaptive_offset_enabled_flag==0) {
332	0	return;
333	0	}
334
335	0	int lumaImageSize = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0);
336	0	int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1);
337
338	0	uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ];
339	0	if (inputCopy == NULL) {
340	0	img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
341	0	return;
342	0	}
343
344
345	0	int nChannels = 3;
346	0	if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; }
347
348	0	for (int cIdx=0;cIdx<nChannels;cIdx++) {
349
350	0	int stride = img->get_image_stride(cIdx);
351	0	int height = img->get_height(cIdx);
352
353	0	memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx));
354
355	0	for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
356	0	for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
357	0	{
358	0	const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
359	0	if (shdr==NULL) {
360	0	delete[] inputCopy;
361	0	return;
362	0	}
363
364	0	if (cIdx==0 && shdr->slice_sao_luma_flag) {
365	0	apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
366	0	inputCopy, stride,
367	0	img->get_image_plane(0), img->get_image_stride(0));
368	0	}
369
370	0	if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
371	0	int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
372	0	int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
373
374	0	apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH,
375	0	inputCopy, stride,
376	0	img->get_image_plane(cIdx), img->get_image_stride(cIdx));
377	0	}
378	0	}
379	0	}
380
381	0	delete[] inputCopy;
382	0	}
383
384
385
386
387		class thread_task_sao : public thread_task
388		{
389		public:
390		int ctb_y;
391		de265_image* img; /* this is where we get the SPS from
392		(either inputImg or outputImg can be a dummy image)
393		*/
394
395		de265_image* inputImg;
396		de265_image* outputImg;
397		int inputProgress;
398
399		virtual void work();
400	0	virtual std::string name() const {
401	0	char buf[100];
402	0	sprintf(buf,"sao-%d",ctb_y);
403	0	return buf;
404	0	}
405		};
406
407
408		void thread_task_sao::work()
409	0	{
410	0	state = Running;
411	0	img->thread_run(this);
412
413	0	const seq_parameter_set& sps = img->get_sps();
414
415	0	const int rightCtb = sps.PicWidthInCtbsY-1;
416	0	const int ctbSize = (1<<sps.Log2CtbSizeY);
417
418
419		// wait until also the CTB-rows below and above are ready
420
421	0	img->wait_for_progress(this, rightCtb,ctb_y, inputProgress);
422
423	0	if (ctb_y>0) {
424	0	img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
425	0	}
426
427	0	if (ctb_y+1<sps.PicHeightInCtbsY) {
428	0	img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
429	0	}
430
431
432		// copy input image to output for this CTB-row
433
434	0	outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize);
435
436
437		// process SAO in the CTB-row
438
439	0	for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
440	0	{
441	0	const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
442	0	if (shdr==NULL) {
443	0	break;
444	0	}
445
446	0	if (shdr->slice_sao_luma_flag) {
447	0	apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize,
448	0	inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
449	0	outputImg->get_image_plane(0), outputImg->get_image_stride(0));
450	0	}
451
452	0	if (shdr->slice_sao_chroma_flag) {
453	0	int nSW = ctbSize / sps.SubWidthC;
454	0	int nSH = ctbSize / sps.SubHeightC;
455
456	0	apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH,
457	0	inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
458	0	outputImg->get_image_plane(1), outputImg->get_image_stride(1));
459
460	0	apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH,
461	0	inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
462	0	outputImg->get_image_plane(2), outputImg->get_image_stride(2));
463	0	}
464	0	}
465
466
467		// mark SAO progress
468
469	0	for (int x=0;x<=rightCtb;x++) {
470	0	const int CtbWidth = sps.PicWidthInCtbsY;
471	0	img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO);
472	0	}
473
474
475	0	state = Finished;
476	0	img->thread_finishes(this);
477	0	}
478
479
480		bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
481	0	{
482	0	de265_image* img = imgunit->img;
483	0	const seq_parameter_set& sps = img->get_sps();
484
485	0	if (sps.sample_adaptive_offset_enabled_flag==0) {
486	0	return false;
487	0	}
488
489
490	0	decoder_context* ctx = img->decctx;
491
492	0	de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(),
493	0	img->get_chroma_format(),
494	0	img->get_shared_sps(),
495	0	false,
496	0	img->decctx, //img->encctx,
497	0	img->pts, img->user_data, true);
498	0	if (err != DE265_OK) {
499	0	img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
500	0	return false;
501	0	}
502
503	0	int nRows = sps.PicHeightInCtbsY;
504
505	0	int n=0;
506	0	img->thread_start(nRows);
507
508	0	for (int y=0;y<nRows;y++)
509	0	{
510	0	thread_task_sao* task = new thread_task_sao;
511
512	0	task->inputImg = img;
513	0	task->outputImg = &imgunit->sao_output;
514	0	task->img = img;
515	0	task->ctb_y = y;
516	0	task->inputProgress = saoInputProgress;
517
518	0	imgunit->tasks.push_back(task);
519	0	add_task(&ctx->thread_pool_, task);
520	0	n++;
521	0	}
522
523		/* Currently need barrier here because when are finished, we have to swap the pixel
524		data back into the main image. */
525	0	img->wait_for_completion();
526
527	0	img->exchange_pixel_data_with(imgunit->sao_output);
528
529	0	return true;
530	0	}