Coverage Report

Created: 2024-05-04 12:45

/proc/self/cwd/external/gemmlowp/internal/unpack.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// unpack.h: unpacking the result blocks computed by compute.h,
16
// storing them into the destination matrix.
17
18
#ifndef GEMMLOWP_INTERNAL_UNPACK_H_
19
#define GEMMLOWP_INTERNAL_UNPACK_H_
20
21
#include "allocator.h"
22
#include "block_params.h"
23
#include "output.h"
24
#include "pack.h"
25
26
#include <cmath>
27
28
namespace gemmlowp {
29
30
// PackedResult holds the int32 accumulator buffer for one result block.
//
// The storage is not owned directly: the constructor reserves
// l2_rows * l2_cols std::int32_t entries from the given Allocator and keeps
// only the returned handle. Map() resolves that handle to a pointer on every
// call and wraps it in a column-major MatrixMap.
//
// Both the allocator (held by pointer) and the BlockParams (held by
// reference) must outlive this object.
class PackedResult {
31
 public:
32
  // Records the allocator and block parameters, then reserves room for
  // l2_rows * l2_cols int32 entries. Only a handle is stored here; the
  // pointer itself is looked up later, in Map().
  PackedResult(Allocator* _allocator, const BlockParams& _block_params)
33
0
      : allocator_(_allocator), block_params_(_block_params) {
34
0
    matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows *
35
0
                                                       block_params_.l2_cols);
36
0
  }
37
38
0
  // Nothing to release here: this class never frees the reserved storage.
  ~PackedResult() {}
39
40
0
  // Returns a mutable column-major view of the reserved buffer with
  // l2_rows rows and l2_cols columns. The last constructor argument is
  // l2_rows as well (presumably the column stride, i.e. a dense layout --
  // confirm against MatrixMap's constructor).
  MatrixMap<std::int32_t, MapOrder::ColMajor> Map() {
41
0
    return MatrixMap<std::int32_t, MapOrder::ColMajor>(
42
0
        allocator_->GetPointer<std::int32_t>(matrix_handle_),
43
0
        block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
44
0
  }
45
46
0
  // Read-only counterpart of Map(): same dimensions, const element type.
  MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const {
47
0
    return MatrixMap<const std::int32_t, MapOrder::ColMajor>(
48
0
        allocator_->GetPointer<const std::int32_t>(matrix_handle_),
49
0
        block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
50
0
  }
51
52
 private:
53
  // Allocator that the buffer was reserved from (not owned).
  Allocator* allocator_;
54
  // Handle returned by Allocator::Reserve for the int32 buffer.
  Allocator::Handle matrix_handle_;
55
  // Source of l2_rows / l2_cols; a reference, so the referent must stay alive.
  const BlockParams& block_params_;
56
};
57
58
// Describes a rectangular sub-block of a matrix by its top-left corner
// (start_row, start_col) and its extent (rows, cols). All four values are
// plain ints and are stored exactly as passed; no validation is done here.
struct MatrixBlockBounds {
59
  // Row index of the block's first row.
  int start_row;
60
  // Column index of the block's first column.
  int start_col;
61
  // Number of rows in the block.
  int rows;
62
  // Number of columns in the block.
  int cols;
63
64
  // Member-wise initialization from the four arguments.
  MatrixBlockBounds(int start_row_, int start_col_, int rows_, int cols_)
65
      : start_row(start_row_),
66
        start_col(start_col_),
67
        rows(rows_),
68
0
        cols(cols_) {}
69
};
70
71
// Issues Prefetch() hints for a Rows x Cols block of `src` starting at
// (src_row, src_col), and for the matching Rows entries of
// `lhs_sums_of_each_slice` starting at src_row.
//
// Template parameters:
//   Rows, Cols  - dimensions of the block to prefetch.
//   SrcMapType  - source map type; must provide data(row, col) returning a
//                 pointer to int32 and stride().
//
// One hint is issued per group of 4 consecutive int32 entries. Source
// addresses are computed as base + r + c * stride, i.e. rows are assumed to
// be adjacent in memory and columns `stride` entries apart.
// No data is read or written by this function itself; it only computes
// addresses and hands them to Prefetch().
template <int Rows, int Cols, typename SrcMapType>
72
void PrefetchResultBlock(const SrcMapType& src,
73
                         const VectorMap<const std::int32_t, VectorShape::Col>&
74
                             lhs_sums_of_each_slice,
75
0
                         int src_row, int src_col) {
76
0
  const std::int32_t* src_data = src.data(src_row, src_col);
77
0
  const int src_stride = src.stride();
78
0
  const std::int32_t* lhs_sums_data = lhs_sums_of_each_slice.data(src_row);
79
0
  // Prefetch the Rows LHS sums, one hint per 4 entries.
  for (int r = 0; r < Rows; r += 4) {
80
0
    Prefetch(lhs_sums_data + r);
81
0
  }
82
0
  // Prefetch the source block column by column, one hint per 4 rows.
  for (int c = 0; c < Cols; c++) {
83
0
    for (int r = 0; r < Rows; r += 4) {
84
0
      Prefetch(src_data + r + c * src_stride);
85
0
    }
86
0
  }
87
0
}
Unexecuted instantiation: void gemmlowp::PrefetchResultBlock<8, 4, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, int, int)
Unexecuted instantiation: void gemmlowp::PrefetchResultBlock<8, 1, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, int, int)
Unexecuted instantiation: void gemmlowp::PrefetchResultBlock<8, 8, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, int, int)
88
89
// Unpacks one register-sized block of accumulators and hands it to the
// output pipeline executor.
//
// Steps, in order:
//   1. Load the accumulator block `acc` from `src` at (src_row, src_col).
//   2. Load the per-row LHS sums and per-column RHS sums for broadcasting.
//   3. Load lhs_offset / rhs_offset for broadcasting and add the kernel's
//      zero-point constants (ZeroPointInputValue<...>::kValue) to them.
//   4. BroadcastMulAdd(lhs_sums, rhs_offset, &acc).
//   5. Scale rhs_offset in place by `depth`.
//   6. BroadcastMulAdd(rhs_sums + rhs_offset * depth, lhs_offset, &acc).
//   7. executor.Execute(acc, dst, src_global_row, src_global_col,
//                       dst_row, dst_col).
//
// NOTE(review): assuming BroadcastMulAdd(a, b, &acc) computes acc += a * b,
// steps 4-6 amount to
//   acc += lhs_sums * rhs_offset + (rhs_sums + depth * rhs_offset) * lhs_offset
// -- confirm against the BroadcastMulAdd definition.
//
// Template parameters:
//   KernelFormat       - only used to obtain the Lhs/Rhs InputScalar and
//                        Scalar types for the zero-point constants.
//   RegisterBlockType  - shape/type of the block loaded from `src`.
//   SrcMapType, LhsOffset, RhsOffset, OutputPipelineExecutorType, DstType
//                      - deduced from the arguments.
//
// Parameters:
//   src                         - source of int32 accumulators.
//   executor                    - output pipeline executor for this shape.
//   dst                         - destination passed through to Execute().
//   lhs_sums_of_each_slice      - column-shaped vector, indexed by src_row.
//   rhs_sums_of_each_slice      - row-shaped vector, indexed by src_col.
//   lhs_offset, rhs_offset      - offsets, indexed by src_row / src_col.
//   depth                       - multiplier applied to rhs_offset (step 5).
//   src_row, src_col            - block position within `src`.
//   src_global_row/col          - forwarded unchanged to Execute().
//   dst_row, dst_col            - forwarded unchanged to Execute().
template <typename KernelFormat, typename RegisterBlockType,
90
          typename SrcMapType, typename LhsOffset, typename RhsOffset,
91
          typename OutputPipelineExecutorType, typename DstType>
92
void UnpackResultBlock(const SrcMapType& src,
93
                       const OutputPipelineExecutorType& executor, DstType* dst,
94
                       const VectorMap<const std::int32_t, VectorShape::Col>&
95
                           lhs_sums_of_each_slice,
96
                       const VectorMap<const std::int32_t, VectorShape::Row>&
97
                           rhs_sums_of_each_slice,
98
                       const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
99
                       int depth, int src_row, int src_col, int src_global_row,
100
0
                       int src_global_col, int dst_row, int dst_col) {
101
0
  using KernelLhsInputScalar = typename KernelFormat::Lhs::InputScalar;
102
0
  using KernelLhsScalar = typename KernelFormat::Lhs::Scalar;
103
0
  using KernelRhsInputScalar = typename KernelFormat::Rhs::InputScalar;
104
0
  using KernelRhsScalar = typename KernelFormat::Rhs::Scalar;
105
0
  // Compile-time constants added to the offsets below, one per side.
  static constexpr int KernelLhsZeroPointInput =
106
0
      ZeroPointInputValue<KernelLhsInputScalar, KernelLhsScalar>::kValue;
107
0
  static constexpr int KernelRhsZeroPointInput =
108
0
      ZeroPointInputValue<KernelRhsInputScalar, KernelRhsScalar>::kValue;
109
0
  auto acc = Load<RegisterBlockType>(src, src_row, src_col);
110
0
  // The sums are bound by const reference; if LoadForBroadcasting returns by
  // value, the temporary's lifetime is extended to the end of this scope.
  const auto& lhs_sums_of_each_slice_block =
111
0
      LoadForBroadcasting<RegisterBlockType>(lhs_sums_of_each_slice, src_row);
112
0
  const auto& rhs_sums_of_each_slice_block =
113
0
      LoadForBroadcasting<RegisterBlockType>(rhs_sums_of_each_slice, src_col);
114
0
  // The offsets are taken by value (non-const) because they are modified below.
  auto lhs_offset_block =
115
0
      LoadForBroadcasting<RegisterBlockType>(lhs_offset, src_row);
116
0
  auto rhs_offset_block =
117
0
      LoadForBroadcasting<RegisterBlockType>(rhs_offset, src_col);
118
0
  AddConstant<KernelLhsZeroPointInput>(&lhs_offset_block);
119
0
  AddConstant<KernelRhsZeroPointInput>(&rhs_offset_block);
120
0
  // Uses the *unscaled* rhs_offset_block; must run before the loop below.
  BroadcastMulAdd(lhs_sums_of_each_slice_block, rhs_offset_block, &acc);
121
0
  // Scale every register of rhs_offset_block by depth, in place.
  for (int i = 0; i < decltype(rhs_offset_block)::kRegisterCount; i++) {
122
0
    rhs_offset_block.buf.reg[i] = Mul(rhs_offset_block.buf.reg[i], depth);
123
0
  }
124
0
  // Uses the depth-scaled rhs_offset_block.
  BroadcastMulAdd(BroadcastAdd(rhs_sums_of_each_slice_block, rhs_offset_block),
125
0
                  lhs_offset_block, &acc);
126
0
  executor.Execute(acc, dst, src_global_row, src_global_col, dst_row, dst_col);
127
0
}
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int)
128
129
// Unpacks a whole packed result block into the destination matrix.
//
// The block described by `dst_block` is tiled with register blocks and each
// tile is processed by UnpackResultBlock with the matching
// OutputPipelineExecutor. Tiles are chosen greedily:
//   - columns: groups of 4, then single leftover columns;
//   - rows within each column group: groups of 8, then 4, then single rows.
// When ResultBlockType::kOrder is RowMajor there is an additional leading
// pass over groups of 8 columns, in which each 8x8 tile is first unpacked
// into a local column-major buffer and then written with StoreFinalOutput.
//
// Coordinates: r / c are positions inside the block (used to index `src` and
// the sums vectors); global_row / global_col add dst_block.start_row /
// start_col and are what is passed on as destination coordinates.
//
// Parameters:
//   dst                          - destination matrix (rows()/cols() are
//                                  checked against dst_block via assert).
//   dst_block                    - bounds of the destination region.
//   src                          - packed result; src.Map() supplies the
//                                  accumulators.
//   depth                        - forwarded to UnpackResultBlock.
//   lhs_sums_of_each_slice_ptr   - dst_block.rows int32 values.
//   rhs_sums_of_each_slice_ptr   - dst_block.cols int32 values.
//   lhs_offset, rhs_offset       - forwarded to UnpackResultBlock.
//   output_pipeline              - used to construct one executor per
//                                  register-block shape.
template <typename KernelFormat, typename ResultBlockType,
130
          typename PackedResultType, typename LhsOffset, typename RhsOffset,
131
          typename OutputPipelineType>
132
void UnpackResult(ResultBlockType* dst, const MatrixBlockBounds& dst_block,
133
                  const PackedResultType& src, int depth,
134
                  const std::int32_t* lhs_sums_of_each_slice_ptr,
135
                  const std::int32_t* rhs_sums_of_each_slice_ptr,
136
                  const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
137
0
                  const OutputPipelineType& output_pipeline) {
138
0
  ScopedProfilingLabel label(ResultBlockType::kOrder == MapOrder::ColMajor
139
0
                                 ? "unpack to column-major"
140
0
                                 : "unpack to row-major");
141
0
  // The destination block must lie entirely inside dst.
  assert(dst_block.start_row >= 0);
142
0
  assert(dst_block.start_row + dst_block.rows <= dst->rows());
143
0
  assert(dst_block.start_col >= 0);
144
0
  assert(dst_block.start_col + dst_block.cols <= dst->cols());
145
0
  const auto src_map = src.Map();
146
0
  // Wrap the raw sums pointers: one entry per block row / per block column.
  const VectorMap<const std::int32_t, VectorShape::Col> lhs_sums_of_each_slice(
147
0
      lhs_sums_of_each_slice_ptr, dst_block.rows);
148
0
  const VectorMap<const std::int32_t, VectorShape::Row> rhs_sums_of_each_slice(
149
0
      rhs_sums_of_each_slice_ptr, dst_block.cols);
150
0
  // Register-block shapes used for tiling, named Int32x<rows>x<cols>.
  using Int32x1x1 = RegisterBlock<std::int32_t, 1, 1>;
151
0
  using Int32x4x1 = RegisterBlock<std::int32_t, 4, 1>;
152
0
  using Int32x8x1 = RegisterBlock<std::int32_t, 8, 1>;
153
0
  using Int32x1x4 = RegisterBlock<std::int32_t, 1, 4>;
154
0
  using Int32x4x4 = RegisterBlock<std::int32_t, 4, 4>;
155
0
  using Int32x8x4 = RegisterBlock<std::int32_t, 8, 4>;
156
157
0
  using DstScalarType = typename ResultBlockType::Scalar;
158
0
  using DstScalarx8x8 = RegisterBlock<DstScalarType, 8, 8>;
159
160
0
  // One executor per register-block shape, all built from the same pipeline.
  OutputPipelineExecutor<OutputPipelineType, Int32x1x1>
161
0
      output_pipeline_executor_1x1(output_pipeline);
162
0
  OutputPipelineExecutor<OutputPipelineType, Int32x4x1>
163
0
      output_pipeline_executor_4x1(output_pipeline);
164
0
  OutputPipelineExecutor<OutputPipelineType, Int32x8x1>
165
0
      output_pipeline_executor_8x1(output_pipeline);
166
0
  OutputPipelineExecutor<OutputPipelineType, Int32x1x4>
167
0
      output_pipeline_executor_1x4(output_pipeline);
168
0
  OutputPipelineExecutor<OutputPipelineType, Int32x4x4>
169
0
      output_pipeline_executor_4x4(output_pipeline);
170
0
  OutputPipelineExecutor<OutputPipelineType, Int32x8x4>
171
0
      output_pipeline_executor_8x4(output_pipeline);
172
173
0
  // c8 counts the columns consumed by the 8-column pass below; it stays 0
  // when the destination is not row-major, so the later loops then cover
  // every column.
  int c8 = 0;
174
0
  if (ResultBlockType::kOrder == MapOrder::RowMajor) {
175
0
    for (; c8 <= dst_block.cols - 8; c8 += 8) {
176
0
      PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, 0, c8);
177
0
      int r = 0;
178
0
      // Full 8x8 tiles: two 8x4 halves are unpacked into a local 8x8
      // column-major buffer, which is then stored to dst in one call.
      for (; r <= dst_block.rows - 8; r += 8) {
179
0
        const int global_row = r + dst_block.start_row;
180
0
        // Hint the next 8-row tile before working on the current one.
        PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, r + 8, c8);
181
0
        DstScalarType dst_colmajor_buf[64];
182
0
        MatrixMap<DstScalarType, MapOrder::ColMajor> dst_colmajor_map(
183
0
            dst_colmajor_buf, 8, 8);
184
0
        for (int cx = 0; cx < 8; cx += 4) {
185
0
          const int c = c8 + cx;
186
0
          const int global_col = c + dst_block.start_col;
187
0
          // Destination here is the local buffer, at row 0 / column cx.
          UnpackResultBlock<KernelFormat, Int32x8x4>(
188
0
              src_map, output_pipeline_executor_8x4, &dst_colmajor_map,
189
0
              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
190
0
              rhs_offset, depth, r, c, global_row, global_col, 0, cx);
191
0
        }
192
0
        StoreFinalOutput(LoadContiguous<DstScalarx8x8>(dst_colmajor_buf), dst,
193
0
                         r + dst_block.start_row, c8 + dst_block.start_col);
194
0
      }
195
0
      // Remaining rows in groups of 4: two 4x4 tiles written straight to dst.
      for (; r <= dst_block.rows - 4; r += 4) {
196
0
        const int global_row = r + dst_block.start_row;
197
0
        for (int cx = 0; cx < 8; cx += 4) {
198
0
          const int c = c8 + cx;
199
0
          const int global_col = c + dst_block.start_col;
200
0
          UnpackResultBlock<KernelFormat, Int32x4x4>(
201
0
              src_map, output_pipeline_executor_4x4, dst,
202
0
              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
203
0
              rhs_offset, depth, r, c, global_row, global_col, global_row,
204
0
              global_col);
205
0
        }
206
0
      }
207
0
      // Leftover single rows: two 1x4 tiles each.
      for (; r < dst_block.rows; r++) {
208
0
        const int global_row = r + dst_block.start_row;
209
0
        for (int cx = 0; cx < 8; cx += 4) {
210
0
          const int c = c8 + cx;
211
0
          const int global_col = c + dst_block.start_col;
212
0
          UnpackResultBlock<KernelFormat, Int32x1x4>(
213
0
              src_map, output_pipeline_executor_1x4, dst,
214
0
              lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset,
215
0
              rhs_offset, depth, r, c, global_row, global_col, global_row,
216
0
              global_col);
217
0
        }
218
0
      }
219
0
    }
220
0
  }
221
0
  // Continue from wherever the 8-column pass stopped, 4 columns at a time.
  int c = c8;
222
0
  for (; c <= dst_block.cols - 4; c += 4) {
223
0
    const int global_col = c + dst_block.start_col;
224
0
    PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, 0, c);
225
0
    int r = 0;
226
0
    // Rows in groups of 8 (8x4 tiles), prefetching the next tile first.
    for (; r <= dst_block.rows - 8; r += 8) {
227
0
      const int global_row = r + dst_block.start_row;
228
0
      PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, r + 8, c);
229
0
      UnpackResultBlock<KernelFormat, Int32x8x4>(
230
0
          src_map, output_pipeline_executor_8x4, dst, lhs_sums_of_each_slice,
231
0
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
232
0
          global_row, global_col, global_row, global_col);
233
0
    }
234
0
    // Then groups of 4 rows (4x4 tiles).
    for (; r <= dst_block.rows - 4; r += 4) {
235
0
      const int global_row = r + dst_block.start_row;
236
0
      UnpackResultBlock<KernelFormat, Int32x4x4>(
237
0
          src_map, output_pipeline_executor_4x4, dst, lhs_sums_of_each_slice,
238
0
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
239
0
          global_row, global_col, global_row, global_col);
240
0
    }
241
0
    // Then leftover single rows (1x4 tiles).
    for (; r < dst_block.rows; r++) {
242
0
      const int global_row = r + dst_block.start_row;
243
0
      UnpackResultBlock<KernelFormat, Int32x1x4>(
244
0
          src_map, output_pipeline_executor_1x4, dst, lhs_sums_of_each_slice,
245
0
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
246
0
          global_row, global_col, global_row, global_col);
247
0
    }
248
0
  }
249
0
  // Leftover columns (fewer than 4 remain), one column at a time.
  for (; c < dst_block.cols; c++) {
250
0
    const int global_col = c + dst_block.start_col;
251
0
    PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, 0, c);
252
0
    int r = 0;
253
0
    // Rows in groups of 8 (8x1 tiles), prefetching the next tile first.
    for (; r <= dst_block.rows - 8; r += 8) {
254
0
      const int global_row = r + dst_block.start_row;
255
0
      PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, r + 8, c);
256
0
      UnpackResultBlock<KernelFormat, Int32x8x1>(
257
0
          src_map, output_pipeline_executor_8x1, dst, lhs_sums_of_each_slice,
258
0
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
259
0
          global_row, global_col, global_row, global_col);
260
0
    }
261
0
    // Then groups of 4 rows (4x1 tiles).
    for (; r <= dst_block.rows - 4; r += 4) {
262
0
      const int global_row = r + dst_block.start_row;
263
0
      UnpackResultBlock<KernelFormat, Int32x4x1>(
264
0
          src_map, output_pipeline_executor_4x1, dst, lhs_sums_of_each_slice,
265
0
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
266
0
          global_row, global_col, global_row, global_col);
267
0
    }
268
0
    // Finally single entries (1x1 tiles).
    for (; r < dst_block.rows; r++) {
269
0
      const int global_row = r + dst_block.start_row;
270
0
      UnpackResultBlock<KernelFormat, Int32x1x1>(
271
0
          src_map, output_pipeline_executor_1x1, dst, lhs_sums_of_each_slice,
272
0
          rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c,
273
0
          global_row, global_col, global_row, global_col);
274
0
    }
275
0
  }
276
0
}
Unexecuted instantiation: void gemmlowp::UnpackResult<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>, gemmlowp::PackedResult, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, std::__1::tuple<> >(gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::MatrixBlockBounds const&, gemmlowp::PackedResult const&, int, int const*, int const*, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, std::__1::tuple<> const&)
Unexecuted instantiation: void gemmlowp::UnpackResult<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>, gemmlowp::PackedResult, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, std::__1::tuple<> >(gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::MatrixBlockBounds const&, gemmlowp::PackedResult const&, int, int const*, int const*, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, std::__1::tuple<> const&)
277
278
}  // end namespace gemmlowp
279
280
#endif  // GEMMLOWP_INTERNAL_UNPACK_H_