Coverage Report

Created: 2024-05-04 12:45

/proc/self/cwd/external/gemmlowp/internal/pack.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License");
4
// you may not use this file except in compliance with the License.
5
// You may obtain a copy of the License at
6
//
7
//     http://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
// See the License for the specific language governing permissions and
13
// limitations under the License.
14
15
// pack.h: packing blocks of the LHS and RHS into the data layout
16
// that is expected by compute.h and eventually by kernels.
17
// Because this data layout depends on the kernel format, code here
18
// is templated in KernelLhsFormat/KernelRhsFormat.
19
//
20
// Readers note: an important theme around here is that we try hard
21
// to handle both Lhs and Rhs with a single piece of code. We indifferently
22
// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
23
// by (row, column) indices, we address them by (width, depth), as explained
24
// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
25
// at once.
26
27
#ifndef GEMMLOWP_INTERNAL_PACK_H_
28
#define GEMMLOWP_INTERNAL_PACK_H_
29
30
#include <cstring>
31
32
#include "allocator.h"
33
#include "block_params.h"
34
#include "common.h"
35
#include "kernel.h"
36
37
namespace gemmlowp {
38
39
// A PackedSideBlock instance is a packed block of either the LHS or RHS
40
// (whence the generic 'Side' name).
41
//
42
// 'Packed' means that it is laid out in the storage order that
43
// is expected by the specified kernel format. From a block of the input
44
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
45
// or PackRhs().
46
template <typename tKernelSideFormat>
47
class PackedSideBlock {
48
 public:
49
  typedef tKernelSideFormat KernelSideFormat;
50
51
  PackedSideBlock(Side side, Allocator* allocator,
52
                  const BlockParams& block_params)
53
0
      : allocator_(allocator), pos_(0) {
54
0
    GetSideBlockParams(side, &params_, block_params);
55
0
    data_handle_ =
56
0
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
57
0
    sums_of_each_slice_handle_ =
58
0
        allocator_->Reserve<std::int32_t>(params_.l2_width);
59
0
  }
60
61
0
  ~PackedSideBlock() {}
62
63
0
  void seek_run(int start_width, int start_depth) const {
64
0
    int kernel_run_depth =
65
0
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
66
0
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
67
0
  }
68
69
  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }
70
71
0
  void seek_forward_n_cells(int n) const {
72
0
    pos_ += n * KernelSideFormat::Cell::kSize;
73
0
  }
74
75
  // TODO(suharshs): The datatype can now be int8 as well. We could introduce a
76
  // new int8 current_data impl as well. This change would propagate to all pack
77
  // impls and the Kernel::Run API, which all assume uint8. For now we leave
78
  // this as-is pending future refactor.
79
0
  const std::uint8_t* current_data() const {
80
0
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
81
0
  }
82
83
0
  std::uint8_t* current_data() {
84
0
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
85
0
  }
86
87
0
  std::int32_t* sums_of_each_slice() {
88
0
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
89
0
  }
90
91
0
  const std::int32_t* sums_of_each_slice() const {
92
0
    return allocator_->GetPointer<const std::int32_t>(
93
0
        sums_of_each_slice_handle_);
94
0
  }
95
96
0
  const SideBlockParams& params() const { return params_; }
97
98
 private:
99
  // The block size parameters that this PackedSideBlock follows.
100
  // The L2 parameters determine its overall size, while the L1 parameters,
101
  // together with the kernel format template parameter, determine
102
  // the fine details of the storage/traversal order.
103
  SideBlockParams params_;
104
105
  // Pointer to the allocator provided by the caller. Not owned.
106
  // The Allocator is assumed to outlive the PackedSideBlock.
107
  Allocator* const allocator_;
108
109
  // Handle on the buffer backing this packed block. Owned.
110
  Allocator::Handle data_handle_;
111
112
  // Handle on the additional buffer backing the vector of sums of slices
113
  // associated with this block. Owned.
114
  Allocator::Handle sums_of_each_slice_handle_;
115
116
  // pos_ is the current position in the buffer, which we access
117
  // sequentially, like a file.
118
  // The idea is that we pack data in the same order as it is
119
  // going to be traversed during the computation, which for
120
  // cache-friendliness reasons is complicated to random-access,
121
  // as the offsets calculations would be intricate. So we
122
  // give up random-access addressing, and instead content ourselves
123
  // with sequential access.
124
  //
125
  // pos_ is mutable because during the computation we will want to
126
  // be able to iterate on the data in a const PackedSideBlock.
127
  mutable int pos_;
128
};
129
130
// WidthMajor and DepthMajor are custom phrases modelled after the
131
// standard terminology 'row-major' and 'column-major'. Their meaning
132
// should be transparent once one has read the explanation in kernel.h:
133
// for example, in the Lhs, the 'width' dimension is the rows dimension,
134
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
135
// Another way to put it: WidthMajor means that contiguous storage is used
136
// for entries having the same 'width' index.
137
enum class SideMapOrder { WidthMajor, DepthMajor };
138
139
// Similar to MatrixMap from map.h, but in terms of width/depth instead of
140
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
141
// packing them.
142
template <typename tScalar, SideMapOrder tOrder>
143
class SideMap {
144
 public:
145
  typedef tScalar Scalar;
146
  static constexpr SideMapOrder kOrder = tOrder;
147
148
  SideMap(Scalar* data, int width, int depth, int stride)
149
0
      : data_(data), width_(width), depth_(depth), stride_(stride) {}
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::SideMap(unsigned char const*, int, int, int)
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::SideMap(unsigned char const*, int, int, int)
150
151
  SideMap(Scalar* data, int width, int depth)
152
0
      : data_(data), width_(width), depth_(depth) {
153
0
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
154
0
  }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::SideMap(unsigned char const*, int, int)
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::SideMap(unsigned char const*, int, int)
155
156
  SideMap(const SideMap& other) = default;
157
  SideMap& operator=(const SideMap& other) = default;
158
159
0
  int width() const { return width_; }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::width() const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::width() const
160
0
  int depth() const { return depth_; }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::depth() const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::depth() const
161
  int stride() const { return stride_; }
162
0
  int width_stride() const {
163
0
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
164
0
  }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::width_stride() const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::width_stride() const
165
0
  int depth_stride() const {
166
0
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
167
0
  }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::depth_stride() const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::depth_stride() const
168
  Scalar* data() const { return data_; }
169
0
  Scalar* data(int w, int d) const {
170
0
    return data_ + w * width_stride() + d * depth_stride();
171
0
  }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::data(int, int) const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::data(int, int) const
172
0
  Scalar operator()(int w, int d) const { return *data(w, d); }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::operator()(int, int) const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::operator()(int, int) const
173
  Scalar& operator()(int w, int d) { return *data(w, d); }
174
175
  SideMap block(int start_width, int start_depth, int block_width,
176
0
                int block_depth) const {
177
0
    assert(start_width >= 0);
178
0
    assert(start_width + block_width <= width_);
179
0
    assert(start_depth >= 0);
180
0
    assert(start_depth + block_depth <= depth_);
181
182
0
    return SideMap(data(start_width, start_depth), block_width, block_depth,
183
0
                   stride_);
184
0
  }
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>::block(int, int, int, int) const
Unexecuted instantiation: gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>::block(int, int, int, int) const
185
186
 private:
187
  Scalar* data_;  // not owned.
188
  int width_, depth_, stride_;
189
};
190
191
// A PackingRegisterBlock is a small fixed-size block of a matrix being
192
// packed. This class is the generic non-optimized implementation,
193
// it is inherited by the generic implementation of PackingRegisterBlock,
194
// which may be overridden by template specialization. Overriding it is how
195
// one may provide optimized packing code paths.
196
//
197
// The packing of a block proceeds in two steps:
198
//   1. Ensuring that we have a complete block of source data, i.e. a block of
199
//      the compile-time prescribed size. This is where we handle unaligned
200
//      boundaries: if we don't have a complete block of source data, then
201
//      we copy and zero-extend it into a local temporary (complete_src_),
202
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
203
//      so we just use it in-place, see UseCompleteSrcInPlace.
204
//   2. Packing a complete block into the destination, see Pack. This is the
205
//      most critical part, so it's convenient that unaligned boundaries have
206
//      already been handled in step 1.
207
template <typename SrcMapType, typename PackedSideBlock>
208
class PackingRegisterBlockBase {
209
 public:
210
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
211
  typedef typename KernelSideFormat::Cell CellFormat;
212
  typedef typename KernelSideFormat::InputScalar KernelInputScalar;
213
  typedef typename KernelSideFormat::Scalar KernelScalar;
214
  static constexpr int kCells = KernelSideFormat::kCells;
215
  static constexpr int kCellWidth = CellFormat::kWidth;
216
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
217
  static constexpr int kCellDepth = CellFormat::kDepth;
218
  static constexpr int kCellSize = CellFormat::kSize;
219
  static constexpr SideMapOrder kSrcOrder = SrcMapType::kOrder;
220
  static constexpr int kZeroPointInputValue =
221
      ZeroPointInputValue<KernelInputScalar, KernelScalar>::kValue;
222
223
0
  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackingRegisterBlockBase()
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackingRegisterBlockBase()
224
225
 protected:
226
  // The source data that's ready for packing. May point to
227
  // in-place actual source data if it's already a complete block,
228
  // (see UseCompleteSrcInPlace)
229
  // or to the local buf_ below into which we copy incomplete blocks
230
  // (see MakeCompleteSrc)
231
  SrcMapType complete_src_;
232
233
  // Temporary buffer for loading incomplete blocks to,
234
  // in the source storage order
235
  std::uint8_t buf_[kKernelWidth * kRegisterSize];
236
237
 public:
238
  // Selects a block of in-place source data that's already a complete block.
239
0
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::UseCompleteSrcInPlace(gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1> const&)
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::UseCompleteSrcInPlace(gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0> const&)
240
  // Copies an incomplete block of source data into a local temporary
241
  // complete block by zero-extending it.
242
0
  void MakeCompleteSrc(const SrcMapType& src) {
243
0
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
244
0
    if (kSrcOrder == SideMapOrder::WidthMajor) {
245
0
      for (int w = 0; w < src.width(); w++) {
246
0
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
247
0
      }
248
0
    } else {
249
0
      assert(kSrcOrder == SideMapOrder::DepthMajor);
250
0
      for (int d = 0; d < src.depth(); d++) {
251
0
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
252
0
      }
253
0
    }
254
255
    // Since the KernelInputScalar type may not be uint8, we need to cast buf_.
256
0
    complete_src_ = SrcMapType(reinterpret_cast<KernelInputScalar*>(buf_),
257
0
                               kKernelWidth, kRegisterSize);
258
0
  }
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::MakeCompleteSrc(gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1> const&)
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::MakeCompleteSrc(gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0> const&)
259
  // Packs a complete block into the destination. This is the most
260
  // critical part and the part that we most typically want to
261
  // override in architecture-specific optimized specializations.
262
0
  void Pack(PackedSideBlock* dst, int start_width) {
263
0
    std::uint8_t* dst_ptr = dst->current_data();
264
0
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
265
0
         cell_start_depth += kCellDepth) {
266
0
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
267
0
           cell_start_width += kCellWidth) {
268
0
        std::int32_t* cell_sums_of_each_slice_ptr =
269
0
            dst->sums_of_each_slice() + start_width + cell_start_width;
270
0
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
271
0
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
272
0
                                kCellDepth));
273
0
        for (int w = 0; w < kCellWidth; w++) {
274
0
          std::int32_t sum = 0;
275
0
          for (int d = 0; d < kCellDepth; d++) {
276
0
            const std::uint8_t src_val = src_cell_map(w, d);
277
0
            const std::int16_t kernel_val_unwrapped =
278
0
                src_val - kZeroPointInputValue;
279
0
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
280
0
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
281
0
            sum += kernel_val_unwrapped;
282
0
          }
283
0
          cell_sums_of_each_slice_ptr[w] += sum;
284
0
        }
285
0
        dst_ptr += kCellSize;
286
0
      }
287
0
    }
288
0
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
289
0
  }
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::Pack(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, int)
Unexecuted instantiation: gemmlowp::PackingRegisterBlockBase<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::Pack(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, int)
290
};
291
292
template <typename SrcMapType, typename PackedSideBlock>
293
class PackingRegisterBlock
294
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};
295
296
// Large-scale implementation of packing.
297
template <typename SrcMapType, typename PackedSideBlock>
298
class PackSideBlockImpl {
299
 public:
300
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
301
  typedef typename KernelSideFormat::Cell CellFormat;
302
  static constexpr int kCells = KernelSideFormat::kCells;
303
  static constexpr int kCellWidth = CellFormat::kWidth;
304
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
305
  static constexpr int kCellDepth = CellFormat::kDepth;
306
307
  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
308
      PackingRegisterBlockType;
309
310
  PackSideBlockImpl(PackedSideBlock* packed_side_block,
311
                    const SrcMapType& src_map)
312
0
      : packed_side_block_(packed_side_block), src_map_(src_map) {}
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackSideBlockImpl(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1> const&)
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackSideBlockImpl(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0> const&)
313
314
  PackedSideBlock* packed_side_block() const { return packed_side_block_; }
315
316
  const SrcMapType& src_map() const { return src_map_; }
317
318
  // The public entry point to pack a block.
319
0
  void PackL2() {
320
0
    memset(packed_side_block_->sums_of_each_slice(), 0,
321
0
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
322
0
    for (int d = 0; d < src_map_.depth();
323
0
         d += packed_side_block_->params().l1_depth) {
324
0
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
325
0
                             src_map_.depth() - d);
326
327
0
      for (int w = 0; w < src_map_.width();
328
0
           w += packed_side_block_->params().l1_width) {
329
0
        int ws = std::min<int>(packed_side_block_->params().l1_width,
330
0
                               src_map_.width() - w);
331
332
0
        PrefetchL1(w, ws, d, ds);
333
0
        PackL1(w, ws, d, ds);
334
0
      }
335
0
    }
336
0
  }
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackL2()
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackL2()
337
338
 protected:
339
  // The intermediate-level loops, between PackL2 and PackRun.
340
0
  void PackL1(int start_width, int width, int start_depth, int depth) {
341
0
    for (int w = 0; w < width; w += kKernelWidth) {
342
0
      int ws = std::min(+kKernelWidth, width - w);
343
0
      packed_side_block_->seek_run(start_width + w, start_depth);
344
0
      PackRun(start_width + w, ws, start_depth, depth);
345
0
    }
346
0
  }
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackL1(int, int, int, int)
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackL1(int, int, int, int)
347
348
  // Prefetches the data that will be read by PackL1.
349
0
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
350
0
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
351
0
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
352
0
        for (int w = 0; w < width; w += 1) {
353
0
          Prefetch(src_map_.data(start_width + w, start_depth + d));
354
0
        }
355
0
      }
356
0
    } else {
357
0
      for (int d = 0; d < depth; d++) {
358
0
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
359
0
          Prefetch(src_map_.data(start_width + w, start_depth + d));
360
0
        }
361
0
      }
362
0
    }
363
0
  }
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PrefetchL1(int, int, int, int)
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PrefetchL1(int, int, int, int)
364
365
  // PackRun packs only a run i.e. is the inner loop in the depth dimension.
366
0
  void PackRun(int start_width, int width, int start_depth, int depth) {
367
0
    PackingRegisterBlockType b;
368
0
    if (width == kKernelWidth) {
369
0
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
370
0
      if (register_aligned_depth) {
371
0
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
372
0
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
373
0
                                                 width, kRegisterSize));
374
0
          b.Pack(packed_side_block_, start_width);
375
0
        }
376
0
      }
377
0
      if (register_aligned_depth < depth) {
378
0
        b.MakeCompleteSrc(
379
0
            src_map_.block(start_width, start_depth + register_aligned_depth,
380
0
                           width, depth - register_aligned_depth));
381
0
        b.Pack(packed_side_block_, start_width);
382
0
      }
383
0
    } else {
384
0
      assert(width < kKernelWidth);
385
0
      for (int d = 0; d < depth; d += kRegisterSize) {
386
0
        const int ds = std::min(+kRegisterSize, depth - d);
387
0
        b.MakeCompleteSrc(
388
0
            src_map_.block(start_width, start_depth + d, width, ds));
389
0
        b.Pack(packed_side_block_, start_width);
390
0
      }
391
0
    }
392
0
  }
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)1>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackRun(int, int, int, int)
Unexecuted instantiation: gemmlowp::PackSideBlockImpl<gemmlowp::SideMap<unsigned char const, (gemmlowp::SideMapOrder)0>, gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> > >::PackRun(int, int, int, int)
393
394
  // The PackedSideBlock being packed, i.e. the 'destination'.
395
  PackedSideBlock* const packed_side_block_;
396
397
  // A map on the block of the original matrix block being packed,
398
  // i.e. the 'source'.
399
  const SrcMapType& src_map_;
400
};
401
402
// Packs a block of the input LHS matrix, into a PackedSideBlock.
403
template <typename PackedSideBlock, typename MatrixMapType>
404
0
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
405
0
  ScopedProfilingLabel label("pack LHS");
406
0
  static const SideMapOrder kSideMapOrder =
407
0
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
408
0
                                                  : SideMapOrder::DepthMajor;
409
0
  typedef typename MatrixMapType::Scalar Scalar;
410
0
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
411
0
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
412
0
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
413
0
  ImplType impl(dst, src_side_map);
414
0
  impl.PackL2();
415
0
}
Unexecuted instantiation: void gemmlowp::PackLhs<gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)0> >(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)0> const&)
Unexecuted instantiation: void gemmlowp::PackLhs<gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)1> >(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)1> const&)
416
417
// Packs a block of the input RHS matrix, into a PackedSideBlock.
418
template <typename PackedSideBlock, typename MatrixMapType>
419
0
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
420
0
  ScopedProfilingLabel label("pack RHS");
421
0
  static const SideMapOrder kSideMapOrder =
422
0
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
423
0
                                                  : SideMapOrder::DepthMajor;
424
0
  typedef typename MatrixMapType::Scalar Scalar;
425
0
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
426
0
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
427
0
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
428
0
  ImplType impl(dst, src_side_map);
429
0
  impl.PackL2();
430
0
}
Unexecuted instantiation: void gemmlowp::PackRhs<gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)0> >(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)0> const&)
Unexecuted instantiation: void gemmlowp::PackRhs<gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)1> >(gemmlowp::PackedSideBlock<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >*, gemmlowp::MatrixMap<unsigned char const, (gemmlowp::MapOrder)1> const&)
431
432
}  // namespace gemmlowp
433
434
#ifdef GEMMLOWP_NEON
435
#include "pack_neon.h"
436
#elif defined(GEMMLOWP_SSE4)
437
#include "pack_sse.h"
438
#elif defined(GEMMLOWP_AVX2)
439
#include "pack_avx.h"
440
#elif defined(GEMMLOWP_MSA)
441
#include "pack_msa.h"
442
#endif
443
444
#endif  // GEMMLOWP_INTERNAL_PACK_H_