// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Readers note: an important theme around here is that we try hard
// to handle both Lhs and Rhs with a single piece of code. We indifferently
// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
// at once.
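//
// For example (an illustrative sketch, not code from this file): for a
// 64x32 Lhs block, 'width' is the rows dimension, so (width, depth) =
// (64, 32); for a 32x48 Rhs block, 'width' is the columns dimension, so
// (width, depth) = (48, 32). In both cases, 'depth' is the dimension along
// which the GEMM accumulates, which is what lets a single piece of code
// pack either side.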

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator), pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  // TODO(suharshs): The datatype can now be int8 as well. We could introduce a
  // new int8 current_data impl as well. This change would propagate to all
  // pack impls and the Kernel::Run API, which all assume uint8. For now we
  // leave this as-is pending future refactor.
  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. For cache-friendliness
  // reasons, that order is complicated, so random access would require
  // intricate offset calculations. We therefore give up random-access
  // addressing and instead content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
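
// A minimal sketch of how the sequential-access API above is meant to be
// used (a hypothetical traversal loop; the real consumer is compute.h, and
// KernelSideFormat::kWidth is assumed as defined in kernel.h):
//
//   template <typename KernelSideFormat>
//   void TraverseRuns(const PackedSideBlock<KernelSideFormat>& block) {
//     const SideBlockParams& params = block.params();
//     for (int d = 0; d < params.l2_depth; d += params.l1_depth) {
//       for (int w = 0; w < params.l2_width; w += KernelSideFormat::kWidth) {
//         block.seek_run(w, d);  // rewind pos_ to the start of this run
//         const std::uint8_t* cell_data = block.current_data();
//         // ... feed cell_data to the kernel ...
//         block.seek_next_cell();  // then advance strictly sequentially
//       }
//     }
//   }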

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };
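
// For example: a RowMajor Lhs is WidthMajor (rows are the Lhs 'width'
// dimension, and each row is stored contiguously), while a RowMajor Rhs is
// DepthMajor (rows are the Rhs 'depth' dimension). PackLhs() and PackRhs()
// at the bottom of this file perform exactly this translation from
// MapOrder to SideMapOrder.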

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static constexpr SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other) = default;
  SideMap& operator=(const SideMap& other) = default;

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
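
// A small worked example of the addressing above, with hypothetical sizes:
// for a WidthMajor SideMap with width = 4, depth = 8, stride = 8,
// width_stride() is 8 and depth_stride() is 1, so data(w, d) is
// data_ + 8 * w + d and entry (2, 3) lives at offset 19. For a DepthMajor
// SideMap of the same shape with stride = 4, data(w, d) is
// data_ + w + 4 * d, so entry (2, 3) lives at offset 14.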

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class, PackingRegisterBlockBase, is the generic,
// non-optimized implementation. It is inherited by the generic
// implementation of PackingRegisterBlock, which may be overridden by
// template specialization; providing such specializations is how one
// adds optimized packing code paths.
//
// The packing of a block proceeds in two steps:
// 1. Ensuring that we have a complete block of source data, i.e. a block of
// the compile-time prescribed size. This is where we handle unaligned
// boundaries: if we don't have a complete block of source data, then
// we copy and zero-extend it into a local temporary (complete_src_),
// see MakeCompleteSrc. In the generic case, we do have a complete block,
// so we just use it in-place, see UseCompleteSrcInPlace.
// 2. Packing a complete block into the destination, see Pack. This is the
// most critical part, so it's convenient that unaligned boundaries have
// already been handled in step 1.
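//
// For instance (hypothetical sizes): with kKernelWidth = 4 and
// kRegisterSize = 16, a 3x10 source block at a matrix boundary is
// incomplete; MakeCompleteSrc copies its 3x10 entries into buf_ and fills
// the rest with kZeroPointInputValue, yielding a complete 4x16 block whose
// padding contributes exactly zero to the kernel's accumulators after the
// zero-point subtraction performed in Pack below.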
template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  typedef typename KernelSideFormat::InputScalar KernelInputScalar;
  typedef typename KernelSideFormat::Scalar KernelScalar;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;
  static constexpr int kCellSize = CellFormat::kSize;
  static constexpr SideMapOrder kSrcOrder = SrcMapType::kOrder;
  static constexpr int kZeroPointInputValue =
      ZeroPointInputValue<KernelInputScalar, KernelScalar>::kValue;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point to
  // in-place actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace), or to the local buf_ below into
  // which we copy incomplete blocks (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer for loading incomplete blocks to,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, kZeroPointInputValue, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }

    // Since the KernelInputScalar type may not be uint8, we need to cast buf_.
    complete_src_ = SrcMapType(reinterpret_cast<KernelInputScalar*>(buf_),
                               kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t src_val = src_cell_map(w, d);
            const std::int16_t kernel_val_unwrapped =
                src_val - kZeroPointInputValue;
            const std::uint8_t kernel_val_uint8 = kernel_val_unwrapped;
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = kernel_val_uint8;
            sum += kernel_val_unwrapped;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
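
// To illustrate Pack() on a hypothetical format: with CellFormat<4, 2>
// cells, kCells = 2 (so kKernelWidth = 8) and kRegisterSize = 8, a complete
// 8x8 source block is emitted as a sequence of 4x2 cells, advancing
// width-major within each depth stripe:
//
//   depth 0..1: cell(widths 0..3), cell(widths 4..7)
//   depth 2..3: cell(widths 0..3), cell(widths 4..7)
//   ...
//
// Along the way, sums_of_each_slice()[w] accumulates the sum of all packed
// (zero-point-adjusted) values sharing the width index w; these sums are
// later used when applying the quantization offsets (see unpack.h).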

template <typename SrcMapType, typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<SrcMapType, PackedSideBlock> {};

// Large-scale implementation of packing.
template <typename SrcMapType, typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static constexpr int kCells = KernelSideFormat::kCells;
  static constexpr int kCellWidth = CellFormat::kWidth;
  static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
  static constexpr int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs only a run, i.e. it is the inner loop in the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;
};
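
// To summarize the loop hierarchy above, from outermost to innermost (with
// block sizes named as in block_params.h):
//
//   PackL2()  : the whole l2_width x l2_depth block; zeroes the sums.
//   PackL1()  : one l1_width x l1_depth sub-block, cut into runs of
//               kKernelWidth width.
//   PackRun() : one run; cuts the depth dimension into register blocks of
//               kRegisterSize depth, zero-extending the last one if needed.
//   Pack()    : one kKernelWidth x kRegisterSize register block.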

// Packs a block of the input LHS matrix, into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}

// Packs a block of the input RHS matrix, into a PackedSideBlock.
template <typename PackedSideBlock, typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef PackSideBlockImpl<SideMapType, PackedSideBlock> ImplType;
  ImplType impl(dst, src_side_map);
  impl.PackL2();
}
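
// A minimal end-to-end usage sketch, with hypothetical sizes and variable
// names (lhs_data, rows, depth); the real driver code lives in
// single_thread_gemm.h:
//
//   typedef KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>
//       LhsFormat;
//   Allocator allocator;
//   BlockParams block_params;
//   // ... initialize block_params for the matrix sizes at hand ...
//   PackedSideBlock<LhsFormat> packed_lhs(Side::Lhs, &allocator,
//                                         block_params);
//   allocator.Commit();  // reservations become usable pointers
//   MatrixMap<const std::uint8_t, MapOrder::RowMajor> lhs(lhs_data, rows,
//                                                         depth);
//   PackLhs(&packed_lhs, lhs.block(0, 0, block_params.l2_rows,
//                                  block_params.l2_depth));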

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_sse.h"
#elif defined(GEMMLOWP_AVX2)
#include "pack_avx.h"
#elif defined(GEMMLOWP_MSA)
#include "pack_msa.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_