/proc/self/cwd/external/gemmlowp/internal/unpack.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // unpack.h: unpacking the result blocks computed by compute.h, |
16 | | // storing them into the destination matrix. |
17 | | |
18 | | #ifndef GEMMLOWP_INTERNAL_UNPACK_H_ |
19 | | #define GEMMLOWP_INTERNAL_UNPACK_H_ |
20 | | |
21 | | #include "allocator.h" |
22 | | #include "block_params.h" |
23 | | #include "output.h" |
24 | | #include "pack.h" |
25 | | |
26 | | #include <cmath> |
27 | | |
28 | | namespace gemmlowp { |
29 | | |
30 | | class PackedResult { |
31 | | public: |
32 | | PackedResult(Allocator* _allocator, const BlockParams& _block_params) |
33 | 0 | : allocator_(_allocator), block_params_(_block_params) { |
34 | 0 | matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * |
35 | 0 | block_params_.l2_cols); |
36 | 0 | } |
37 | | |
38 | 0 | ~PackedResult() {} |
39 | | |
40 | 0 | MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { |
41 | 0 | return MatrixMap<std::int32_t, MapOrder::ColMajor>( |
42 | 0 | allocator_->GetPointer<std::int32_t>(matrix_handle_), |
43 | 0 | block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); |
44 | 0 | } |
45 | | |
46 | 0 | MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { |
47 | 0 | return MatrixMap<const std::int32_t, MapOrder::ColMajor>( |
48 | 0 | allocator_->GetPointer<const std::int32_t>(matrix_handle_), |
49 | 0 | block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); |
50 | 0 | } |
51 | | |
52 | | private: |
53 | | Allocator* allocator_; |
54 | | Allocator::Handle matrix_handle_; |
55 | | const BlockParams& block_params_; |
56 | | }; |
57 | | |
58 | | struct MatrixBlockBounds { |
59 | | int start_row; |
60 | | int start_col; |
61 | | int rows; |
62 | | int cols; |
63 | | |
64 | | MatrixBlockBounds(int start_row_, int start_col_, int rows_, int cols_) |
65 | | : start_row(start_row_), |
66 | | start_col(start_col_), |
67 | | rows(rows_), |
68 | 0 | cols(cols_) {} |
69 | | }; |
70 | | |
71 | | template <int Rows, int Cols, typename SrcMapType> |
72 | | void PrefetchResultBlock(const SrcMapType& src, |
73 | | const VectorMap<const std::int32_t, VectorShape::Col>& |
74 | | lhs_sums_of_each_slice, |
75 | 0 | int src_row, int src_col) { |
76 | 0 | const std::int32_t* src_data = src.data(src_row, src_col); |
77 | 0 | const int src_stride = src.stride(); |
78 | 0 | const std::int32_t* lhs_sums_data = lhs_sums_of_each_slice.data(src_row); |
79 | 0 | for (int r = 0; r < Rows; r += 4) { |
80 | 0 | Prefetch(lhs_sums_data + r); |
81 | 0 | } |
82 | 0 | for (int c = 0; c < Cols; c++) { |
83 | 0 | for (int r = 0; r < Rows; r += 4) { |
84 | 0 | Prefetch(src_data + r + c * src_stride); |
85 | 0 | } |
86 | 0 | } |
87 | 0 | } Unexecuted instantiation: void gemmlowp::PrefetchResultBlock<8, 4, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, int, int) Unexecuted instantiation: void gemmlowp::PrefetchResultBlock<8, 1, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, int, int) Unexecuted instantiation: void gemmlowp::PrefetchResultBlock<8, 8, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, int, int) |
88 | | |
89 | | template <typename KernelFormat, typename RegisterBlockType, |
90 | | typename SrcMapType, typename LhsOffset, typename RhsOffset, |
91 | | typename OutputPipelineExecutorType, typename DstType> |
92 | | void UnpackResultBlock(const SrcMapType& src, |
93 | | const OutputPipelineExecutorType& executor, DstType* dst, |
94 | | const VectorMap<const std::int32_t, VectorShape::Col>& |
95 | | lhs_sums_of_each_slice, |
96 | | const VectorMap<const std::int32_t, VectorShape::Row>& |
97 | | rhs_sums_of_each_slice, |
98 | | const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
99 | | int depth, int src_row, int src_col, int src_global_row, |
100 | 0 | int src_global_col, int dst_row, int dst_col) { |
101 | 0 | using KernelLhsInputScalar = typename KernelFormat::Lhs::InputScalar; |
102 | 0 | using KernelLhsScalar = typename KernelFormat::Lhs::Scalar; |
103 | 0 | using KernelRhsInputScalar = typename KernelFormat::Rhs::InputScalar; |
104 | 0 | using KernelRhsScalar = typename KernelFormat::Rhs::Scalar; |
105 | 0 | static constexpr int KernelLhsZeroPointInput = |
106 | 0 | ZeroPointInputValue<KernelLhsInputScalar, KernelLhsScalar>::kValue; |
107 | 0 | static constexpr int KernelRhsZeroPointInput = |
108 | 0 | ZeroPointInputValue<KernelRhsInputScalar, KernelRhsScalar>::kValue; |
109 | 0 | auto acc = Load<RegisterBlockType>(src, src_row, src_col); |
110 | 0 | const auto& lhs_sums_of_each_slice_block = |
111 | 0 | LoadForBroadcasting<RegisterBlockType>(lhs_sums_of_each_slice, src_row); |
112 | 0 | const auto& rhs_sums_of_each_slice_block = |
113 | 0 | LoadForBroadcasting<RegisterBlockType>(rhs_sums_of_each_slice, src_col); |
114 | 0 | auto lhs_offset_block = |
115 | 0 | LoadForBroadcasting<RegisterBlockType>(lhs_offset, src_row); |
116 | 0 | auto rhs_offset_block = |
117 | 0 | LoadForBroadcasting<RegisterBlockType>(rhs_offset, src_col); |
118 | 0 | AddConstant<KernelLhsZeroPointInput>(&lhs_offset_block); |
119 | 0 | AddConstant<KernelRhsZeroPointInput>(&rhs_offset_block); |
120 | 0 | BroadcastMulAdd(lhs_sums_of_each_slice_block, rhs_offset_block, &acc); |
121 | 0 | for (int i = 0; i < decltype(rhs_offset_block)::kRegisterCount; i++) { |
122 | 0 | rhs_offset_block.buf.reg[i] = Mul(rhs_offset_block.buf.reg[i], depth); |
123 | 0 | } |
124 | 0 | BroadcastMulAdd(BroadcastAdd(rhs_sums_of_each_slice_block, rhs_offset_block), |
125 | 0 | lhs_offset_block, &acc); |
126 | 0 | executor.Execute(acc, dst, src_global_row, src_global_col, dst_row, dst_col); |
127 | 0 | } Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) Unexecuted instantiation: void gemmlowp::UnpackResultBlock<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::MatrixMap<int const, (gemmlowp::MapOrder)0> const&, gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> > const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorMap<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, int, int, int, int, int, int, int) |
128 | | |
129 | | template <typename KernelFormat, typename ResultBlockType, |
130 | | typename PackedResultType, typename LhsOffset, typename RhsOffset, |
131 | | typename OutputPipelineType> |
132 | | void UnpackResult(ResultBlockType* dst, const MatrixBlockBounds& dst_block, |
133 | | const PackedResultType& src, int depth, |
134 | | const std::int32_t* lhs_sums_of_each_slice_ptr, |
135 | | const std::int32_t* rhs_sums_of_each_slice_ptr, |
136 | | const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, |
137 | 0 | const OutputPipelineType& output_pipeline) { |
138 | 0 | ScopedProfilingLabel label(ResultBlockType::kOrder == MapOrder::ColMajor |
139 | 0 | ? "unpack to column-major" |
140 | 0 | : "unpack to row-major"); |
141 | 0 | assert(dst_block.start_row >= 0); |
142 | 0 | assert(dst_block.start_row + dst_block.rows <= dst->rows()); |
143 | 0 | assert(dst_block.start_col >= 0); |
144 | 0 | assert(dst_block.start_col + dst_block.cols <= dst->cols()); |
145 | 0 | const auto src_map = src.Map(); |
146 | 0 | const VectorMap<const std::int32_t, VectorShape::Col> lhs_sums_of_each_slice( |
147 | 0 | lhs_sums_of_each_slice_ptr, dst_block.rows); |
148 | 0 | const VectorMap<const std::int32_t, VectorShape::Row> rhs_sums_of_each_slice( |
149 | 0 | rhs_sums_of_each_slice_ptr, dst_block.cols); |
150 | 0 | using Int32x1x1 = RegisterBlock<std::int32_t, 1, 1>; |
151 | 0 | using Int32x4x1 = RegisterBlock<std::int32_t, 4, 1>; |
152 | 0 | using Int32x8x1 = RegisterBlock<std::int32_t, 8, 1>; |
153 | 0 | using Int32x1x4 = RegisterBlock<std::int32_t, 1, 4>; |
154 | 0 | using Int32x4x4 = RegisterBlock<std::int32_t, 4, 4>; |
155 | 0 | using Int32x8x4 = RegisterBlock<std::int32_t, 8, 4>; |
156 | |
|
157 | 0 | using DstScalarType = typename ResultBlockType::Scalar; |
158 | 0 | using DstScalarx8x8 = RegisterBlock<DstScalarType, 8, 8>; |
159 | |
|
160 | 0 | OutputPipelineExecutor<OutputPipelineType, Int32x1x1> |
161 | 0 | output_pipeline_executor_1x1(output_pipeline); |
162 | 0 | OutputPipelineExecutor<OutputPipelineType, Int32x4x1> |
163 | 0 | output_pipeline_executor_4x1(output_pipeline); |
164 | 0 | OutputPipelineExecutor<OutputPipelineType, Int32x8x1> |
165 | 0 | output_pipeline_executor_8x1(output_pipeline); |
166 | 0 | OutputPipelineExecutor<OutputPipelineType, Int32x1x4> |
167 | 0 | output_pipeline_executor_1x4(output_pipeline); |
168 | 0 | OutputPipelineExecutor<OutputPipelineType, Int32x4x4> |
169 | 0 | output_pipeline_executor_4x4(output_pipeline); |
170 | 0 | OutputPipelineExecutor<OutputPipelineType, Int32x8x4> |
171 | 0 | output_pipeline_executor_8x4(output_pipeline); |
172 | |
|
173 | 0 | int c8 = 0; |
174 | 0 | if (ResultBlockType::kOrder == MapOrder::RowMajor) { |
175 | 0 | for (; c8 <= dst_block.cols - 8; c8 += 8) { |
176 | 0 | PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, 0, c8); |
177 | 0 | int r = 0; |
178 | 0 | for (; r <= dst_block.rows - 8; r += 8) { |
179 | 0 | const int global_row = r + dst_block.start_row; |
180 | 0 | PrefetchResultBlock<8, 8>(src_map, lhs_sums_of_each_slice, r + 8, c8); |
181 | 0 | DstScalarType dst_colmajor_buf[64]; |
182 | 0 | MatrixMap<DstScalarType, MapOrder::ColMajor> dst_colmajor_map( |
183 | 0 | dst_colmajor_buf, 8, 8); |
184 | 0 | for (int cx = 0; cx < 8; cx += 4) { |
185 | 0 | const int c = c8 + cx; |
186 | 0 | const int global_col = c + dst_block.start_col; |
187 | 0 | UnpackResultBlock<KernelFormat, Int32x8x4>( |
188 | 0 | src_map, output_pipeline_executor_8x4, &dst_colmajor_map, |
189 | 0 | lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, |
190 | 0 | rhs_offset, depth, r, c, global_row, global_col, 0, cx); |
191 | 0 | } |
192 | 0 | StoreFinalOutput(LoadContiguous<DstScalarx8x8>(dst_colmajor_buf), dst, |
193 | 0 | r + dst_block.start_row, c8 + dst_block.start_col); |
194 | 0 | } |
195 | 0 | for (; r <= dst_block.rows - 4; r += 4) { |
196 | 0 | const int global_row = r + dst_block.start_row; |
197 | 0 | for (int cx = 0; cx < 8; cx += 4) { |
198 | 0 | const int c = c8 + cx; |
199 | 0 | const int global_col = c + dst_block.start_col; |
200 | 0 | UnpackResultBlock<KernelFormat, Int32x4x4>( |
201 | 0 | src_map, output_pipeline_executor_4x4, dst, |
202 | 0 | lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, |
203 | 0 | rhs_offset, depth, r, c, global_row, global_col, global_row, |
204 | 0 | global_col); |
205 | 0 | } |
206 | 0 | } |
207 | 0 | for (; r < dst_block.rows; r++) { |
208 | 0 | const int global_row = r + dst_block.start_row; |
209 | 0 | for (int cx = 0; cx < 8; cx += 4) { |
210 | 0 | const int c = c8 + cx; |
211 | 0 | const int global_col = c + dst_block.start_col; |
212 | 0 | UnpackResultBlock<KernelFormat, Int32x1x4>( |
213 | 0 | src_map, output_pipeline_executor_1x4, dst, |
214 | 0 | lhs_sums_of_each_slice, rhs_sums_of_each_slice, lhs_offset, |
215 | 0 | rhs_offset, depth, r, c, global_row, global_col, global_row, |
216 | 0 | global_col); |
217 | 0 | } |
218 | 0 | } |
219 | 0 | } |
220 | 0 | } |
221 | 0 | int c = c8; |
222 | 0 | for (; c <= dst_block.cols - 4; c += 4) { |
223 | 0 | const int global_col = c + dst_block.start_col; |
224 | 0 | PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, 0, c); |
225 | 0 | int r = 0; |
226 | 0 | for (; r <= dst_block.rows - 8; r += 8) { |
227 | 0 | const int global_row = r + dst_block.start_row; |
228 | 0 | PrefetchResultBlock<8, 4>(src_map, lhs_sums_of_each_slice, r + 8, c); |
229 | 0 | UnpackResultBlock<KernelFormat, Int32x8x4>( |
230 | 0 | src_map, output_pipeline_executor_8x4, dst, lhs_sums_of_each_slice, |
231 | 0 | rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
232 | 0 | global_row, global_col, global_row, global_col); |
233 | 0 | } |
234 | 0 | for (; r <= dst_block.rows - 4; r += 4) { |
235 | 0 | const int global_row = r + dst_block.start_row; |
236 | 0 | UnpackResultBlock<KernelFormat, Int32x4x4>( |
237 | 0 | src_map, output_pipeline_executor_4x4, dst, lhs_sums_of_each_slice, |
238 | 0 | rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
239 | 0 | global_row, global_col, global_row, global_col); |
240 | 0 | } |
241 | 0 | for (; r < dst_block.rows; r++) { |
242 | 0 | const int global_row = r + dst_block.start_row; |
243 | 0 | UnpackResultBlock<KernelFormat, Int32x1x4>( |
244 | 0 | src_map, output_pipeline_executor_1x4, dst, lhs_sums_of_each_slice, |
245 | 0 | rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
246 | 0 | global_row, global_col, global_row, global_col); |
247 | 0 | } |
248 | 0 | } |
249 | 0 | for (; c < dst_block.cols; c++) { |
250 | 0 | const int global_col = c + dst_block.start_col; |
251 | 0 | PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, 0, c); |
252 | 0 | int r = 0; |
253 | 0 | for (; r <= dst_block.rows - 8; r += 8) { |
254 | 0 | const int global_row = r + dst_block.start_row; |
255 | 0 | PrefetchResultBlock<8, 1>(src_map, lhs_sums_of_each_slice, r + 8, c); |
256 | 0 | UnpackResultBlock<KernelFormat, Int32x8x1>( |
257 | 0 | src_map, output_pipeline_executor_8x1, dst, lhs_sums_of_each_slice, |
258 | 0 | rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
259 | 0 | global_row, global_col, global_row, global_col); |
260 | 0 | } |
261 | 0 | for (; r <= dst_block.rows - 4; r += 4) { |
262 | 0 | const int global_row = r + dst_block.start_row; |
263 | 0 | UnpackResultBlock<KernelFormat, Int32x4x1>( |
264 | 0 | src_map, output_pipeline_executor_4x1, dst, lhs_sums_of_each_slice, |
265 | 0 | rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
266 | 0 | global_row, global_col, global_row, global_col); |
267 | 0 | } |
268 | 0 | for (; r < dst_block.rows; r++) { |
269 | 0 | const int global_row = r + dst_block.start_row; |
270 | 0 | UnpackResultBlock<KernelFormat, Int32x1x1>( |
271 | 0 | src_map, output_pipeline_executor_1x1, dst, lhs_sums_of_each_slice, |
272 | 0 | rhs_sums_of_each_slice, lhs_offset, rhs_offset, depth, r, c, |
273 | 0 | global_row, global_col, global_row, global_col); |
274 | 0 | } |
275 | 0 | } |
276 | 0 | } Unexecuted instantiation: void gemmlowp::UnpackResult<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>, gemmlowp::PackedResult, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, std::__1::tuple<> >(gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, gemmlowp::MatrixBlockBounds const&, gemmlowp::PackedResult const&, int, int const*, int const*, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, std::__1::tuple<> const&) Unexecuted instantiation: void gemmlowp::UnpackResult<gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1>, gemmlowp::KernelSideFormat<gemmlowp::CellFormat<4, 16, (gemmlowp::CellOrder)1>, 1> >, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>, gemmlowp::PackedResult, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0>, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1>, std::__1::tuple<> >(gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, gemmlowp::MatrixBlockBounds const&, gemmlowp::PackedResult const&, int, int const*, int const*, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)0> const&, gemmlowp::VectorDup<int const, (gemmlowp::VectorShape)1> const&, std::__1::tuple<> const&) |
277 | | |
278 | | } // end namespace gemmlowp |
279 | | |
280 | | #endif // GEMMLOWP_INTERNAL_UNPACK_H_ |