/proc/self/cwd/external/gemmlowp/internal/output.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // output.h: processing the 32-bit accumulators output by the unpack |
16 | | // stage, obtaining the final result matrix entries and storing them into |
17 | | // the destination matrix. |
18 | | |
19 | | #ifndef GEMMLOWP_INTERNAL_OUTPUT_H_ |
20 | | #define GEMMLOWP_INTERNAL_OUTPUT_H_ |
21 | | |
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <tuple>
#include <type_traits>
#include <typeinfo>
26 | | |
27 | | #include "../fixedpoint/fixedpoint.h" |
28 | | #include "../public/output_stages.h" |
29 | | #include "simd_wrappers.h" |
30 | | |
31 | | namespace gemmlowp { |
32 | | |
// Primary template for evaluating a single output stage on a register buffer.
// It carries no implementation: every supported (stage, buffer type) pair is
// expected to be provided by a partial specialization, and instantiating this
// fallback is reported as a compile-time error.
template <typename OutputStage, typename InputBufferType>
struct OutputStageEvalBufferImpl {
  // The condition depends on a template parameter so that the assertion is
  // only checked when this fallback actually gets instantiated.
  static_assert(
      std::is_same<void, InputBufferType>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");
};
42 | | |
43 | | template <typename OutputStage, typename InputType> |
44 | | struct OutputStageEvalImpl { |
45 | | static constexpr int kRows = InputType::kRows; |
46 | | static constexpr int kCols = InputType::kCols; |
47 | | using InputBufferType = typename InputType::BufferType; |
48 | | using BufferEvalImplType = |
49 | | OutputStageEvalBufferImpl<OutputStage, InputBufferType>; |
50 | | using OutputBufferType = typename BufferEvalImplType::OutputType; |
51 | | using OutputScalarType = typename OutputBufferType::ScalarType; |
52 | | using OutputType = RegisterBlock<OutputScalarType, kRows, kCols>; |
53 | | |
54 | | OutputStageEvalImpl(const OutputStage& s) : buffer_eval_impl(s) {} |
55 | | |
56 | | OutputType Eval(InputType input, int, int) const { |
57 | | OutputType output; |
58 | | output.buf = buffer_eval_impl.Eval(input.buf); |
59 | | return output; |
60 | | } |
61 | | |
62 | | const BufferEvalImplType buffer_eval_impl; |
63 | | }; |
64 | | |
65 | | template <int Size> |
66 | | struct OutputStageEvalBufferImpl<OutputStageQuantizeDownInt32ToUint8Scale, |
67 | | RegisterBuffer<std::int32_t, Size>> { |
68 | | using InputType = RegisterBuffer<std::int32_t, Size>; |
69 | | using OutputType = RegisterBuffer<std::int32_t, Size>; |
70 | | |
71 | | typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; |
72 | | |
73 | | OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} |
74 | | |
75 | | OutputType Eval(InputType input) const { |
76 | | const int result_shift = output_stage.result_shift; |
77 | | const std::int32_t result_mult_int = output_stage.result_mult_int; |
78 | | using RegisterType = typename InputType::RegisterType; |
79 | | const RegisterType result_offset = |
80 | | Dup<RegisterType>(output_stage.result_offset); |
81 | | OutputType output; |
82 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
83 | | output.reg[i] = RoundingDivideByPOT( |
84 | | Mul(Add(input.reg[i], result_offset), result_mult_int), result_shift); |
85 | | } |
86 | | return output; |
87 | | } |
88 | | |
89 | | const OutputStage& output_stage; |
90 | | }; |
91 | | |
92 | | template <int Rows, int Cols, VectorShape Shape> |
93 | | struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8ScalePC<Shape>, |
94 | | RegisterBlock<std::int32_t, Rows, Cols>> { |
95 | | typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; |
96 | | typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; |
97 | | typedef OutputStageQuantizeDownInt32ToUint8ScalePC<Shape> OutputStage; |
98 | | |
99 | | OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
100 | | |
101 | | OutputType Eval(InputType input, int row, int col) const { |
102 | | OutputType output; |
103 | | const int result_shift = output_stage.result_shift; |
104 | | const int pos = Shape == VectorShape::Col ? row : col; |
105 | | const auto result_mult_int = |
106 | | LoadForBroadcasting<InputType>(output_stage.result_mult_int, pos); |
107 | | const auto result_offset = |
108 | | LoadForBroadcasting<InputType>(output_stage.result_offset, pos); |
109 | | const auto dividend = BroadcastMul<InputType>( |
110 | | BroadcastAdd<InputType>(input, result_offset), result_mult_int); |
111 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
112 | | output.buf.reg[i] = |
113 | | RoundingDivideByPOT(dividend.buf.reg[i], result_shift); |
114 | | } |
115 | | return output; |
116 | | } |
117 | | |
118 | | const OutputStage& output_stage; |
119 | | }; |
120 | | |
121 | | template <int Size> |
122 | | struct OutputStageEvalBufferImpl< |
123 | | OutputStageQuantizeDownInt32ByFixedPoint, |
124 | | RegisterBuffer<std::int32_t, Size>> { |
125 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
126 | | typedef RegisterBuffer<std::int32_t, Size> OutputType; |
127 | | |
128 | | typedef OutputStageQuantizeDownInt32ByFixedPoint OutputStage; |
129 | | |
130 | | OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} |
131 | | |
132 | | OutputType Eval(InputType input) const { |
133 | | OutputType output; |
134 | | using RegisterType = typename InputType::RegisterType; |
135 | | const RegisterType result_offset_after_shift = |
136 | | Dup<RegisterType>(output_stage.result_offset_after_shift); |
137 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
138 | | const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( |
139 | | input.reg[i], output_stage.result_fixedpoint_multiplier); |
140 | | output.reg[i] = |
141 | | Add(RoundingDivideByPOT(mulhigh_val, output_stage.result_shift), |
142 | | result_offset_after_shift); |
143 | | } |
144 | | return output; |
145 | | } |
146 | | |
147 | | const OutputStage& output_stage; |
148 | | }; |
149 | | |
150 | | template <int Size> |
151 | | struct OutputStageEvalBufferImpl<OutputStageScaleInt32ByFixedPointAndExponent, |
152 | | RegisterBuffer<std::int32_t, Size>> { |
153 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
154 | | typedef RegisterBuffer<std::int32_t, Size> OutputType; |
155 | | |
156 | | typedef OutputStageScaleInt32ByFixedPointAndExponent OutputStage; |
157 | | |
158 | | OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { |
159 | | left_shift = std::max(0, output_stage.result_exponent); |
160 | | right_shift = std::max(0, -output_stage.result_exponent); |
161 | | } |
162 | | |
163 | | OutputType Eval(InputType input) const { |
164 | | OutputType output; |
165 | | using RegisterType = typename InputType::RegisterType; |
166 | | const RegisterType result_offset_after_shift = |
167 | | Dup<RegisterType>(output_stage.result_offset_after_shift); |
168 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
169 | | const RegisterType mulhigh_val = SaturatingRoundingDoublingHighMul( |
170 | | ShiftLeft(input.reg[i], left_shift), |
171 | | output_stage.result_fixedpoint_multiplier); |
172 | | output.reg[i] = Add(RoundingDivideByPOT(mulhigh_val, right_shift), |
173 | | result_offset_after_shift); |
174 | | } |
175 | | return output; |
176 | | } |
177 | | |
178 | | const OutputStage& output_stage; |
179 | | int left_shift; |
180 | | int right_shift; |
181 | | }; |
182 | | |
183 | | template <int Rows, int Cols, VectorShape Shape> |
184 | | struct OutputStageEvalImpl< |
185 | | OutputStageScaleInt32ByFixedPointAndExponentPC<Shape>, |
186 | | RegisterBlock<std::int32_t, Rows, Cols>> { |
187 | | typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; |
188 | | typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; |
189 | | |
190 | | typedef OutputStageScaleInt32ByFixedPointAndExponentPC<Shape> OutputStage; |
191 | | |
192 | | OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
193 | | |
194 | | OutputType Eval(InputType input, int row, int col) const { |
195 | | OutputType output; |
196 | | const int pos = Shape == VectorShape::Row ? col : row; |
197 | | using RegisterType = typename InputType::RegisterType; |
198 | | const RegisterType result_offset_after_shift = |
199 | | Dup<RegisterType>(output_stage.result_offset_after_shift); |
200 | | auto left_shift = |
201 | | LoadForBroadcasting<InputType>(output_stage.result_exponent, pos); |
202 | | auto right_shift = |
203 | | LoadForBroadcasting<InputType>(output_stage.result_exponent, pos); |
204 | | const auto result_fixedpoint_multiplier = LoadForBroadcasting<InputType>( |
205 | | output_stage.result_fixedpoint_multiplier, pos); |
206 | | for (int i = 0; i < decltype(left_shift)::kRegisterCount; i++) { |
207 | | left_shift.buf.reg[i] = Max(left_shift.buf.reg[i], 0); |
208 | | right_shift.buf.reg[i] = Max(-right_shift.buf.reg[i], 0); |
209 | | } |
210 | | const auto mulhigh_val = BroadcastSaturatingRoundingDoublingHighMul( |
211 | | BroadcastShiftLeft(input, left_shift), result_fixedpoint_multiplier); |
212 | | const auto rdpot_val = |
213 | | BroadcastRoundingDivideByPOT(mulhigh_val, right_shift); |
214 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
215 | | output.buf.reg[i] = Add(rdpot_val.buf.reg[i], result_offset_after_shift); |
216 | | } |
217 | | return output; |
218 | | } |
219 | | |
220 | | const OutputStage& output_stage; |
221 | | }; |
222 | | |
223 | | // Implementation of OutputStageSaturatingCastToUint8 for scalar data. |
224 | | template <int Size> |
225 | | struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToUint8, |
226 | | RegisterBuffer<std::int32_t, Size>> { |
227 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
228 | | typedef RegisterBuffer<std::uint8_t, Size> OutputType; |
229 | | static_assert(InputType::kRegisterLanes == 1, |
230 | | "This path is only for scalar values"); |
231 | | |
232 | | typedef OutputStageSaturatingCastToUint8 OutputStage; |
233 | | |
234 | | OutputStageEvalBufferImpl(const OutputStage&) {} |
235 | | |
236 | | OutputType Eval(InputType input) const { |
237 | | OutputType output; |
238 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
239 | | std::int32_t data = input.reg[i]; |
240 | | output.reg[i] = data > 255 ? 255 : data < 0 ? 0 : data; |
241 | | } |
242 | | return output; |
243 | | } |
244 | | }; |
245 | | |
246 | | // Implementation of OutputStageSaturatingCastToInt8 for scalar data. |
247 | | template <int Size> |
248 | | struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt8, |
249 | | RegisterBuffer<std::int32_t, Size>> { |
250 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
251 | | typedef RegisterBuffer<std::int8_t, Size> OutputType; |
252 | | static_assert(InputType::kRegisterLanes == 1, |
253 | | "This path is only for scalar values"); |
254 | | |
255 | | typedef OutputStageSaturatingCastToInt8 OutputStage; |
256 | | |
257 | | OutputStageEvalBufferImpl(const OutputStage&) {} |
258 | | |
259 | | OutputType Eval(InputType input) const { |
260 | | OutputType output; |
261 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
262 | | std::int32_t data = input.reg[i]; |
263 | | output.reg[i] = data > 127 ? 127 : data < -128 ? -128 : data; |
264 | | } |
265 | | return output; |
266 | | } |
267 | | }; |
268 | | |
269 | | // Implementation of OutputStageSaturatingCastToInt16 for scalar data. |
270 | | template <int Size> |
271 | | struct OutputStageEvalBufferImpl<OutputStageSaturatingCastToInt16, |
272 | | RegisterBuffer<std::int32_t, Size>> { |
273 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
274 | | typedef RegisterBuffer<std::int16_t, Size> OutputType; |
275 | | static_assert(InputType::kRegisterLanes == 1, |
276 | | "This path is only for scalar values"); |
277 | | |
278 | | typedef OutputStageSaturatingCastToInt16 OutputStage; |
279 | | |
280 | | OutputStageEvalBufferImpl(const OutputStage&) {} |
281 | | |
282 | | OutputType Eval(InputType input) const { |
283 | | OutputType output; |
284 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
285 | | std::int32_t data = input.reg[i]; |
286 | | output.reg[i] = data > 32767 ? 32767 : data < -32768 ? -32768 : data; |
287 | | } |
288 | | return output; |
289 | | } |
290 | | }; |
291 | | |
292 | | // Implementation of OutputStageTruncatingCastToUint8 for scalar data |
293 | | template <int Size> |
294 | | struct OutputStageEvalBufferImpl<OutputStageTruncatingCastToUint8, |
295 | | RegisterBuffer<std::int32_t, Size>> { |
296 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
297 | | typedef RegisterBuffer<std::uint8_t, Size> OutputType; |
298 | | static_assert(InputType::kRegisterLanes == 1, |
299 | | "This path is only for scalar values"); |
300 | | |
301 | | typedef OutputStageTruncatingCastToUint8 OutputStage; |
302 | | |
303 | | OutputStageEvalBufferImpl(const OutputStage&) {} |
304 | | |
305 | | OutputType Eval(InputType input) const { |
306 | | OutputType output; |
307 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
308 | | output.reg[i] = input.reg[i]; |
309 | | } |
310 | | return output; |
311 | | } |
312 | | }; |
313 | | |
314 | | template <int Rows, int Cols, typename VectorType> |
315 | | struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, |
316 | | RegisterBlock<std::int32_t, Rows, Cols>> { |
317 | | typedef RegisterBlock<std::int32_t, Rows, Cols> InputType; |
318 | | typedef RegisterBlock<std::int32_t, Rows, Cols> OutputType; |
319 | | typedef OutputStageBiasAddition<VectorType> OutputStage; |
320 | | |
321 | | OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} |
322 | | |
323 | | OutputType Eval(InputType input, int row, int col) const { |
324 | | const int pos = VectorType::kShape == VectorShape::Row ? col : row; |
325 | | return BroadcastAdd<InputType>( |
326 | | input, LoadForBroadcasting<InputType>(output_stage.bias_vector, pos)); |
327 | | } |
328 | | |
329 | | const OutputStage& output_stage; |
330 | | }; |
331 | | |
332 | | template <int Size> |
333 | | struct OutputStageEvalBufferImpl<OutputStageClamp, |
334 | | RegisterBuffer<std::int32_t, Size>> { |
335 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
336 | | typedef RegisterBuffer<std::int32_t, Size> OutputType; |
337 | | |
338 | | typedef OutputStageClamp OutputStage; |
339 | | |
340 | | OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) {} |
341 | | |
342 | | OutputType Eval(InputType input) const { |
343 | | using RegisterType = typename InputType::RegisterType; |
344 | | const RegisterType min = Dup<RegisterType>(output_stage.min); |
345 | | const RegisterType max = Dup<RegisterType>(output_stage.max); |
346 | | OutputType output; |
347 | | for (int i = 0; i < InputType::kRegisterCount; i++) { |
348 | | output.reg[i] = Min(Max(input.reg[i], min), max); |
349 | | } |
350 | | return output; |
351 | | } |
352 | | |
353 | | const OutputStage& output_stage; |
354 | | }; |
355 | | |
356 | | template <int Size> |
357 | | struct OutputStageEvalBufferImpl<OutputStageTanh, |
358 | | RegisterBuffer<std::int32_t, Size>> { |
359 | | typedef RegisterBuffer<std::int32_t, Size> InputType; |
360 | | typedef RegisterBuffer<std::int32_t, Size> OutputType; |
361 | | using RegisterType = typename InputType::RegisterType; |
362 | | typedef RegisterType DataType; |
363 | | typedef OutputStageTanh OutputStage; |
364 | | |
365 | | OutputStageEvalBufferImpl(const OutputStage& s) : output_stage(s) { |
366 | | const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; |
367 | | const std::int32_t real_amplitude_as_int32 = |
368 | | output_stage.real_amplitude_as_int32; |
369 | | |
370 | | input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32; |
371 | | input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32; |
372 | | output_min = real_zero_as_int32 - real_amplitude_as_int32; |
373 | | output_max = real_zero_as_int32 + real_amplitude_as_int32; |
374 | | |
375 | | double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32; |
376 | | inverse_amplitude_neg_exponent = 0; |
377 | | while (inverse_amplitude_normalized_double < 0.5) { |
378 | | inverse_amplitude_normalized_double *= 2; |
379 | | inverse_amplitude_neg_exponent++; |
380 | | } |
381 | | inverse_amplitude_normalized = FixedPoint<DataType, 0>::FromDouble( |
382 | | inverse_amplitude_normalized_double); |
383 | | |
384 | | double amplitude_normalized_double = real_amplitude_as_int32; |
385 | | amplitude_exponent = 0; |
386 | | while (amplitude_normalized_double >= 1.0) { |
387 | | amplitude_normalized_double *= 0.5; |
388 | | amplitude_exponent++; |
389 | | } |
390 | | amplitude_normalized = |
391 | | FixedPoint<DataType, 0>::FromDouble(amplitude_normalized_double); |
392 | | } |
393 | | |
394 | | OutputType Eval(InputType input) const { |
395 | | const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; |
396 | | |
397 | | typedef FixedPoint<DataType, 3> F3; |
398 | | typedef FixedPoint<DataType, 0> F0; |
399 | | |
400 | | OutputType output; |
401 | | |
402 | | for (int i = 0; i < OutputType::kRegisterCount; i++) { |
403 | | // fixed-point affine transformation |
404 | | DataType input_centered = |
405 | | Sub(input.reg[i], Dup<DataType>(real_zero_as_int32)); |
406 | | F3 fixedpoint_input = |
407 | | F3::FromRaw(input_centered) * inverse_amplitude_normalized; |
408 | | // left shift |
409 | | fixedpoint_input.raw() = ShiftLeft(fixedpoint_input.raw(), |
410 | | 28 - inverse_amplitude_neg_exponent); |
411 | | // fixed-point tanh and multiplication |
412 | | F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized; |
413 | | // right shift |
414 | | DataType int32_output = |
415 | | Add(Dup<DataType>(real_zero_as_int32), |
416 | | ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent)); |
417 | | |
418 | | DataType mask_if_below_cutoff_min = |
419 | | MaskIfLessThanOrEqual(input.reg[i], Dup<DataType>(input_cutoff_min)); |
420 | | DataType mask_if_above_cutoff_max = MaskIfGreaterThanOrEqual( |
421 | | input.reg[i], Dup<DataType>(input_cutoff_max)); |
422 | | |
423 | | output.reg[i] = SelectUsingMask( |
424 | | mask_if_below_cutoff_min, Dup<DataType>(output_min), |
425 | | SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max), |
426 | | int32_output)); |
427 | | } |
428 | | return output; |
429 | | } |
430 | | |
431 | | const OutputStage& output_stage; |
432 | | std::int32_t input_cutoff_min, input_cutoff_max; |
433 | | std::int32_t output_min, output_max; |
434 | | FixedPoint<DataType, 0> inverse_amplitude_normalized; |
435 | | int inverse_amplitude_neg_exponent; |
436 | | FixedPoint<DataType, 0> amplitude_normalized; |
437 | | int amplitude_exponent; |
438 | | }; |
439 | | |
440 | | // OutputPipelineOutputType is a helper to determine the output data type of a |
441 | | // pipeline, for a |
442 | | // given input data type. It is a recursive template; see the explanation on |
443 | | // OutputPipelineEvalImpl below. |
444 | | template <typename OutputPipelineType, int FirstStage, typename InputType, |
445 | | bool StopRecursion = |
446 | | FirstStage == std::tuple_size<OutputPipelineType>::value> |
447 | | struct OutputPipelineOutputType { |
448 | | typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type |
449 | | FirstStageType; |
450 | | typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType |
451 | | FirstStageOutputType; |
452 | | typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1, |
453 | | FirstStageOutputType>::Type Type; |
454 | | }; |
455 | | |
456 | | template <typename OutputPipelineType, int FirstStage, typename InputType> |
457 | | struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, |
458 | | true> { |
459 | | typedef InputType Type; |
460 | | }; |
461 | | |
462 | | // OutputPipelineEvalImpl is a helper to implement the evaluation of |
463 | | // the whole pipeline. It is a recursive template to implement compile-time |
464 | | // unrolling of the loop over all pipeline stages. The 'FirstStage' parameter |
465 | | // is how we implement recursion: each specialization implements only |
466 | | // evaluation starting at 'FirstStage'. The StopRecursion parameter is just a |
467 | | // helper to implement the termination of the recursion as a partial |
468 | | // specialization below. |
469 | | template <typename OutputPipelineType, int FirstStage, typename InputType, |
470 | | bool StopRecursion = |
471 | | FirstStage == std::tuple_size<OutputPipelineType>::value> |
472 | | struct OutputPipelineEvalImpl { |
473 | | typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type |
474 | | FirstStageType; |
475 | | typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType |
476 | | FirstStageOutputType; |
477 | | typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage, |
478 | | InputType>::Type OutputType; |
479 | | |
480 | | OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline) |
481 | | : head_impl(std::get<FirstStage>(output_pipeline)), |
482 | | tail_impl(output_pipeline) {} |
483 | | |
484 | | OutputType Eval(InputType input, int row, int col) const { |
485 | | // Evaluate the first stage. |
486 | | FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col); |
487 | | // Recurse into the remaining stages. |
488 | | return tail_impl.Eval(first_stage_output, row, col); |
489 | | } |
490 | | |
491 | | const OutputStageEvalImpl<FirstStageType, InputType> head_impl; |
492 | | const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1, |
493 | | FirstStageOutputType> |
494 | | tail_impl; |
495 | | }; |
496 | | |
497 | | // Specialization on 'StopRecursion' for terminating the recursion. |
498 | | template <typename OutputPipelineType, int FirstStage, typename InputType> |
499 | | struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> { |
500 | 0 | OutputPipelineEvalImpl(const OutputPipelineType&) {} Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 1, 1>, true>::OutputPipelineEvalImpl(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 4, 1>, true>::OutputPipelineEvalImpl(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 8, 1>, true>::OutputPipelineEvalImpl(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 1, 4>, true>::OutputPipelineEvalImpl(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 4, 4>, true>::OutputPipelineEvalImpl(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 8, 4>, true>::OutputPipelineEvalImpl(std::__1::tuple<> const&) |
501 | | |
502 | 0 | InputType Eval(InputType input, int, int) const { |
503 | | // Terminating the recursion. |
504 | 0 | return input; |
505 | 0 | } Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 8, 4>, true>::Eval(gemmlowp::RegisterBlock<int, 8, 4>, int, int) const Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 4, 4>, true>::Eval(gemmlowp::RegisterBlock<int, 4, 4>, int, int) const Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 1, 4>, true>::Eval(gemmlowp::RegisterBlock<int, 1, 4>, int, int) const Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 8, 1>, true>::Eval(gemmlowp::RegisterBlock<int, 8, 1>, int, int) const Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 4, 1>, true>::Eval(gemmlowp::RegisterBlock<int, 4, 1>, int, int) const Unexecuted instantiation: gemmlowp::OutputPipelineEvalImpl<std::__1::tuple<>, 0, gemmlowp::RegisterBlock<int, 1, 1>, true>::Eval(gemmlowp::RegisterBlock<int, 1, 1>, int, int) const |
506 | | }; |
507 | | |
// Primary template for storing a register block into a destination. It only
// reports a compile-time error when instantiated: the supported block types
// are handled by partial specializations.
template <typename RegisterBlockType, typename DstType>
struct StoreFinalOutputImpl {
  // The condition depends on a template parameter so that the assertion is
  // only checked when this fallback actually gets instantiated.
  static_assert(std::is_same<void, RegisterBlockType>::value,
                "This generic impl should never be hit");
};
513 | | |
514 | | template <typename ScalarType, int Rows, int Cols, typename DstType> |
515 | | struct StoreFinalOutputImpl<RegisterBlock<ScalarType, Rows, Cols>, DstType> { |
516 | | using RegisterBlockType = RegisterBlock<ScalarType, Rows, Cols>; |
517 | | static void Run(const RegisterBlockType& src, DstType* dst, int row, |
518 | 0 | int col) { |
519 | 0 | for (int r = 0; r < Rows; r++) { |
520 | 0 | for (int c = 0; c < Cols; c++) { |
521 | 0 | *dst->data(row + r, col + c) = src.buf.reg[r + c * Rows]; |
522 | 0 | } |
523 | 0 | } |
524 | 0 | } Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 8, 4> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 4, 4> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 1, 4> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 8, 1> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 4, 1> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 1, 1> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 8, 8>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 8, 8> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 4, 4> const&, gemmlowp::MatrixMap<int, 
(gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 1, 4> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 8, 4> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 8, 1> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 4, 1> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >::Run(gemmlowp::RegisterBlock<int, 1, 1> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: gemmlowp::StoreFinalOutputImpl<gemmlowp::RegisterBlock<int, 8, 8>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >::Run(gemmlowp::RegisterBlock<int, 8, 8> const&, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) |
525 | | }; |
526 | | |
527 | | // StoreFinalOutput takes the final value at the end of the output pipeline and |
528 | | // stores it into the destination matrix. It can be specialized for different |
529 | | // data types; the generic implementation here is typically used only for plain |
530 | | // old scalar (not SIMD) types. |
531 | | template <typename RegisterBlockType, typename DstType> |
532 | 0 | void StoreFinalOutput(RegisterBlockType src, DstType* dst, int row, int col) { |
533 | 0 | StoreFinalOutputImpl<RegisterBlockType, DstType>::Run(src, dst, row, col); |
534 | 0 | } Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 8, 8>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 8, 8>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void 
gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int) Unexecuted instantiation: void gemmlowp::StoreFinalOutput<gemmlowp::RegisterBlock<int, 8, 8>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 8, 8>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int) |
535 | | |
536 | | template <typename OutputPipelineType, typename InputType> |
537 | | struct OutputPipelineExecutor { |
538 | | OutputPipelineExecutor(const OutputPipelineType& output_pipeline) |
539 | 0 | : output_pipeline_eval_impl_(output_pipeline) {} Unexecuted instantiation: gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >::OutputPipelineExecutor(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >::OutputPipelineExecutor(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >::OutputPipelineExecutor(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >::OutputPipelineExecutor(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >::OutputPipelineExecutor(std::__1::tuple<> const&) Unexecuted instantiation: gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >::OutputPipelineExecutor(std::__1::tuple<> const&) |
540 | | |
541 | | // Execute is the entry point into the output pipeline evaluation |
542 | | // code. It should be the only thing that unpack code calls. It takes the |
543 | | // result |
544 | | // of the unpack stage and stores it into the destination matrix. |
545 | | template <typename DstType> |
546 | | void Execute(InputType input, DstType* dst, int src_global_row, |
547 | 0 | int src_global_col, int dst_row, int dst_col) const { |
548 | | // Statically assert that the output pipeline matches the given destination |
549 | | // matrix's scalar type. |
550 | 0 | typedef typename OutputPipelineOutputType< |
551 | 0 | OutputPipelineType, 0, InputType>::Type::BufferType::ScalarType |
552 | |
|
553 | 0 | ScalarOutputType; |
554 | 0 | typedef typename DstType::Scalar ScalarDstType; |
555 | 0 | static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value, |
556 | 0 | "mismatched destination scalar type and output pipeline"); |
557 | | |
558 | | // Evaluate the output pipeline. |
559 | 0 | auto output = |
560 | 0 | output_pipeline_eval_impl_.Eval(input, src_global_row, src_global_col); |
561 | | // Store the result into the destination matrix. |
562 | 0 | StoreFinalOutput(output, dst, dst_row, dst_col); |
563 | 0 | } Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0> >(gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)0>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 4> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 4, 4>, gemmlowp::MatrixMap<int, 
(gemmlowp::MapOrder)1>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 4> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 1, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 4> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 8, 4>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 8, 1> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 8, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 4, 1> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 4, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int, int, int) const Unexecuted instantiation: void gemmlowp::OutputPipelineExecutor<std::__1::tuple<>, gemmlowp::RegisterBlock<int, 1, 1> >::Execute<gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1> >(gemmlowp::RegisterBlock<int, 1, 1>, gemmlowp::MatrixMap<int, (gemmlowp::MapOrder)1>*, int, int, int, int) const |
564 | | |
565 | | const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType> |
566 | | output_pipeline_eval_impl_; |
567 | | }; |
568 | | |
569 | | } // namespace gemmlowp |
570 | | |
571 | | #ifdef GEMMLOWP_NEON |
572 | | #include "output_neon.h" |
573 | | #elif defined(GEMMLOWP_SSE4) |
574 | | #include "output_sse.h" |
575 | | #elif defined(GEMMLOWP_MSA) |
576 | | #include "output_msa.h" |
577 | | #endif |
578 | | |
579 | | #endif // GEMMLOWP_INTERNAL_OUTPUT_H_ |