-rw-r--r--  src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl | 24
-rw-r--r--  src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp       | 44
-rw-r--r--  src/core/CL/kernels/CLIm2ColKernel.cpp                              |  2
-rw-r--r--  tests/benchmark/fixtures/ConvolutionLayerFixture.h                  | 10
4 files changed, 40 insertions, 40 deletions
diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
index ae87420774..83da76785b 100644
--- a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
@@ -248,6 +248,12 @@ __kernel void direct_convolution_1x1_3x3_5x5_quantized(
}
#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#if defined(VEC_SIZE)
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define CONVERT_SAT_UCHAR_STR(x, size) (convert_uchar##size##_sat((x)))
+#define CONVERT_SAT_UCHAR(x, size) CONVERT_SAT_UCHAR_STR(x, size)
+
/** This function computes the output stage of a depthwise convolution.
*
* @param[in] src_ptr Pointer to the source image. Supported data types: QASYMM8
@@ -274,7 +280,6 @@ __kernel void direct_convolution_1x1_3x3_5x5_quantized(
* @param[in] output_multiplier Output scale multiplier
* @param[in] output_shift Output scale divisor exponent
*/
-
__kernel void output_stage_quantized(
TENSOR3D_DECLARATION(src),
TENSOR3D_DECLARATION(dst),
@@ -292,22 +297,29 @@ __kernel void output_stage_quantized(
#endif //defined(HAS_BIAS)
// Load input
- int16 vals = vload16(0, (__global int *)(src.ptr));
+ VEC_INT vals = VLOAD(VEC_SIZE)(0, (__global int *)(src.ptr));
#if defined(HAS_BIAS)
// Load and add bias
#if defined(NCHW)
int bias_value = *((__global int *)(vector_offset(&bias, get_global_id(2))));
#else // defined(NCHW)
- int16 bias_value = vload16(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * 16))));
+ VEC_INT bias_value = VLOAD(VEC_SIZE)(0, ((__global int *)(vector_offset(&bias, get_global_id(0) * VEC_SIZE))));
#endif // defined(NCHW)
- vals += (int16)(bias_value);
+ vals += (VEC_INT)(bias_value);
#endif //defined(HAS_BIAS)
- vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, 16);
+ vals = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(vals, output_multiplier, output_shift, VEC_SIZE);
vals = vals + output_offset;
// Store result in dst
- vstore16(convert_uchar16_sat(vals), 0, (__global uchar *)dst.ptr);
+ VSTORE(VEC_SIZE)
+ (CONVERT_SAT_UCHAR(vals, VEC_SIZE), 0, (__global uchar *)dst.ptr);
}
+
+#undef VEC_INT
+#undef CONVERT_SAT_UCHAR_STR
+#undef CONVERT_SAT_UCHAR
+
+#endif // defined(VEC_SIZE)
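Note (not part of the patch): the CONVERT_SAT_UCHAR_STR / CONVERT_SAT_UCHAR pair is the usual two-level preprocessor indirection that lets VEC_SIZE expand to its numeric value before the ## pasting; pasting directly would produce convert_ucharVEC_SIZE_sat. A minimal sketch of what the parameterised body reduces to, assuming the host side passes -DVEC_SIZE=4 for the S32 input (see the CLDirectConvolutionOutputStageKernel.cpp hunk below):

// Illustrative expansion only, with -DVEC_SIZE=4:
//   VEC_INT                      -> int4
//   VLOAD(VEC_SIZE)(0, p)        -> vload4(0, p)
//   CONVERT_SAT_UCHAR(vals, 4)   -> convert_uchar4_sat((vals))
//   VSTORE(VEC_SIZE)(v, 0, p)    -> vstore4(v, 0, p)
// so the store line becomes:
//   vstore4(convert_uchar4_sat(vals), 0, (__global uchar *)dst.ptr);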
diff --git a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
index 3d9d520841..4e2352cf6e 100644
--- a/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionOutputStageKernel.cpp
@@ -90,44 +90,29 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
bool window_changed = false;
unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
- // Update processed elements when input is S32 (comes from quantization input)
- if(input->data_type() == DataType::S32)
- {
- num_elems_processed_per_iteration = 16;
- }
-
// Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+ // Input window
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+ window_changed = window_changed || update_window_and_padding(win, input_access);
+
+ // Bias window
+ if(bias != nullptr)
+ {
+ AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->dimension(1));
+ window_changed = window_changed || update_window_and_padding(win, bias_access);
+ }
+ // Output window
if(output != nullptr && (output->total_size() != 0))
{
AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- if(bias == nullptr)
- {
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
- else
- {
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
- }
-
+ window_changed = window_changed || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
}
else
{
- if(bias == nullptr)
- {
- window_changed = update_window_and_padding(win, input_access);
- }
- else
- {
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = update_window_and_padding(win, input_access, bias_access);
- }
-
input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
}
@@ -165,10 +150,13 @@ void CLDirectConvolutionLayerOutputStageKernel::configure(ICLTensor *input, cons
_result_shift = result_shift;
_result_offset_after_shift = result_offset_after_shift;
+ const unsigned int num_elems_accessed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type());
+
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option_if(bias != nullptr, "-DHAS_BIAS");
build_opts.add_option("-D" + string_from_data_layout(input->info()->data_layout()));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_accessed_per_iteration));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("output_stage_quantized", build_opts.options()));
// Set static kernel arguments
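For reference, a worked example of the new build option, assuming element_size_from_data_type() returns the element size in bytes: the quantized output stage consumes S32 accumulators, so

// Illustrative arithmetic only:
//   num_elems_accessed_per_iteration = 16 / element_size_from_data_type(DataType::S32)
//                                    = 16 / 4
//                                    = 4
// giving "-DVEC_SIZE=4" (int4 loads and uchar4 stores in the kernel above),
// in place of the removed special case that forced 16 elements per iteration for S32.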
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 06ca005dd5..b1290b8edd 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -262,7 +262,7 @@ CLIm2ColKernel::configure_window(const ICLTensor *input, ICLTensor *output, cons
AccessWindowStatic input_access(input->info(),
-border.left,
-border.top,
- ceil_to_multiple(input_width + border.right, kernel_dims.width),
+ ceil_to_multiple(input_width + border.right, kernel_dims.width * _num_elems_processed_per_iteration),
input_height + border.bottom);
update_window_and_padding(win, input_access);
}
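A worked example of the widened access window, assuming ceil_to_multiple(v, m) rounds v up to the next multiple of m (the numbers below are illustrative, not taken from the library):

// e.g. kernel_dims.width = 3, _num_elems_processed_per_iteration = 4,
//      input_width + border.right = 13
//   before: ceil_to_multiple(13, 3)     = 15
//   after : ceil_to_multiple(13, 3 * 4) = 24
// i.e. the horizontal padding now also accounts for the number of elements each
// work item processes, presumably so that a full vectorized step stays within
// the padded region.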
diff --git a/tests/benchmark/fixtures/ConvolutionLayerFixture.h b/tests/benchmark/fixtures/ConvolutionLayerFixture.h
index 338a02162d..b23c3457ab 100644
--- a/tests/benchmark/fixtures/ConvolutionLayerFixture.h
+++ b/tests/benchmark/fixtures/ConvolutionLayerFixture.h
@@ -46,16 +46,16 @@ public:
int batches)
{
// Set batched in source and destination shapes
-
src_shape.set(3 /* batch */, batches);
dst_shape.set(3 /* batch */, batches);
- DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+ DataType bias_data_type = is_data_type_quantized_asymmetric(data_type) ? DataType::S32 : data_type;
+ const QuantizationInfo qinfo(2.f / 255.f, 127);
// Create tensors
- src = create_tensor<TensorType>(src_shape, data_type, 1);
- weights = create_tensor<TensorType>(weights_shape, data_type, 1);
+ src = create_tensor<TensorType>(src_shape, data_type, 1, qinfo);
+ weights = create_tensor<TensorType>(weights_shape, data_type, 1, qinfo);
biases = create_tensor<TensorType>(biases_shape, bias_data_type, 1);
- dst = create_tensor<TensorType>(dst_shape, data_type, 1);
+ dst = create_tensor<TensorType>(dst_shape, data_type, 1, qinfo);
// Create and configure function
conv_layer.configure(&src, &weights, &biases, &dst, info, WeightsInfo(), dilation, act_info);
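A short note on the fixture's quantization parameters, sketched with the usual QASYMM8 affine mapping real = scale * (q - offset):

// Illustrative only: QuantizationInfo(2.f / 255.f, 127) maps the uchar range
// [0, 255] onto roughly [-1, +1]:
//   real(0)   = (2/255) * (0   - 127) ~= -0.996
//   real(127) = (2/255) * (127 - 127) =   0.0
//   real(255) = (2/255) * (255 - 127) ~= +1.004
// so for quantized data types the benchmark now runs with an explicit, roughly
// symmetric range instead of a default-constructed QuantizationInfo.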