aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/GLES_COMPUTE/cs_shaders/gemm.cs')
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/gemm.cs10
1 files changed, 5 insertions, 5 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index c81bed7066..580acc16a7 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -509,7 +509,7 @@ void main()
vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(2));
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(4));
TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
{
vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -549,7 +549,7 @@ void main()
vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+ vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
@@ -615,7 +615,7 @@ void main()
vec4 acc3 = vec4(0.0f);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
{
vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -729,7 +729,7 @@ void main()
acc[0] = vec4(0.0f);
acc[1] = vec4(0.0f);
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a - uint(16));
+ for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
{
vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
@@ -823,7 +823,7 @@ void main(void)
VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
vec4 u[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
- vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, bias_iter);
+ vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
vec4 r[2];
r[0] = u[0] + v[0];