diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 9 | ||||
-rw-r--r-- | src/core/CL/OpenCL.cpp | 58 | ||||
-rw-r--r-- | src/runtime/CL/CLTensorAllocator.cpp | 89 |
3 files changed, 144 insertions, 12 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 45149c29b8..d815ac1afc 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -713,7 +713,6 @@ Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const Stri { ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); } - std::string concat_str; if(fp16_support(_device)) @@ -721,13 +720,13 @@ Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const Stri concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; } - if(non_uniform_workgroup_support(_device)) + if(get_cl_version(_device) == CLVersion::CL20) { - concat_str += " -cl-arm-non-uniform-work-group-size "; + concat_str += " -cl-std=CL2.0 "; } - else if(get_cl_version(_device) == CLVersion::CL20) + else if(non_uniform_workgroup_support(_device)) { - concat_str += " -cl-std=CL2.0 "; + concat_str += " -cl-arm-non-uniform-work-group-size "; } else { diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index f75a90a324..0ef800f265 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -111,6 +111,10 @@ bool CLSymbols::load(const std::string &library) LOAD_FUNCTION_PTR(clGetCommandQueueInfo, handle); LOAD_FUNCTION_PTR(clGetKernelInfo, handle); LOAD_FUNCTION_PTR(clGetEventProfilingInfo, handle); + LOAD_FUNCTION_PTR(clSVMAlloc, handle); + LOAD_FUNCTION_PTR(clSVMFree, handle); + LOAD_FUNCTION_PTR(clEnqueueSVMMap, handle); + LOAD_FUNCTION_PTR(clEnqueueSVMUnmap, handle); #undef LOAD_FUNCTION_PTR @@ -129,6 +133,60 @@ bool opencl_is_available() } } // namespace arm_compute +cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr, + size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr; + if(func != nullptr) + { + return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *event) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr; + if(func != nullptr) + { + return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event); + } + else + { + return CL_OUT_OF_RESOURCES; + } +} + +void *clSVMAlloc(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl_uint alignment) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr; + if(func != nullptr) + { + return func(context, flags, size, alignment); + } + else + { + return nullptr; + } +} + +void clSVMFree(cl_context context, void *svm_pointer) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clSVMFree_ptr; + if(func != nullptr) + { + func(context, svm_pointer); + } +} + cl_int clGetContextInfo(cl_context context, cl_context_info param_name, size_t param_value_size, diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index ad165fad7d..705c4edd60 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/CLTensorAllocator.h" #include "arm_compute/core/Error.h" +#include "arm_compute/core/Log.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLScheduler.h" @@ -31,7 +32,7 @@ using namespace arm_compute; CLTensorAllocator::CLTensorAllocator(CLTensor *owner) - : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner) + : _associated_memory_group(nullptr), _buffer(), _mapping(nullptr), _owner(owner), _svm_memory() { } @@ -50,12 +51,47 @@ const cl::Buffer &CLTensorAllocator::cl_data() const return _buffer; } +void *SVMMemory::allocate(cl_context context, size_t size, cl_svm_mem_flags flags, cl_uint alignment) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(context); + ARM_COMPUTE_ERROR_ON(size == 0); + ARM_COMPUTE_ERROR_ON(_ptr != nullptr); + ARM_COMPUTE_ERROR_ON(size > CL_DEVICE_MAX_MEM_ALLOC_SIZE); + _ptr = clSVMAlloc(context, flags, size, alignment); + if(_ptr == nullptr) + { + ARM_COMPUTE_LOG_INFO_MSG_CORE("Call to clSVMAlloc() failed."); + } + else + { + _size = size; + _fine_grain = static_cast<bool>(flags & CL_MEM_SVM_FINE_GRAIN_BUFFER); + } + return _ptr; +} +void *CLTensorAllocator::svm_ptr() +{ + return _svm_memory.ptr(); +} + void CLTensorAllocator::allocate() { ARM_COMPUTE_ERROR_ON(_buffer.get() != nullptr); if(_associated_memory_group == nullptr) { - _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size()); + if(_svm_memory.allocate(CLScheduler::get().context()(), CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, info().total_size(), 0) == nullptr) + { + // try at coarse grain svm memory + _svm_memory.allocate(CLScheduler::get().context()(), CL_MEM_READ_WRITE, info().total_size(), 0); + } + if(_svm_memory.ptr() != nullptr) + { + _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, info().total_size(), _svm_memory.ptr()); + } + else + { + _buffer = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info().total_size()); + } } else { @@ -69,6 +105,10 @@ void CLTensorAllocator::free() if(_associated_memory_group == nullptr) { _buffer = cl::Buffer(); + if(_svm_memory.ptr() != nullptr) + { + clSVMFree(CLScheduler::get().context()(), _svm_memory.ptr()); + } info().set_is_resizable(true); } } @@ -97,12 +137,47 @@ void CLTensorAllocator::unlock() uint8_t *CLTensorAllocator::map(cl::CommandQueue &q, bool blocking) { - ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); - return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size())); + const bool svm_mem = _svm_memory.ptr() != nullptr; + const bool fine_grain_svm = _svm_memory.fine_grain(); + if(!svm_mem) + { + ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); + return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, info().total_size())); + } + else if(!fine_grain_svm) + { + const cl_int ret = clEnqueueSVMMap(q(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _svm_memory.ptr(), _svm_memory.size(), 0, nullptr, nullptr); + ARM_COMPUTE_ERROR_ON(ret != CL_SUCCESS); + if(ret == CL_SUCCESS) + { + return reinterpret_cast<uint8_t *>(_svm_memory.ptr()); + } + else + { + return nullptr; + } + } + else + { + if(blocking) + { + clFinish(q()); + } + return reinterpret_cast<uint8_t *>(_svm_memory.ptr()); + } } void CLTensorAllocator::unmap(cl::CommandQueue &q, uint8_t *mapping) { - ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); - q.enqueueUnmapMemObject(_buffer, mapping); + const bool svm_mem = _svm_memory.ptr() != nullptr; + const bool fine_grain_svm = _svm_memory.fine_grain(); + if(!svm_mem) + { + ARM_COMPUTE_ERROR_ON(_buffer.get() == nullptr); + q.enqueueUnmapMemObject(_buffer, mapping); + } + else if(!fine_grain_svm) + { + clEnqueueSVMUnmap(q(), _svm_memory.ptr(), 0, nullptr, nullptr); + } } |