diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 5d4bcc20..537ed1c0 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -2384,6 +2384,73 @@ clCreateBufferWithPropertiesINTEL_fn)(
     void * host_ptr,
     cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
 
+/**********************************
+* cl_intel_global_variable_access *
+***********************************/
+
+/* Command type identifiers introduced by the extension. */
+#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E
+#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F
+
+/* Read `size` bytes, starting `offset` bytes into the global variable `name`
+ * of `program`, into the host memory at `ptr`. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadGlobalVariableINTEL(
+    cl_command_queue command_queue,
+    cl_program program,
+    const char* name,
+    cl_bool blocking_read,
+    size_t size,
+    size_t offset,
+    void* ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_0;
+
+
+/* Function pointer type matching clEnqueueReadGlobalVariableINTEL. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+clEnqueueReadGlobalVariableINTEL_fn)(
+    cl_command_queue command_queue,
+    cl_program program,
+    const char* name,
+    cl_bool blocking_read,
+    size_t size,
+    size_t offset,
+    void* ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_0;
+
+/* Write `size` bytes from the host memory at `ptr` into the global variable
+ * `name` of `program`, starting `offset` bytes into the variable. */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteGlobalVariableINTEL(
+    cl_command_queue command_queue,
+    cl_program program,
+    const char* name,
+    cl_bool blocking_write,
+    size_t size,
+    size_t offset,
+    const void* ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_0;
+
+/* Function pointer type matching clEnqueueWriteGlobalVariableINTEL. */
+typedef CL_API_ENTRY cl_int (CL_API_CALL *
+clEnqueueWriteGlobalVariableINTEL_fn)(
+    cl_command_queue command_queue,
+    cl_program program,
+    const char* name,
+    cl_bool blocking_write,
+    size_t size,
+    size_t offset,
+    const void* ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list,
+    cl_event* event) CL_API_SUFFIX__VERSION_1_0;
+
 /******************************************
 * cl_intel_mem_channel_property extension *
*******************************************/
diff --git a/include/acl.h b/include/acl.h
index 45bcea7b..6ca056a0 100644
--- a/include/acl.h
+++ b/include/acl.h
@@ -553,6 +553,7 @@ typedef struct acl_device_def_autodiscovery_t {
   std::vector acl_hostpipe_info;
 
   // Device global definition.
+  unsigned int num_device_global = 0;
   std::unordered_map device_global_mem_defs;
 
   bool cra_ring_root_exist =
diff --git a/include/acl_kernel.h b/include/acl_kernel.h
index 4b6d9979..b97c0c16 100644
--- a/include/acl_kernel.h
+++ b/include/acl_kernel.h
@@ -61,6 +61,12 @@ int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration);
 // currently loaded program.
 bool acl_device_has_reprogram_device_globals(cl_device_id device);
 
+// Set a raw device address (e.g. a device global address) as pointer argument
+// of a kernel, without checking that it belongs to the kernel's context.
+cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
+                                                 cl_uint arg_index,
+                                                 void *arg_value);
+
 #if defined(__cplusplus)
 } /* extern "C" */
 #endif
diff --git a/include/acl_mem.h b/include/acl_mem.h
index c0b21e77..7a898ddc 100644
--- a/include/acl_mem.h
+++ b/include/acl_mem.h
@@ -93,6 +93,18 @@ cl_bool acl_is_sub_or_parent_buffer(cl_mem mem);
 void CL_CALLBACK acl_free_allocation_after_event_completion(
     cl_event event, cl_int event_command_exec_status, void *callback_data);
 
+// Event callback that releases the resources of a device global copy; see the
+// definition in acl_mem.cpp for the layout of callback_data.
+void CL_CALLBACK acl_dev_global_cleanup(cl_event event,
+                                        cl_int event_command_exec_status,
+                                        void *callback_data);
+
+// Look up the address of the device global named dev_global_name and store
+// it in ret_addr.
+cl_int acl_extract_device_global_address(cl_kernel kernel,
+                                         const char *dev_global_name,
+                                         unsigned int *ret_addr);
+
 #ifdef __GNUC__
 #pragma GCC visibility pop
 #endif
diff --git a/src/acl_icd_dispatch.cpp b/src/acl_icd_dispatch.cpp
index 18de2b3a..c4b77b49 100644
--- a/src/acl_icd_dispatch.cpp
+++ b/src/acl_icd_dispatch.cpp
@@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) {
   ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA);
   ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA);
   ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL);
+  ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL);
+
ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL);
 
   // USM APIs are not currently supported on 32bit devices
 #ifndef __arm__
diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp
index 1a1c0987..4226e42b 100644
--- a/src/acl_kernel.cpp
+++ b/src/acl_kernel.cpp
@@ -831,6 +831,56 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer(
   return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value);
 }
 
+/**
+ * Set any provided void pointer as a kernel argument
+ *
+ * It is assumed that the provided pointer is a valid device address, or a
+ * device global address, that the kernel can use as is.
+ *
+ * It is the same as `clSetKernelArgMemPointerINTEL` except the validity checks
+ * are removed. This is because the user provided pointer may not always be a
+ * usm pointer and therefore may not belong to the context (which is what
+ * clSetKernelArgMemPointerINTEL checks).
+ *
+ * @param kernel the kernel that accepts the pointer argument
+ * @param arg_index index of the kernel argument that receives the value
+ * @param arg_value the pointer value to store in the kernel argument
+ * @return status code, CL_SUCCESS if all operations are successful.
+ */
+cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel,
+                                                 cl_uint arg_index,
+                                                 void *arg_value) {
+  std::scoped_lock lock{acl_mutex_wrapper};
+  if (!acl_kernel_is_valid(kernel)) {
+    return (CL_INVALID_KERNEL);
+  }
+
+  cl_context context = kernel->program->context;
+
+  if (arg_index >= kernel->accel_def->iface.args.size()) {
+    ERR_RET(CL_INVALID_ARG_INDEX, context, "Argument index is too large");
+  }
+
+  // Determine where to write the value.
+  size_t start_idx = 0;
+  size_t iface_arg_size = 0;
+  l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size);
+  safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size,
+              kernel->arg_value_size - start_idx, iface_arg_size);
+  kernel->arg_is_svm[arg_index] = CL_FALSE;
+  kernel->arg_is_ptr[arg_index] = CL_TRUE;
+
+  kernel->arg_defined[arg_index] = 1;
+
+  // Make arg_index a valid slot; unlike doubling, this also ends when empty
+  if (kernel->ptr_arg_vector.size() <= arg_index) {
+    kernel->ptr_arg_vector.resize(static_cast<size_t>(arg_index) + 1);
+  }
+  kernel->ptr_arg_vector[arg_index] = arg_value;
+
+  return (CL_SUCCESS);
+}
+
 ACL_EXPORT
 CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL(
     cl_kernel kernel, cl_uint arg_index, const void *arg_value) {
@@ -3193,10 +3243,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status) {
   acl_device_op_queue_t *doq = &(acl_platform.device_op_queue);
 
   // This function can potentially be called by a HAL that does not use the
-  // ACL global lock, so we need to use acl_lock() instead of
-  // acl_assert_locked(). However, the MMD HAL calls this function from a unix
-  // signal handler, which can't lock mutexes, so we don't lock in that case.
-  // All functions called from this one therefore have to use
+  // ACL global lock, so we need to acquire acl_mutex_wrapper ourselves instead
+  // of using acl_assert_locked(). However, the MMD HAL calls this function
+  // from a unix signal handler, which can't lock mutexes, so we don't lock in
+  // that case. All functions called from this one therefore have to use
   // acl_assert_locked_or_sig() instead of just acl_assert_locked().
std::unique_lock lock{acl_mutex_wrapper, std::defer_lock};
   if (!acl_is_inside_sig()) {
diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp
index 52979348..1d36dad1 100644
--- a/src/acl_mem.cpp
+++ b/src/acl_mem.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -408,6 +409,437 @@ int acl_bind_buffer_to_device(cl_device_id device, cl_mem mem) {
   return 1;
 }
 
+/**
+ * Release the resources held by one device global copy operation
+ *
+ * Any of the resources may be NULL, in which case it is skipped. Every
+ * release is attempted even if an earlier one fails.
+ *
+ * @param context the context the usm allocation was made in
+ * @param usm_dev_ptr temporary device usm allocation to free
+ * @param kernel copy kernel to release
+ * @param tmp_event intermediate event to release
+ * @return CL_SUCCESS, or the status of the first release that failed.
+ */
+static cl_int l_release_dev_global_resources(cl_context context,
+                                             void *usm_dev_ptr,
+                                             cl_kernel kernel,
+                                             cl_event tmp_event) {
+  cl_int first_error = CL_SUCCESS;
+  const auto record = [&first_error](cl_int status) {
+    if (first_error == CL_SUCCESS) {
+      first_error = status;
+    }
+  };
+  if (usm_dev_ptr) {
+    record(clMemFreeINTEL(context, usm_dev_ptr));
+  }
+  if (kernel) {
+    record(clReleaseKernel(kernel));
+  }
+  if (tmp_event) {
+    record(clReleaseEvent(tmp_event));
+  }
+  return first_error;
+}
+
+/**
+ * Release the resources of a device global copy once an event has completed
+ *
+ * Registers acl_dev_global_cleanup on the event. If the callback cannot be
+ * registered, waits for the event and releases the resources directly, so
+ * that nothing is freed while an enqueued command may still be using it.
+ *
+ * @param completion_event event after which the resources are unused
+ * @param usm_dev_ptr temporary device usm allocation to free, may be NULL
+ * @param kernel copy kernel to release, may be NULL
+ * @param tmp_event intermediate event to release, may be NULL
+ */
+static void l_release_dev_global_resources_after(cl_event completion_event,
+                                                 void *usm_dev_ptr,
+                                                 cl_kernel kernel,
+                                                 cl_event tmp_event) {
+  void **callback_data =
+      static_cast<void **>(acl_malloc(sizeof(void *) * 3));
+  if (callback_data) {
+    callback_data[0] = usm_dev_ptr;
+    callback_data[1] = kernel;
+    callback_data[2] = tmp_event;
+    if (clSetEventCallback(completion_event, CL_COMPLETE,
+                           acl_dev_global_cleanup,
+                           callback_data) == CL_SUCCESS) {
+      return; // acl_dev_global_cleanup now owns callback_data
+    }
+    acl_free(callback_data);
+  }
+  // Could not defer the cleanup, so do it synchronously instead.
+  clWaitForEvents(1, &completion_event);
+  l_release_dev_global_resources(completion_event->context, usm_dev_ptr,
+                                 kernel, tmp_event);
+}
+
+/**
+ * Compute the device address of a location inside a device global
+ *
+ * @param kernel the copy kernel, used to look up the device global
+ * @param name name of the device global
+ * @param offset offset in bytes from the start of the device global
+ * @param dev_global_ptr receives the resulting device address
+ * @return status code, CL_SUCCESS if all operations are successful.
+ */
+static cl_int l_get_dev_global_ptr(cl_kernel kernel, const char *name,
+                                   size_t offset, void **dev_global_ptr) {
+  unsigned int device_global_addr_num = 0;
+  const cl_int status =
+      acl_extract_device_global_address(kernel, name, &device_global_addr_num);
+  if (status != CL_SUCCESS) {
+    return status;
+  }
+  // The address is handed to the kernel as a pointer and the offset is in
+  // bytes, so the offset is added as is (it must not be scaled to bits).
+  const uintptr_t base_address = device_global_addr_num;
+  const uintptr_t target_address = base_address + offset;
+  if (target_address < base_address) {
+    return CL_INVALID_VALUE; // offset wraps around the address space
+  }
+  // TODO: add checks for whether the copy will be out of bound for device
+  // global
+  *dev_global_ptr = reinterpret_cast<void *>(target_address);
+  return CL_SUCCESS;
+}
+
+/**
+ * Read bytes of data from device global
+ *
+ * This function creates the copy kernel from the given program, looks up the
+ * device global address with the given name and creates a temporary device usm
+ * pointer to hold the data that will be copied from the device global. It
+ * launches the copy kernel with the device global as source and the temporary
+ * pointer as destination, then enqueues a memory copy operation that depends
+ * on the copy kernel to copy from the temporary device usm pointer into the
+ * user provided host pointer. The temporary event, kernel and memory are
+ * released once the memory copy has completed.
+ *
+ * @param command_queue the queue system this copy kernel will belong
+ * @param program contains copy kernel
+ * @param name name of device global, used to look up for device global address
+ * in autodiscovery string
+ * @param blocking_read whether the operation is blocking or not
+ * @param size number of bytes to read
+ * @param offset offset in bytes from the extracted address of device global
+ * @param ptr pointer that will hold the data copied from device global
+ * @param num_events_in_wait_list number of event that copy kernel depend on
+ * @param event_wait_list events that copy kernel depend on
+ * @param event if not NULL, receives the event of the final memory copy into
+ * ptr (not the event of the copy kernel)
+ * @return status code, CL_SUCCESS if all operations are successful.
+ */
+ACL_EXPORT
+CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadGlobalVariableINTEL(
+    cl_command_queue command_queue, cl_program program, const char *name,
+    cl_bool blocking_read, size_t size, size_t offset, void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event) {
+  if (!command_queue) {
+    return CL_INVALID_COMMAND_QUEUE;
+  }
+  if (!name || !ptr) {
+    return CL_INVALID_VALUE;
+  }
+
+  // Get copy kernel from the program
+  cl_int status = CL_SUCCESS;
+  cl_kernel kernel =
+      clCreateKernelIntelFPGA(program, "copy_device_global", &status);
+  if (status != CL_SUCCESS) {
+    return status;
+  }
+  assert(kernel);
+
+  cl_context context = command_queue->context;
+  void *tmp_dev_ptr = NULL;
+  // Error exit used while no command has been enqueued yet: nothing can be
+  // using the kernel or the temporary memory, so release them right away.
+  const auto fail = [&](cl_int error) {
+    l_release_dev_global_resources(context, tmp_dev_ptr, kernel, NULL);
+    return error;
+  };
+
+  // Source of the copy (kernel arg 0) is the device global
+  void *dev_global_ptr = NULL;
+  status = l_get_dev_global_ptr(kernel, name, offset, &dev_global_ptr);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+  status = set_kernel_arg_mem_pointer_without_checks(kernel, 0, dev_global_ptr);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Destination of the copy (kernel arg 1) is a temporary device usm pointer;
+  // copying device to device first minimizes area cost
+  tmp_dev_ptr = clDeviceMemAllocINTEL(context, command_queue->device, NULL,
+                                      size, 1, &status);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+  if (!tmp_dev_ptr) {
+    return fail(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+  }
+  status = clSetKernelArgMemPointerINTEL(kernel, 1, tmp_dev_ptr);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Propagate size information to kernel
+  status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size));
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Enqueue copy kernel execution
+  cl_event copy_event = 0;
+  status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list,
+                         event_wait_list, &copy_event);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Copy from the temporary device memory into user provided host pointer.
+  // The event of this copy is the one given back to the user, not the copy
+  // kernel one.
+  cl_event memcpy_event = 0;
+  status = clEnqueueMemcpyINTEL(command_queue, blocking_read, ptr, tmp_dev_ptr,
+                                size, 1, &copy_event, &memcpy_event);
+  if (status != CL_SUCCESS) {
+    // The copy kernel is already enqueued, so its resources have to stay
+    // alive until it has finished.
+    l_release_dev_global_resources_after(copy_event, tmp_dev_ptr, kernel,
+                                         NULL);
+    clReleaseEvent(copy_event);
+    return status;
+  }
+
+  if (blocking_read) {
+    // The memory copy, and therefore the copy kernel, has already completed.
+    status = l_release_dev_global_resources(context, tmp_dev_ptr, kernel,
+                                            copy_event);
+  } else {
+    // Clean up resources after the memory copy finishes
+    l_release_dev_global_resources_after(memcpy_event, tmp_dev_ptr, kernel,
+                                         copy_event);
+  }
+
+  if (event) {
+    *event = memcpy_event;
+  } else {
+    clReleaseEvent(memcpy_event);
+  }
+  return status;
+}
+
+/**
+ * Write bytes of data from user provided host pointer into device global
+ *
+ * This function creates the copy kernel from the given program and a
+ * temporary device usm pointer that is filled with the user provided data
+ * through a blocking usm copy operation. It then looks up the device global
+ * address with the given name and launches the copy kernel with the temporary
+ * pointer as source and the device global as destination. The kernel and the
+ * temporary memory are released once the copy kernel has completed.
+ *
+ * @param command_queue the queue system this copy kernel will belong
+ * @param program contains copy kernel
+ * @param name name of device global, used to look up for device global address
+ * in autodiscovery string
+ * @param blocking_write whether the operation is blocking or not
+ * @param size number of bytes to write
+ * @param offset offset in bytes from the extracted address of device global
+ * @param ptr pointer to the host data that is copied into the device global
+ * @param num_events_in_wait_list number of event that copy kernel depend on
+ * @param event_wait_list events that copy kernel depend on
+ * @param event if not NULL, receives the event of the copy kernel execution
+ * @return status code, CL_SUCCESS if all operations are successful.
+ */
+ACL_EXPORT
+CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteGlobalVariableINTEL(
+    cl_command_queue command_queue, cl_program program, const char *name,
+    cl_bool blocking_write, size_t size, size_t offset, const void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event) {
+  if (!command_queue) {
+    return CL_INVALID_COMMAND_QUEUE;
+  }
+  if (!name || !ptr) {
+    return CL_INVALID_VALUE;
+  }
+
+  // Get copy kernel from the program
+  cl_int status = CL_SUCCESS;
+  cl_kernel kernel =
+      clCreateKernelIntelFPGA(program, "copy_device_global", &status);
+  if (status != CL_SUCCESS) {
+    return status;
+  }
+  assert(kernel);
+
+  cl_context context = command_queue->context;
+  void *src_dev_ptr = NULL;
+  // Error exit used while the copy kernel has not been enqueued yet: nothing
+  // can be using the kernel or the temporary memory, so release them right
+  // away.
+  const auto fail = [&](cl_int error) {
+    l_release_dev_global_resources(context, src_dev_ptr, kernel, NULL);
+    return error;
+  };
+
+  // Allocate a temporary device usm pointer to hold user data
+  // This is done to minimize kernel area, as device to device memory
+  // interconnect occupies least amount of memory
+  src_dev_ptr = clDeviceMemAllocINTEL(context, command_queue->device, NULL,
+                                      size, 1, &status);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+  if (!src_dev_ptr) {
+    return fail(CL_MEM_OBJECT_ALLOCATION_FAILURE);
+  }
+
+  // Copy from user provided host pointer to temporary device usm pointer.
+  // The copy is blocking, so it has finished by the time it returns.
+  status = clEnqueueMemcpyINTEL(command_queue, CL_TRUE, src_dev_ptr, ptr, size,
+                                0, NULL, NULL);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Source of the copy (kernel arg 0) is the temporary device usm pointer
+  status = clSetKernelArgMemPointerINTEL(kernel, 0, src_dev_ptr);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Destination of the copy (kernel arg 1) is the device global
+  void *dev_global_ptr = NULL;
+  status = l_get_dev_global_ptr(kernel, name, offset, &dev_global_ptr);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+  status = set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Propagate size information to kernel
+  status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size));
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  // Enqueue kernel execution
+  cl_event write_event = 0;
+  status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list,
+                         event_wait_list, &write_event);
+  if (status != CL_SUCCESS) {
+    return fail(status);
+  }
+
+  std::scoped_lock lock{acl_mutex_wrapper};
+  // If nothing's blocking, then complete right away
+  acl_idle_update(context);
+
+  if (blocking_write) {
+    // Wait for the copy kernel to finish, then clean up the resources. They
+    // are released even if the kernel failed, since it no longer runs then.
+    const cl_int wait_status = clWaitForEvents(1, &write_event);
+    status = l_release_dev_global_resources(context, src_dev_ptr, kernel, NULL);
+    if (wait_status != CL_SUCCESS) {
+      status = wait_status;
+    }
+  } else {
+    // Otherwise, clean up resources after the copy kernel finishes
+    l_release_dev_global_resources_after(write_event, src_dev_ptr, kernel,
+                                         NULL);
+  }
+
+  if (event) {
+    *event = write_event;
+  } else {
+    clReleaseEvent(write_event);
+  }
+  return status;
+}
+
+/**
+ * Look up device global address in autodiscovery def using user provided name
+ *
+ * @param kernel the copy kernel
+ * @param dev_global_name name of device global to query
+ * @param ret_addr hold the resulting address of device global
+ * @return CL_SUCCESS if the device global was found, CL_INVALID_VALUE if an
+ * argument is NULL or no device global has the given name.
+ */
+cl_int acl_extract_device_global_address(cl_kernel kernel,
+                                         const char *dev_global_name,
+                                         unsigned int *ret_addr) {
+  if (!kernel || !dev_global_name || !ret_addr) {
+    return (CL_INVALID_VALUE);
+  }
+  std::scoped_lock lock{acl_mutex_wrapper};
+
+  // Stores the address of the named device global in ret_addr if the given
+  // definitions contain it. The definitions are only referenced, not copied.
+  const auto lookup = [&](const auto &dev_global_defs) {
+    const auto dev_global = dev_global_defs.find(dev_global_name);
+    if (dev_global == dev_global_defs.end()) {
+      return false;
+    }
+    *ret_addr = dev_global->second.address;
+    return true;
+  };
+
+  // In full system flow, the autodiscovery string is available through
+  // kernel's dev_bin, however it is not in unit testing framework. Thus, try
+  // to find device global definition first on kernel's device bin, if not
+  // found then try on acl_present_board_def(), which is set in unit test
+  // setup steps.
+  if (lookup(kernel->dev_bin->get_devdef()
+                 .autodiscovery_def.device_global_mem_defs) ||
+      lookup(acl_present_board_def()
+                 ->device[0]
+                 .autodiscovery_def.device_global_mem_defs)) {
+    return (CL_SUCCESS);
+  }
+  return (CL_INVALID_VALUE);
+}
+
+/**
+ * Callback function that executes after copy event finishes
+ *
+ * Mainly responsible for freeing heap memory and device memory
+ *
+ * @param event the event that this callback is registered on
+ * @param event_command_exec_status the status of event (queue, complete etc)
+ * @param callback_data array of three pointers that is freed by this
+ * function: [0] usm device pointer to free, [1] kernel to release,
+ * [2] temporary event to release. Entries that are NULL are skipped.
+ * @return nothing
+ */
+void CL_CALLBACK acl_dev_global_cleanup(cl_event event,
+                                        cl_int event_command_exec_status,
+                                        void *callback_data) {
+  (void)event_command_exec_status; // Avoiding Windows warning.
+  if (!callback_data) {
+    return;
+  }
+  void **callback_ptrs = static_cast<void **>(callback_data);
+  std::scoped_lock lock{acl_mutex_wrapper};
+  l_release_dev_global_resources(event->context, callback_ptrs[0],
+                                 static_cast<cl_kernel>(callback_ptrs[1]),
+                                 static_cast<cl_event>(callback_ptrs[2]));
+  acl_free(callback_data);
+}
+
 ACL_EXPORT
 CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL(
     cl_context context, const cl_mem_properties_intel *properties,
diff --git a/src/acl_platform.cpp b/src/acl_platform.cpp
index 514c8642..111683bd 100644
--- a/src/acl_platform.cpp
+++ b/src/acl_platform.cpp
@@ -875,10 +875,10 @@ void acl_receive_device_exception(unsigned physical_device_id,
                                   CL_EXCEPTION_TYPE_INTEL exception_type,
                                   void *user_private_info, size_t user_cb) {
   // This function can potentially be called by a HAL that does not use the
-  // ACL global lock, so we need to use acl_lock() instead of
-  // acl_assert_locked(). However, the MMD HAL calls this function from a unix
-  // signal handler, which can't lock mutexes, so we don't lock in that case.
-  // All functions called from this one therefore have to use
+  // ACL global lock, so we need to use std::scoped_lock lock{acl_mutex_wrapper}
+  // instead of acl_assert_locked(). However, the MMD HAL calls this function
+  // from a unix signal handler, which can't lock mutexes, so we don't lock in
+  // that case. All functions called from this one therefore have to use
   // acl_assert_locked_or_sig() instead of just acl_assert_locked().
 
CL_EXCEPTION_TYPE_INTEL current_exception, listen_mask; diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index 00b65c7a..c23cc71f 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -118,6 +118,13 @@ static acl_kernel_interface_t acltest_kernels[] = { {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 8}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 16}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1024}, + }}, + {// interface + "copy_device_global", + { + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, + {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0}, }}}; template @@ -191,6 +198,24 @@ static std::vector acltest_complex_system_device0_accel = { {}, {32768, 0, 0}, 1}, + { + 14, // id + ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), // mem + acltest_kernels[14], // iface + acltest_laspace_info, // local_aspaces + {0, 0, 0}, // compile_work_group_size + 0, // is_workgroup_invariant + 0, // is_workitem_invariant + 1, // num_vector_lanes + 0, // profiling_words_to_readback + 32768, // max_work_group_size + 3, // max_global_work_dim + {}, // printf_format_info + {32768, 0, 0}, // max_work_group_size_arr + 1, // uses_global_work_offset + 0, // fast_launch_depth + 1, // is_sycl_compile + }, {1, ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]), acltest_kernels[1], @@ -565,22 +590,33 @@ static acl_system_def_t acltest_complex_system = { 1, 0, /* alloc capabilities */ 0, /* min_host_mem_alignment */ - {"fpga0", - "sample40byterandomhash000000000000000000", - 0, - acltest_complex_system_device0_accel, /* accel */ - {}, /* hal_info */ - 1, // number of global memory systems - { - /* global mem info array */ - { - /* global mem info for memory 0 */ - /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), - /* acl_system_global_mem_type_t */ ACL_GLOBAL_MEM_DEVICE_PRIVATE, - /* num_global_bank */ 2, - /* 
burst_interleaved */ 1, - }, - }}}, + { + "fpga0", + "sample40byterandomhash000000000000000000", + 0, + acltest_complex_system_device0_accel, /* accel */ + {}, /* hal_info */ + 1, // number of global memory systems + { + /* global mem info array */ + { + /* global mem info for memory 0 */ + /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), + /* acl_system_global_mem_type_t */ + ACL_GLOBAL_MEM_DEVICE_PRIVATE, + /* num_global_bank */ 2, + /* burst_interleaved */ 1, + }, + }, + {}, // acl_hostpipe_info + 1, // num_device_global + { + // device_global_mem_defs map + {"dev_global_name", + {0x1024, 2048, ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT, + ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT, 0}}, + }, + }}, {nullptr, 1, 1, @@ -590,22 +626,33 @@ static acl_system_def_t acltest_complex_system = { 0, 0, /* alloc capabilities */ 0, /* min_host_mem_alignment */ - {"fpga1", - "sample40byterandomhash000000000000000001", - 0, - acltest_complex_system_device1_accel, /* accel */ - {}, /* hal_info */ - 1, // number of global memory systems - { - /* global mem info array */ - { - /* global mem info for memory 0 */ - /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), - /* acl_system_global_mem_type_t */ ACL_GLOBAL_MEM_DEVICE_PRIVATE, - /* num_global_bank */ 2, - /* burst_interleaved */ 1, - }, - }}}, + { + "fpga1", + "sample40byterandomhash000000000000000001", + 0, + acltest_complex_system_device1_accel, /* accel */ + {}, /* hal_info */ + 1, // number of global memory systems + { + /* global mem info array */ + { + /* global mem info for memory 0 */ + /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), + /* acl_system_global_mem_type_t */ + ACL_GLOBAL_MEM_DEVICE_PRIVATE, + /* num_global_bank */ 2, + /* burst_interleaved */ 1, + }, + }, + {}, // acl_hostpipe_info + 1, // num_device_global + { + // device_global_mem_defs map + {"dev_global_name", + {0x1024, 2048, ACL_DEVICE_GLOBAL_HOST_ACCESS_TYPE_COUNT, + ACL_DEVICE_GLOBAL_INIT_MODE_TYPE_COUNT, 0}}, + }, + }}, {nullptr, 2, 1, 
diff --git a/test/acl_program_test.cpp b/test/acl_program_test.cpp index 93f8c96a..f67e7330 100644 --- a/test/acl_program_test.cpp +++ b/test/acl_program_test.cpp @@ -606,25 +606,25 @@ MT_TEST(acl_program, program_info) { // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, 0)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); // This won't happen if program is built with binary since we set the program // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(321, size_ret); + CHECK_EQUAL(340, size_ret); CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0)); // after building the program CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, 0)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, &size_ret)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); CHECK_EQUAL(sizeof(size_t), size_ret); CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, 0, @@ -633,15 +633,18 @@ MT_TEST(acl_program, program_info) { CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(321, size_ret); + CHECK_EQUAL(340, size_ret); + // CHECK_EQUAL(321, size_ret); names[size_ret] = 100; // making sure extra bytes of memory are not affected. 
CHECK_EQUAL(CL_SUCCESS,
               clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES,
                                2000 * sizeof(char), names, &size_ret));
-  CHECK_EQUAL(321, size_ret); // only one kernel named: "foo"
+  CHECK_EQUAL(340, size_ret); // size of the ';'-separated kernel name list
   CHECK_EQUAL(100, names[size_ret]);
-  CHECK_EQUAL(0, strcmp("kernel0_copy_vecin_vecout;"
+  CHECK_EQUAL(0, strcmp("copy_device_global;" // TODO: eventually we would want
+                                               // to hide it from users
+                        "kernel0_copy_vecin_vecout;"
                         "kernel11_task_double;"
                         "kernel12_task_double;"
                         "kernel13_multi_vec_lane;"
diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp
index 0f780970..7dd202e6 100644
--- a/test/acl_usm_test.cpp
+++ b/test/acl_usm_test.cpp
@@ -41,25 +41,30 @@ MT_TEST_GROUP(acl_usm) {
   void setup() {
     if (threadNum() == 0) {
       acl_test_setup_generic_system();
-    }
+      this->load();
+      m_program = this->load_program();
+      this->build(m_program);
 
+    }
     syncThreads();
-
-    this->load();
   }
 
   void teardown() {
-    unload_context();
-    if (m_context) {
-      clReleaseContext(m_context);
-      m_context = 0;
-    }
     syncThreads();
 
     if (threadNum() == 0) {
+      this->unload_program(m_program);
+      unload_context();
+
+      if (m_context) {
+        clReleaseContext(m_context);
+        m_context = 0;
+      }
+
       acl_test_teardown_generic_system();
     }
 
+    syncThreads();
     acl_test_run_standard_teardown_checks();
   }
 
@@ -108,6 +113,38 @@ MT_TEST_GROUP(acl_usm) {
     ACL_LOCKED(CHECK(acl_context_is_valid(m_context)));
   }
 
+  cl_program load_program() {
+    cl_int status;
+    cl_program program;
+    status = CL_INVALID_VALUE;
+    const unsigned char *bin = (const unsigned char *)"0";
+    size_t bin_length = 1;
+    cl_int bin_status;
+    program = clCreateProgramWithBinary(m_context, 1, &m_device[0], &bin_length,
+                                        &bin, &bin_status, &status);
+    CHECK_EQUAL(CL_SUCCESS, status);
+    CHECK(program);
+    ACL_LOCKED(CHECK(acl_program_is_valid(program)));
+
+    return program;
+  }
+  void unload_program(cl_program program) {
+    int program_is_valid;
+    ACL_LOCKED(program_is_valid = acl_program_is_valid(program));
+    if (program_is_valid) {
+      cl_int status;
+      while (program->num_kernels) {
+        clReleaseKernel(program->kernel_list->kernel);
+      }
+      CHECK_EQUAL(1, acl_ref_count(program));
+      status = clReleaseProgram(program);
+      CHECK_EQUAL(CL_SUCCESS, status);
+    }
+  }
+  void build(cl_program program) {
+    CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0));
+  }
+
   void load_backing_store_context(void) {
     unload_context();
     cl_context_properties props[] = {
@@ -165,6 +202,7 @@ MT_TEST_GROUP(acl_usm) {
   cl_uint m_num_devices;
   cl_device_id m_device[ACL_MAX_DEVICE];
   cl_command_queue m_cq;
+  cl_program m_program;
 
 public:
   bool yeah;
@@ -1208,4 +1246,66 @@ MT_TEST(acl_usm, memfill_usm) {
   ACL_LOCKED(acl_print_debug_msg("end memfill_usm\n"));
 }
 
+MT_TEST(acl_usm, read_device_global) {
+  ACL_LOCKED(acl_print_debug_msg("begin read_device_global\n"));
+  const size_t buf_size = 100;
+  cl_int status;
+
+  cl_event write_event = 0;
+  cl_event read_event = 0;
+
+  // Prepare host memory
+  syncThreads();
+  // Host pointer example, zero-initialized so that the write below does not
+  // copy indeterminate bytes to the device
+  void *src_ptr = calloc(1, buf_size);
+  CHECK(src_ptr != NULL);
+
+  syncThreads();
+  // Write to device global
+  status = clEnqueueWriteGlobalVariableINTEL(m_cq, m_program, "dev_global_name",
+                                             CL_FALSE, buf_size, 0, src_ptr, 0,
+                                             NULL, &write_event);
+  CHECK_EQUAL(CL_SUCCESS, status);
+
+  // Read from device global, with dependence on write event
+  status = clEnqueueReadGlobalVariableINTEL(m_cq, m_program, "dev_global_name",
+                                            CL_FALSE, buf_size, 0, src_ptr, 1,
+                                            &write_event, &read_event);
+  CHECK_EQUAL(CL_SUCCESS, status);
+
+  // Manually set "write device global" event done
+  int write_activation_id = write_event->cmd.info.ndrange_kernel
+                                .invocation_wrapper->image->activation_id;
+  acltest_call_kernel_update_callback(write_activation_id, CL_RUNNING);
+  acltest_call_kernel_update_callback(write_activation_id, CL_COMPLETE);
+
+  // Nudge the scheduler to take above finish into account
+  std::scoped_lock lock{acl_mutex_wrapper};
+  // If nothing's blocking, then complete right away
+  acl_idle_update(m_cq->context);
+
+  // The event returned from read device global is not the copy kernel launch
+  // event. Therefore need to first get the event that it depends on, then
+  // manually set it to complete
+  auto last_event = read_event->depend_on.end();
+  last_event--;
+  cl_event read_copy_kernel_event = *last_event;
+  int read_activation_id = read_copy_kernel_event->cmd.info.ndrange_kernel
+                               .invocation_wrapper->image->activation_id;
+  acltest_call_kernel_update_callback(read_activation_id, CL_RUNNING);
+  acltest_call_kernel_update_callback(read_activation_id, CL_COMPLETE);
+  // Now the usm copy operation will execute
+
+  // Block on all event completion
+  CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(write_event));
+  CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(read_event));
+
+  // Host pointer example
+  free(src_ptr);
+
+  ACL_LOCKED(acl_print_debug_msg("end read_device_global\n"));
+}
+
 #endif // __arm__