#include #define CL_HPP_TARGET_OPENCL_VERSION 200 #define CL_HPP_MINIMUM_OPENCL_VERSION 200 #include #include #include #include #include #include #include #include #include void exitIfError(cl_int errorCode, char const* msg); void errCallback(const char* errinfo, const void* private_info, size_t cb, void* user_data); class SimpleClContext { public: SimpleClContext() { cl_int err; std::vector platformList; err = cl::Platform::get(&platformList); exitIfError(platformList.empty() ? CL_PLATFORM_NOT_FOUND_KHR : err, "cl::Platform::get()"); std::vector deviceList; err = platformList[0].getDevices(CL_DEVICE_TYPE_GPU, &deviceList); exitIfError(deviceList.empty() ? CL_DEVICE_NOT_FOUND : err, "cl::Platform::getDevices()"); m_context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, nullptr, nullptr, nullptr, &err)); exitIfError(err, "Conext::Context()"); m_devices = context().getInfo(); exitIfError(m_devices.size() > 0 ? CL_SUCCESS : -1, "devices.size() > 0"); std::cerr << "Found " << m_devices.size() << " device(s)\n"; for (cl::Device const& device : m_devices) { std::cerr << " type = " << device.getInfo() << "\n"; } } cl::Context& context() { return *m_context; } std::vector& devices() { return m_devices; } cl::Device& device() { return m_devices[0]; } std::unique_ptr buildProgram(std::string const& source) { cl_int err; cl::Program::Sources sourceObj{ source }; std::unique_ptr program(new cl::Program(context(), source)); err = program->build(m_devices, ""); std::string messages; program->getBuildInfo(m_devices[0], CL_PROGRAM_BUILD_LOG, &messages); std::cerr << messages; if (err != CL_SUCCESS) { exitIfError(err, "program build()"); } exitIfError(err, "program build()"); return std::move(program); } private: std::unique_ptr m_context; std::vector m_devices; }; int main() { cl_int err; SimpleClContext myContext; std::unique_ptr mergeProcess = myContext.buildProgram( "#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\n" "__kernel void mergeProcess(__global int* buf0, __global int* buf1,\n" " long size, long nrWorkers, short returnTo0) {\n" " int tid = get_global_id(0);\n" " long const begin = size*tid/nrWorkers;\n" " long const end = size*(tid+1)/nrWorkers;\n" " long const workSize = end - begin;\n" " long sortedSize;\n" " short isEven = 1;\n" " for(sortedSize = 1 ; sortedSize < workSize ; sortedSize *= 2) {\n" " __global int* const inBuf = isEven ? buf0 : buf1;\n" " __global int* const outBuf = isEven ? buf1 : buf0;\n" " __global int* pOut = outBuf + begin;\n" " __global int* pEndAllOut = outBuf + end;\n" " __global int* pIn0 = inBuf + begin;\n" " __global int* pEndAllIn = inBuf + end;\n" " long remaining = workSize;\n" " while(remaining > 0) {\n" " __global int* pIn1 = (remaining > sortedSize) ? pIn0+sortedSize : pEndAllIn;\n" " __global int* const pIn0End = pIn1;\n" " __global int* const pIn1End = (remaining > 2*sortedSize) ? pIn1+sortedSize : pEndAllIn;\n" " __global int* const pOutEnd = (remaining > 2*sortedSize) ? pOut+2*sortedSize : pEndAllOut;\n" " while(pOut=pIn1End || (pIn0 2*sortedSize) ? remaining-2*sortedSize : 0;\n" " pIn0 = pIn1End;\n" " }\n" " isEven = 1 - isEven;\n" " }\n" " if((returnTo0 && !isEven) || (!returnTo0 && isEven)) {\n" " __global int* p = (isEven ? buf0 : buf1) + begin;\n" " __global int* q = (isEven ? buf1 : buf0) + begin;\n" " __global int* pEnd = p + end;\n" " for( ; p < pEnd ; ++p,++q) *q=*p;\n" " }\n" "}\n" "__kernel void mergeCombine(__global int* in, __global int* out, long size, long nrWorkers) {\n" " int tid = get_global_id(0);\n" " int start = size * tid / nrWorkers, mid = size * (2 * tid + 1) / nrWorkers / 2, end = size * (tid+1) / nrWorkers;\n" " //printf(\"%d %d %d\\n\", start, mid, end);\n" " int out_position = start;\n" " int p1 = start;\n" " int p2 = mid;\n" " while (p1 < mid || p2 < end) {\n" " if (p2 == end || (p1 < mid && in[p1] < in[p2]))\n" " out[out_position++] = in[p1++];\n" " else\n" " out[out_position++] = in[p2++];\n" " }\n" " int i;\n" " if (out_position != end) printf(\"Mistake\\n\");\n" " for (i = start; i < out_position; ++i)\n" " in[i] = out[i];\n" "}\n" ); cl::Kernel mergekernel(*mergeProcess, "mergeProcess", &err); exitIfError(err, "Kernel::Kernel()"); std::cerr << "Kernel built\n"; cl::Kernel combinekernel(*mergeProcess, "mergeCombine", &err); exitIfError(err, "Kernel::Kernel()"); cl::CommandQueue queue(myContext.context(), myContext.device(), 0, &err); exitIfError(err, "CommandQueue::CommandQueue()"); size_t const n = 1024 * 32; int no_kernels = 32; cl::Buffer inputBuf(myContext.context(), CL_MEM_READ_WRITE, n * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); cl::Buffer outputBuf(myContext.context(), CL_MEM_READ_WRITE, n * 4, nullptr, &err); exitIfError(err, "Buffer::Buffer()"); err = mergekernel.setArg(0, inputBuf); exitIfError(err, "Kernel::setArg(0)"); err = mergekernel.setArg(1, outputBuf); exitIfError(err, "Kernel::setArg(1)"); err = mergekernel.setArg(2, int64_t(n)); exitIfError(err, "Kernel::setArg(2)"); err = mergekernel.setArg(3, int64_t(no_kernels)); exitIfError(err, "Kernel::setArg(3)"); err = mergekernel.setArg(4, int16_t(0)); exitIfError(err, "Kernel::setArg(4)"); std::vector afterCopyIn(1); std::vector afterKernelExec(1); std::vector afterCopyOut(1); std::vector input(n, 1); for (cl_int& v : input) { v = rand() % 1000; } std::cerr << "input generated\n"; err = queue.enqueueWriteBuffer(inputBuf, CL_TRUE, 0, 4 * input.size(), input.data(), nullptr, nullptr); exitIfError(err, "ComamndQueue::enqueueWriteBuffer()"); err = queue.enqueueNDRangeKernel(mergekernel, cl::NDRange(0), cl::NDRange(no_kernels), cl::NDRange(1), nullptr, &afterKernelExec[0]); exitIfError(err, "ComamndQueue::enqueueNDRangeKernel()"); afterKernelExec[0].wait(); no_kernels /= 2; while (no_kernels) { cl::Event e; std::vector output(n, 0); err = queue.enqueueReadBuffer(outputBuf, CL_TRUE, 0, 4 * output.size(), output.data(), nullptr, nullptr); exitIfError(err, "ComamndQueue::enqueueReadBuffer()"); err = combinekernel.setArg(0, outputBuf); exitIfError(err, "Kernel::setArg(0)"); err = combinekernel.setArg(1, inputBuf); exitIfError(err, "Kernel::setArg(1)"); err = combinekernel.setArg(2, int64_t(n)); exitIfError(err, "Kernel::setArg(2)"); err = combinekernel.setArg(3, int64_t(no_kernels)); exitIfError(err, "Kernel::setArg(3)"); err = queue.enqueueNDRangeKernel(combinekernel, cl::NDRange(0), cl::NDRange(no_kernels), cl::NDRange(1), nullptr, &e); exitIfError(err, "ComamndQueue::enqueueNDRangeKernel()"); e.wait(); no_kernels /= 2; } std::vector output(n, 0); err = queue.enqueueReadBuffer(outputBuf, CL_TRUE, 0, 4 * output.size(), output.data(), nullptr, nullptr); exitIfError(err, "ComamndQueue::enqueueReadBuffer()"); afterCopyOut[0].wait(); std::cout << "Output (good):\n"; for (size_t i = 0; i < n; ++i) { std::cout << output[i] << " "; } std::cout << "\n"; bool ok = true; for (size_t i = 1; i < n; ++i) { if (output[i - 1] > output[i]) { ok = false; } } std::cout << "ok=" << ok << "\n"; return EXIT_SUCCESS; } const char* getErrorString(cl_int error) { switch (error) { // run-time and JIT compiler errors case CL_SUCCESS: return "CL_SUCCESS"; case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; case -8: return "CL_MEM_COPY_OVERLAP"; case -9: return "CL_IMAGE_FORMAT_MISMATCH"; case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; case -11: return "CL_BUILD_PROGRAM_FAILURE"; case -12: return "CL_MAP_FAILURE"; case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; case -15: return "CL_COMPILE_PROGRAM_FAILURE"; case -16: return "CL_LINKER_NOT_AVAILABLE"; case -17: return "CL_LINK_PROGRAM_FAILURE"; case -18: return "CL_DEVICE_PARTITION_FAILED"; case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; // compile-time errors case -30: return "CL_INVALID_VALUE"; case -31: return "CL_INVALID_DEVICE_TYPE"; case -32: return "CL_INVALID_PLATFORM"; case -33: return "CL_INVALID_DEVICE"; case -34: return "CL_INVALID_CONTEXT"; case -35: return "CL_INVALID_QUEUE_PROPERTIES"; case -36: return "CL_INVALID_COMMAND_QUEUE"; case -37: return "CL_INVALID_HOST_PTR"; case -38: return "CL_INVALID_MEM_OBJECT"; case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; case -40: return "CL_INVALID_IMAGE_SIZE"; case -41: return "CL_INVALID_SAMPLER"; case -42: return "CL_INVALID_BINARY"; case -43: return "CL_INVALID_BUILD_OPTIONS"; case -44: return "CL_INVALID_PROGRAM"; case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; case -46: return "CL_INVALID_KERNEL_NAME"; case -47: return "CL_INVALID_KERNEL_DEFINITION"; case -48: return "CL_INVALID_KERNEL"; case -49: return "CL_INVALID_ARG_INDEX"; case -50: return "CL_INVALID_ARG_VALUE"; case -51: return "CL_INVALID_ARG_SIZE"; case -52: return "CL_INVALID_KERNEL_ARGS"; case -53: return "CL_INVALID_WORK_DIMENSION"; case -54: return "CL_INVALID_WORK_GROUP_SIZE"; case -55: return "CL_INVALID_WORK_ITEM_SIZE"; case -56: return "CL_INVALID_GLOBAL_OFFSET"; case -57: return "CL_INVALID_EVENT_WAIT_LIST"; case -58: return "CL_INVALID_EVENT"; case -59: return "CL_INVALID_OPERATION"; case -60: return "CL_INVALID_GL_OBJECT"; case -61: return "CL_INVALID_BUFFER_SIZE"; case -62: return "CL_INVALID_MIP_LEVEL"; case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; case -64: return "CL_INVALID_PROPERTY"; case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; case -66: return "CL_INVALID_COMPILER_OPTIONS"; case -67: return "CL_INVALID_LINKER_OPTIONS"; case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; // extension errors case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; default: return "Unknown OpenCL error"; } } void exitIfError(cl_int errorCode, char const* msg) { if (errorCode != CL_SUCCESS) { fprintf(stderr, "OpenCL error at %s: %s(%d)\n", msg, getErrorString(errorCode), errorCode); exit(1); } } void errCallback(const char* errinfo, const void* private_info, size_t cb, void* user_data) { fprintf(stderr, "OpenCL error callback: %s\n", errinfo); exit(1); }