Use OpenCL to execute native kernels

Published on sam 18 juin 2011 in Clover, (Comments)

Hello,

Today is a big day for Clover: we can finally use it to execute native kernels on the processor. They are queued in a command queue and run asynchronously, and several of them can be executed in parallel. A native kernel is a simple C/C++ function that we queue for execution on a CPU device, so there is no compiler, no bitcode, etc.

Here is some sample code that executes a simple kernel (the original is in tests/test_kernel.cpp):

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#include <CL/cl.h>

/*
 * Argument block handed to native_kernel().
 *
 * The caller fills in buffer_size itself. In main(), the address of the
 * buffer member is passed to clEnqueueNativeKernel() as the args_mem_loc
 * entry, so that OpenCL completes the struct with a pointer to the memory
 * object's storage before the kernel runs.
 */
struct args
{
    size_t buffer_size; /* number of bytes addressed by buffer */
    char *buffer;       /* bytes to process, modified in place */
};

/*
 * Native kernel: bitwise-inverts every byte of the buffer described by
 * the struct args that `a` points to. Running it twice restores the
 * original contents.
 *
 * a: pointer to a struct args; must not be NULL, and a->buffer must
 *    address at least a->buffer_size writable bytes.
 */
static void native_kernel(void *a)
{
    struct args *data = (struct args *)a;

    /* Index with size_t so the comparison against buffer_size is
     * unsigned-to-unsigned and cannot overflow on large buffers. */
    for (size_t i = 0; i < data->buffer_size; ++i)
    {
        data->buffer[i] = ~data->buffer[i];
    }
}

int main(int argc, char **argv)
{
    cl_platform_id platform = 0;
    cl_device_id device;
    cl_context ctx;
    cl_command_queue queue;
    cl_event events[2];
    cl_mem buf1, buf2;
    char s1[] = "Lorem ipsum dolor sit amet";
    char s2[] = "I want to tell you that you rock";

    // Initialize the context
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0);
    ctx = clCreateContext(0, 1, &device, 0, 0, 0);

    // And the command queue
    queue = clCreateCommandQueue(ctx, device,
                                 CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0);

    // Create two buffers
    buf1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                          sizeof(s1), (void *)&s1, 0);
    buf2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                          sizeof(s2), (void *)&s2, 0);

    // Enqueue native kernels
    struct args a;
    const void *mem_loc = (const void *)&a.buffer; // Tell OpenCL to complete the struct

    a.buffer_size = sizeof(s1);
    clEnqueueNativeKernel(queue, &native_kernel, &a, sizeof(a),
                          1, &buf1, &mem_loc, 0, 0, &events[0]);

    a.buffer_size = sizeof(s2);
    clEnqueueNativeKernel(queue, &native_kernel, &a, sizeof(a),
                          1, &buf2, &mem_loc, 0, 0, &events[1]);

    // Wait for events
    clWaitForEvents(2, events);

    // Finished
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}

The code has been pushed to git. You can look at the diff to see how I implemented this. The code is full of casts, which I don't really like, but it is low-level code and I try to keep its quality good.

I'll now polish what I have already written a bit (for instance by implementing clUnmapMemObject, an easy function), then I'll begin work on the OpenCL C kernels. My exams end on Tuesday, so I will finally have plenty of time to work on Clover before my vacation (starting July 3).

Have fun parallelizing your applications !

« More command queues work   Exams finished »