Alright, this OpenCL kernel is starting to bug me to death. AKA I can't find the damn bug.
Running it on an AMD Radeon HD 6850, OpenCL version 1.1. The kernel takes in data from two equal-length lists of galaxies and calculates a histogram based on the angular separation. I run three iterations of this kernel, each one with different lists. AFAIK, they all run fine give or take some floating point differences. The exception is the first iteration, where the first element in the histogram is always, no matter what, the length of the list, even though each work-item prints out correct values from atomic_inc. Really, WTF.
__kernel void histogram(
__global float *ascension,
__global float *declination,
__global float *ascension2,
__global float *declination2,
__global int *separation,
__local int *bins,
const int doubleList)
{
/* SETUP */
size_t xIndex = get_global_id(0);
size_t yIndex = get_global_id(1);
/* Calculate angles of separation in degrees */
if ((xIndex < yIndex) || (doubleList == 1)) {
float a1 = ascension[xIndex]/10800;
float d1 = declination[xIndex]/10800;
float a2 = ascension2[yIndex]/10800;
float d2 = declination2[yIndex]/10800;
float da = a2 - a1;
float A = cospi(d2)*sinpi(da);
float B = cospi(d1)*sinpi(d2) - sinpi(d1)*cospi(d2)*cospi(da);
float C = sinpi(d1)*sinpi(d2) + cospi(d1)*cospi(d2)*cospi(da);
float theta = 180*atan2pi(sqrt(pow(A,2) + pow(B,2)), C);
if (theta >= 0 && theta <= 64) {
int binId = convert_int_rtn(4*theta);
int old = atomic_inc(&separation[binId]);
if (binId == 0) printf("OCL: (%d,%d): %8f\t%d\r\n", xIndex, yIndex, theta, old);
}
}
}
#include <fstream>
#include <sstream>
#include <iostream>
#include <cstdlib>
#include <cmath>
#include <time.h>
#include <CL/cl.h>
#define ERR(msg) printf("\nError: %s : %d\n", (msg), __LINE__)
int loadKernelSource(const char* filename, std::string &outputSource) {
std::ifstream filestream(filename);
std::stringstream stream;
stream << filestream.rdbuf();
outputSource = stream.str();
const size_t length = outputSource.length();
stream.flush();
filestream.close();
return length;
}
int outputData(const char* filename, int *DD, int *DR, int *RR, float *omega) {
std::ofstream filestream(filename);
for(int k=0; k < 256; k++) {
filestream << DD[k] << "," << DR[k] << "," << RR[k] << "," << omega[k] << std::endl;
}
filestream.close();
return 0;
}
void loadData(const char* filename, float** ascension, float** declination, int &numData) {
std::ifstream filestream(filename);
filestream >> numData;
#ifdef DATA_SIZE
numData = (DATA_SIZE >= numData)? numData : DATA_SIZE;
#endif
*ascension = new float[numData];
*declination = new float[numData];
for(int k = 0; k<numData; k++) {
filestream >> *(*ascension + k);
filestream >> *(*declination + k);
}
filestream.close();
}
void testRun(float* ascension, float* declination, float* ascension2, float* declination2,
int* separation, int sizex, int sizey, int doubleList) {
for(int i = 0; i < sizex; i++) {
//for(int j = (1-doubleList)*(i+1);j < sizey; j++) {
for(int j = 0;j < sizey; j++) {
if((i < j) || (doubleList == 1)){
#define PI 3.1415927 //Accurate witin floating point precision
float a1 = ascension[i]/10800;
float d1 = declination[i]/10800;
float a2 = ascension2[j]/10800;
float d2 = declination2[j]/10800;
float da = a2 - a1;
float A = cos(PI*d2)*sin(PI*da);
float B = cos(PI*d1)*sin(PI*d2) - sin(PI*d1)*cos(PI*d2)*cos(PI*da);
float C = sin(PI*d1)*sin(PI*d2) + cos(PI*d1)*cos(PI*d2)*cos(PI*da);
float theta = 180*atan2(sqrt(pow(A,2) + pow(B,2)), C)/PI;
if (theta >= 0 && theta <= 64) {
int binId = (int) floor(4*theta);
separation[binId]++;
}}
}
}
}
cl_int runKernel(cl_command_queue *queue, cl_kernel *kernel,
cl_mem *a1, cl_mem *d1, cl_mem *a2, cl_mem *d2, cl_mem *s,
int *doubleList, size_t* size, cl_event *event) {
cl_int err = clSetKernelArg(*kernel, 0, sizeof(cl_mem), a1);
err |= clSetKernelArg(*kernel, 1, sizeof(cl_mem), d1);
err |= clSetKernelArg(*kernel, 2, sizeof(cl_mem), a2);
err |= clSetKernelArg(*kernel, 3, sizeof(cl_mem), d2);
err |= clSetKernelArg(*kernel, 4, sizeof(cl_mem), s);
err |= clSetKernelArg(*kernel, 5, 256*sizeof(int), NULL);
err |= clSetKernelArg(*kernel, 6, sizeof(int), doubleList);
if (err != CL_SUCCESS) {ERR("Set Arguments"); return err;};
err = clEnqueueNDRangeKernel(*queue, *kernel, 2, NULL, size, NULL, 0, NULL, event);
if (err != CL_SUCCESS) {ERR("Execution"); return err;}
return err;
}
int main(int argc, char** argv) {
/* Load kernel source */
std::string source;
loadKernelSource("histogram.cl", source);
int numData;
int numRand;
int bufferSize;
float *dataAsc;
float *dataDec;
float *randAsc;
float *randDec;
int DD[256];
int DR[256];
int RR[256];
int testDD[256];
int testDR[256];
int testRR[256];
float omega[256];
#ifdef PROFILE
clock_t start, end;
clock();
#ifdef PROFILE_CPU
clock_t t[4];
#endif
#endif
/* Load data */
loadData("data_100k_arcmin.txt", &dataAsc, &dataDec, numData);
loadData("flat_100k_arcmin.txt", &randAsc, &randDec, numRand);
bufferSize = (numData >= numRand)? numData : numRand;
std::cout << "Number of data points: " << numData << std::endl;
for(int k=0; k < 10; k++) { std::cout << std::endl << dataAsc[k] << "\t\t" << dataDec[k];}
std::cout << "\n..." << std::endl;
std::cout << dataAsc[numData-1] << "\t\t" << dataDec[numData-1];
std::cout << "\n\n";
std::cout << "Number of data points: " << numRand << std::endl;
for(int k=0; k < 10; k++) { std::cout << std::endl << randAsc[k] << "\t\t" << randDec[k];}
std::cout << "\n..." << std::endl;
std::cout << randAsc[numRand-1] << "\t\t" << randDec[numRand-1];
std::cout << "\n\n";
/* Find platform */
cl_platform_id platform;
cl_int err;
err = clGetPlatformIDs(1,&platform, NULL);
if (err != CL_SUCCESS) {ERR("Platform"); exit(1);}
/* Find device */
cl_device_id device;
err = clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) {ERR("Device"); exit(1);}
/* Create context */
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (!context) {ERR("Context"); exit(1);}
/* Create command queue */
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
if (!queue) {ERR("Command Queue"); exit(1);}
/* Create program */
const char* programSource = source.c_str();
cl_program program = clCreateProgramWithSource(context, 1, &programSource, NULL, &err);
if (!program) {ERR("Program"); exit(1);}
/* Build program */
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
ERR("Build");
char output[200000];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 200000*sizeof(char), &output, NULL);
std::cout << "Compiler output:\n\n" << output << std::endl;
exit(1);
}
/* Create kernel */
cl_kernel kernel = clCreateKernel(program, "histogram", &err);
if (!kernel || err != CL_SUCCESS) {ERR("Kernel"); exit(1);}
/* Create buffers */
cl_mem ascension = clCreateBuffer(context, CL_MEM_READ_ONLY, bufferSize*sizeof(float), NULL, &err);
if (!ascension || err != CL_SUCCESS) {ERR("Buffer Ascension"); exit(1);}
cl_mem ascension2 = clCreateBuffer(context, CL_MEM_READ_ONLY, bufferSize*sizeof(float), NULL, &err);
if (!ascension || err != CL_SUCCESS) {ERR("Buffer Ascension"); exit(1);}
cl_mem declination = clCreateBuffer(context, CL_MEM_READ_ONLY, bufferSize*sizeof(float), NULL, &err);
if (!declination || err != CL_SUCCESS) {ERR("Buffer Declination"); exit(1);}
cl_mem declination2 = clCreateBuffer(context, CL_MEM_READ_ONLY, bufferSize*sizeof(float), NULL, &err);
if (!declination || err != CL_SUCCESS) {ERR("Buffer Declination"); exit(1);}
cl_mem separation = clCreateBuffer(context, CL_MEM_READ_WRITE, 256*sizeof(int), NULL, &err);
if (!separation || err != CL_SUCCESS) {ERR("Buffer Separation"); exit(1);}
cl_mem separation2 = clCreateBuffer(context, CL_MEM_READ_WRITE, 256*sizeof(int), NULL, &err);
if (!separation || err != CL_SUCCESS) {ERR("Buffer Separation"); exit(1);}
cl_mem separation3 = clCreateBuffer(context, CL_MEM_READ_WRITE, 256*sizeof(int), NULL, &err);
if (!separation || err != CL_SUCCESS) {ERR("Buffer Separation"); exit(1);}
/* Start running kernels */
#ifdef PROFILE
start = clock();
#endif
int doubleList = 0;
size_t size[2] = {numData, numData};
err = clEnqueueWriteBuffer(queue, ascension, CL_TRUE, 0,
numData*sizeof(float), dataAsc, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, declination, CL_TRUE, 0,
numData*sizeof(float), dataDec, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, ascension2, CL_TRUE, 0,
numData*sizeof(float), dataAsc, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, declination2, CL_TRUE, 0,
numData*sizeof(float), dataDec, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, separation, CL_TRUE, 0,
256*sizeof(int), DD, 0, NULL, NULL);
if (err != CL_SUCCESS) {ERR("Buffer Write"); exit(1);}
err = runKernel(&queue, &kernel,
&ascension, &declination, &ascension2, &declination2, &separation, &doubleList, size, NULL);
if (err != CL_SUCCESS) {ERR("Execution"); exit(1);}
err = clEnqueueReadBuffer(queue, separation, CL_TRUE, 0, 256*sizeof(int), DD, 0, NULL, NULL);
if (err != CL_SUCCESS) {ERR("Read Buffer"); exit(1);}
err = clEnqueueWriteBuffer(queue, ascension2, CL_TRUE, 0,
numRand*sizeof(float), randAsc, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, declination2, CL_TRUE, 0,
numRand*sizeof(float), randDec, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, separation2, CL_TRUE, 0,
256*sizeof(int), DR, 0, NULL, NULL);
if (err != CL_SUCCESS) {ERR("Buffer Write"); exit(1);}
size[2] = numRand;
doubleList = 1;
err = runKernel(&queue, &kernel,
&ascension, &declination, &ascension2, &declination2, &separation2, &doubleList, size, NULL);
if (err != CL_SUCCESS) {ERR("Execution"); exit(1);}
err = clEnqueueReadBuffer(queue, separation2, CL_TRUE, 0, 256*sizeof(int), DR, 0, NULL, NULL);
if (err != CL_SUCCESS) {ERR("Read Buffer"); exit(1);}
err = clEnqueueWriteBuffer(queue, ascension, CL_TRUE, 0,
numRand*sizeof(float), randAsc, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, declination, CL_TRUE, 0,
numRand*sizeof(float), randDec, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, separation3, CL_TRUE, 0,
256*sizeof(int), RR, 0, NULL, NULL);
if (err != CL_SUCCESS) {ERR("Buffer Write"); exit(1);}
size[1] = numRand;
doubleList = 0;
err = runKernel(&queue, &kernel,
&ascension, &declination, &ascension2, &declination2, &separation3, &doubleList, size, NULL);
if (err != CL_SUCCESS) {ERR("Execution"); exit(1);}
err = clEnqueueReadBuffer(queue, separation3, CL_TRUE, 0, 256*sizeof(int), RR, 0, NULL, NULL);
if (err != CL_SUCCESS) {ERR("Read Buffer"); exit(1);}
clFinish(queue);
#ifdef PROFILE
end = clock();
#endif
printf("%d\n", DD[0]);
/* Output results to console */
std::cout << "\nValues:\n\n";
/* Optional CPU test run */
#ifdef TEST
#ifdef PROFILE_CPU
t[0] = clock();
testRun(dataAsc, dataDec, dataAsc, dataDec, testDD, numData, numData, 0);
t[1] = clock();
testRun(dataAsc, dataDec, randAsc, randDec, testDR, numData, numRand, 1);
t[2] = clock();
testRun(randAsc, randDec, randAsc, randDec, testRR, numRand, numRand, 0);
t[3] = clock();
#else
testRun(dataAsc, dataDec, dataAsc, dataDec, testDD, numData, numData, 0);
testRun(dataAsc, dataDec, randAsc, randDec, testDR, numData, numRand, 1);
testRun(randAsc, randDec, randAsc, randDec, testRR, numRand, numRand, 0);
#endif
unsigned long sumDD = 0;
unsigned long sumDR = 0;
unsigned long sumRR = 0;
bool equal = true;
bool testEqual;
for(int k=0; k < 256; k++) {
testEqual = (testDD[k] == DD[k]) && (testDR[k] == DR[k]) && (testRR[k] == RR[k]);
std::cout << DD[k] << "\t" << testDD[k]
<< "\t" << DR[k] << "\t" << testDR[k]
<< "\t" << RR[k] << "\t" << testRR[k] << "\t" << testEqual << std::endl;
sumDD += testDD[k];
sumDR += testDR[k];
sumRR += testRR[k];
equal = equal && testEqual;
}
std::cout << "Total pairs in range 0-64 degrees: " <<
sumDD << "\t" << sumDR << "\t" << sumRR << std::endl;
std::cout << "Are they equal (beware floating point differences): " << equal << std::endl;
#else
printf("%8s\t%8s\t%8s\t%8s\n", "DD", "DR", "RR", "Omega");
for(int k=0; k < 256; k++) {
omega[k] = (DD[k] - 2*DR[k] + RR[k])/((float) RR[k]);
printf("%8d\t%8d\t%8d\t%8f\n", DD[k], DR[k], RR[k], omega[k]);
}
#endif
/* Optional profile results */
#ifdef PROFILE
#ifdef TEST
#ifdef PROFILE_CPU
std::cout << "\n Individual timings:" << std::endl;
printf("%8s\n", "Sequntial CPU");
for(int k=0; k < 3; k++) {
printf("%8f\n", (t[k+1] - t[k])/((double) CLOCKS_PER_SEC));
}
#endif
#endif
std::cout << "\n Total execution time of histograms:" << std::endl;
printf("%8f", (end - start)/((double) CLOCKS_PER_SEC));
#ifdef TEST
#ifdef PROFILE_CPU
printf("\t%8f", (t[3] - t[0])/((double) CLOCKS_PER_SEC));
#endif
#endif
printf("\n");
#endif
/* Optional output to file */
#ifdef OUTPUT
outputData("output.csv", DD, DR, RR, omega);
#endif
/* Memory clean-up */
clReleaseMemObject(ascension);
clReleaseMemObject(ascension2);
clReleaseMemObject(declination);
clReleaseMemObject(declination2);
clReleaseMemObject(separation);
clReleaseMemObject(separation2);
clReleaseMemObject(separation3);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
delete[] dataAsc;
delete[] dataDec;
delete[] randAsc;
delete[] randDec;
return 0;
}
Compiled with -I"$(AMDAPPSDKROOT)/include" -L"$(AMDAPPSDKROOT)/lib/x86" -LOpenCL -O3
Add -DDATA_SIZE=N to use the first N galaxies.
Add -DTEST to also run sequential CPU version. Produces seven columns, the first six are pairs of histograms made on GPU and CPU respectively. The last column contain boolean values that check if the elements of each pair are identical.
Galaxy data can be found
here. Use the 100k lists.