Optimizing kernel shuffled keys code - OpenCL -
I have started getting into OpenCL and am going through the basics of writing kernel code. I have written a kernel that calculates shuffled keys for an array of points. For a number of points n, the shuffled keys are calculated in a 3-bit fashion, where the x-bit at depth d (0 ≤ d < level) is given as:
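$x_d = 0$ if $p.x < c_d.x$, and $x_d = 1$ otherwise, where $c_d$ is the center of the cell that contains $p$ at depth $d$ (the y- and z-bits are defined the same way).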
The shuffled xyz key is given as:
$x_1 y_1 z_1 \, x_2 y_2 z_2 \, \ldots \, x_d y_d z_d$
The kernel code I have written is given below. The points are input in column-major format.
/* Offsets of the eight child-cell centers from their parent's center,
 * expressed as multiples of the parent radius (used as
 * center + boundsoffsettable[c] * radius below). */
__constant float3 boundsoffsettable[8] = {
    (float3)(-0.5f, -0.5f, -0.5f),
    (float3)(+0.5f, -0.5f, -0.5f),
    (float3)(-0.5f, +0.5f, -0.5f),
    (float3)(-0.5f, -0.5f, +0.5f),
    (float3)(+0.5f, +0.5f, -0.5f),
    (float3)(+0.5f, -0.5f, +0.5f),
    (float3)(-0.5f, +0.5f, +0.5f),
    (float3)(+0.5f, +0.5f, +0.5f)
};

/* Returns x with the bit at index `position` set. */
uint setbit(uint x, unsigned char position)
{
    /* Unsigned literal so the shift is done in uint arithmetic. */
    uint mask = 1u << position;
    return x | mask;
}

/*
 * Computes one shuffled (x/y/z interleaved) key per point.
 *
 * point  - coordinates in column-major order: x values at [0, size),
 *          y values at [size, 2*size), z values at [2*size, 3*size)
 * code   - output, one key per point
 * level  - number of levels to encode; level l occupies bits 3*l (x),
 *          3*l+1 (y) and 3*l+2 (z), so a 32-bit key holds at most 10 levels
 * center - center of the root cell
 * radius - radius (half-extent) of the root cell
 * size   - number of points
 *
 * A bit is set when the coordinate is strictly greater than the center of
 * the cell at that level. A point that lies exactly on a child boundary, or
 * outside the current cell, matches no child and gets no further bits.
 */
__kernel void morton_code(__global float* point, __global uint* code, int level,
                          float3 center, float radius, int size)
{
    // index of current element processed
    int i = (int)get_global_id(0);

    // ignore work-items beyond the end of the point array
    if (i >= size) {
        return;
    }

    float3 pt;
    pt.x = point[i];
    pt.y = point[size + i];
    pt.z = point[2 * size + i];

    // level 0: compare against the root center
    code[i] = 0;
    if (pt.x > center.x) {
        code[i] = setbit(code[i], 0);
    }
    if (pt.y > center.y) {
        code[i] = setbit(code[i], 1);
    }
    if (pt.z > center.z) {
        code[i] = setbit(code[i], 2);
    }

    // cell that currently contains the point; refined once per level
    float3 cellcenter = center;
    float cellradius = radius;

    for (int l = 1; l < level; l++) {
        float newradius = cellradius * 0.5f;

        // child index gets its own name so it does not shadow the point index i
        for (int c = 0; c < 8; c++) {
            float3 newcenter = cellcenter + boundsoffsettable[c] * cellradius;

            if (newcenter.x - newradius < pt.x && newcenter.x + newradius > pt.x &&
                newcenter.y - newradius < pt.y && newcenter.y + newradius > pt.y &&
                newcenter.z - newradius < pt.z && newcenter.z + newradius > pt.z) {
                if (pt.x > newcenter.x) {
                    code[i] = setbit(code[i], 3 * l);
                }
                if (pt.y > newcenter.y) {
                    code[i] = setbit(code[i], 3 * l + 1);
                }
                if (pt.z > newcenter.z) {
                    code[i] = setbit(code[i], 3 * l + 2);
                }

                // descend into this child so the next level subdivides it,
                // instead of testing the root's children again
                cellcenter = newcenter;
                cellradius = newradius;
                break;
            }
        }
    }
}
It works, but I wanted to ask whether I am missing anything in the code, and whether there is a way to optimize it.
Try this kernel:
/*
 * Computes one shuffled (x/y/z interleaved) key per point, accumulating the
 * key in a private variable and writing it to global memory once at the end.
 *
 * point  - coordinates in column-major order: x values at [0, size),
 *          y values at [size, 2*size), z values at [2*size, 3*size)
 * code   - output, one key per point
 * level  - number of levels to encode; level l occupies bits 3*l (x),
 *          3*l+1 (y) and 3*l+2 (z), so a 32-bit key holds at most 10 levels
 * center - center of the root cell
 * radius - radius (half-extent) of the root cell
 * size   - number of points
 *
 * A bit is set when the coordinate is strictly greater than the center of
 * the cell at that level. A point that lies exactly on a child boundary, or
 * outside the current cell, matches no child and gets no further bits.
 */
__kernel void morton_code(__global float* point, __global uint* code, int level,
                          float3 center, float radius, int size)
{
    // index of current element processed
    int i = (int)get_global_id(0);

    // ignore work-items beyond the end of the point array
    if (i >= size) {
        return;
    }

    float3 pt;
    pt.x = point[i];
    pt.y = point[size + i];
    pt.z = point[2 * size + i];

    // level 0: compare against the root center
    uint res = 0;
    if (pt.x > center.x) {
        res = setbit(res, 0);
    }
    if (pt.y > center.y) {
        res = setbit(res, 1);
    }
    if (pt.z > center.z) {
        res = setbit(res, 2);
    }

    // cell that currently contains the point; refined once per level
    float3 cellcenter = center;
    float cellradius = radius;

    for (int l = 1; l < level; l++) {
        float newradius = cellradius * 0.5f;

        // child index gets its own name so it does not shadow the point index i
        for (int c = 0; c < 8; c++) {
            float3 newcenter = cellcenter + boundsoffsettable[c] * cellradius;

            if (newcenter.x - newradius < pt.x && newcenter.x + newradius > pt.x &&
                newcenter.y - newradius < pt.y && newcenter.y + newradius > pt.y &&
                newcenter.z - newradius < pt.z && newcenter.z + newradius > pt.z) {
                if (pt.x > newcenter.x) {
                    res = setbit(res, 3 * l);
                }
                if (pt.y > newcenter.y) {
                    res = setbit(res, 3 * l + 1);
                }
                if (pt.z > newcenter.z) {
                    res = setbit(res, 3 * l + 2);
                }

                // descend into this child so the next level subdivides it,
                // instead of testing the root's children again
                cellcenter = newcenter;
                cellradius = newradius;
                break;
            }
        }
    }

    // save result
    code[i] = res;
}
Rules for optimizing:
- Avoid global memory (you were using "code" directly in global memory; I changed that). You should see a 3x increase in performance now.
- Avoid ifs; use "select" instead where possible (see the OpenCL documentation).
- Use more memory inside the kernel. You don't need to operate at the bit level; operating at the int level is better and avoids the huge number of calls to "setbit". You can construct the result at the end.
Another interesting thing: since you are operating at the 3D level, you can use float3 variables and compute distances with the OpenCL operators. This can increase performance quite a lot, but it requires a complete rewrite of the kernel.
Comments
Post a Comment