(L) [2013/08/20] [ziu] [Speed Issues with GPU BVH rendering] Wayback!Hello,
I am presently trying to evaluate the performance of best of class GPU BVH ray tracing algorithms for real-time rendering. Of these the work by Luebke, Pantaleoni, Karras, and Garanzha stood out.
The only open source implementation I could find of these was Pantaleoni's HLBVH which is provided as an API on Google Code. So I tried making a prototype which ray traced an OBJ of, say, Buddha to gauge construction and rendering performance. It took me some effort integrating several components to get the API to do its motions. Then I found that scene traversal code was basically missing from the API so I made my own based on one of the provided API tests. However the rendering performance results I get are terrible. 0.68 FPS @ 512x512 on a GeForce GTX 660 Ti.
In case you are interested I placed the entire project here:
[LINK https://bitbucket.org/vasc/nih]
As for the CUDA traversal code I am using...
Code: [LINK # Select all]__device__
bool AABBtest(const nih::Bbox3f bbox, const nih::Vector3f orig, const nih::Vector3f idir, float2* ptnf) {
const nih::Vector3f Min = (bbox.m_min-orig)*idir;
const nih::Vector3f Max = (bbox.m_max-orig)*idir;
const nih::Vector3f t1 = min(Min, Max), t2 = max(Min, Max);
const float2 t = make_float2(max(max(t1.x[0], t1.x[1]), t1.x[2]), min(min(t2.x[0], t2.x[1]), t2.x[2]));
*ptnf = t;
return (t.y >= EPSILON && t.x < t.y);
}
__device__
bool gtest(const float *p_vertices,
const nih::Bvh_node *p_nodes,
const uint2 *p_leaves,
const nih::Bbox3f *p_node_bboxes,
const float3 orig, const float3 dir, float4* isect)
{
nih::uint32 d_node_id = 0;
nih::uint32 node_index = 0;
nih::uint32 leaf_index = 0;
bool hit = false;
const nih::Vector3f o(orig.x, orig.y, orig.z);
const nih::Vector3f idir(1.0f/dir.x, 1.0f/dir.y, 1.0f/dir.z);
while (d_node_id != nih::uint32(-1)) {
nih::Bvh_node d_node = p_nodes[ d_node_id ];
float2 t2;
if (AABBtest(p_node_bboxes[ d_node_id ], o, idir, &t2)) {
if (d_node.is_leaf()) {
const uint2 d_leaf = p_leaves[ d_node.get_leaf_index() ];
for (nih::uint32 id = d_leaf.x; id < d_leaf.y; ++id) {
const float3 p0 = make_float3(p_vertices[id*9+0], p_vertices[id*9+1], p_vertices[id*9+2]);
const float3 p1 = make_float3(p_vertices[id*9+3], p_vertices[id*9+4], p_vertices[id*9+5]);
const float3 p2 = make_float3(p_vertices[id*9+6], p_vertices[id*9+7], p_vertices[id*9+8]);
if (test(p0, p1, p2, orig, dir, isect)) {
hit = true;
}
}
d_node_id = d_node.get_skip_node();
leaf_index++;
} else {
d_node_id = d_node.get_child(0);
}
} else {
d_node_id = d_node.get_skip_node();
}
node_index++;
}
return hit;
}
__device__
void init(
const uint dw,
const uint dh,
const float3 from,
const float3 at,
const float3 up,
const float angle,
float3* ll,
float3* u,
float3* v)
{
float3 l = at - from;
float d, ax, ay;
d = length(l);
l = normalize(l);
float fovx, fovy;
fovx = angle*M_PI_360;
fovy = (float)dh/dw * fovx;
ax = d*tan(fovx);
ay = d*tan(fovy);
*v = normalize(cross(l, up));
*u = cross(*v, l);
*ll = l*d - *v*ax - *u*ay;
*v *= (2.0f * ax) / dw;
*u *= (2.0f * ay) / dh;
}
// happy.obj
__constant__ const float3 from = {-0.0054393f,0.148769f,0.245244f};
__constant__ const float3 at = {-0.0054393f,0.148769f,-0.00669f};
__constant__ const float3 up = {0.0f,1.0f,0.0f};
__constant__ const float angle = 50.0f;
__global__
void kernel(const float *p_vertices,
const nih::Bvh_node *p_nodes,
const uint2 *p_leaves,
const nih::Bbox3f *p_node_bboxes,
const uint dw, const uint dh)
{
float3 ll, u, v;
init(dw, dh, from, at, up, angle, &ll, &u, &v);
uint x = blockDim.x*blockIdx.x + threadIdx.x;
uint y = blockDim.y*blockIdx.y + threadIdx.y;
float3 orig, dir;
orig = from;
dir = normalize(ll + u*(float)y + v*(float)x);
const float3 fgcolor = make_float3(1.0, 1.0, 1.0);
const float3 bgcolor = make_float3(0.0, 0.0, 0.0);
float4 isect = {INFINITY,0.0f,0.0f,0.0f};
bool result;
result = gtest(p_vertices, p_nodes, p_leaves, p_node_bboxes, orig, dir, &isect);
if (result) {
float4 element = make_float4(fgcolor.x, fgcolor.y, fgcolor.z, 1.0); // r, g, b, a
surf2Dwrite(element, surfRef, x*sizeof(element), y, cudaBoundaryModeTrap);
} else {
float4 element = make_float4(bgcolor.x, bgcolor.y, bgcolor.z, 1.0); // r, g, b, a
surf2Dwrite(element, surfRef, x*sizeof(element), y, cudaBoundaryModeTrap);
}
}
...
const unsigned dw = 512, dh = 512;
cudaGraphicsMapResources(1, &cuda_texture, 0);
cudaGraphicsSubResourceGetMappedArray(&cuda_array, cuda_texture, 0, 0);
cudaBindSurfaceToArray(surfRef, cuda_array);
dim3 global(dw/8,dh/16,1);
dim3 local(8,16,1);
kernel<<<global,local>>>(p_vertices, p_nodes, p_leaves, p_node_bboxes, dw, dh);
cudaGraphicsUnmapResources(1, &cuda_texture, 0);
So  I guess I must be doing something wrong. But what?
Thanks
Ziu