(L) [2011/11/28] [Post by admin] [Brigade footage] Certainly. We started out with a 'reference path tracer'. This one is as basic as possible, and is designed only to produce the 'correct image'. We compare the other kernels against this one.
Code: [LINK # Select all]extern "C" __global__ void TracePixelReference()
{
    // Reference path tracer: one thread per pixel, brute-force forward path
    // tracing with no next event estimation. Intentionally minimal; only used
    // to produce the ground-truth image the other kernels are compared against.
    // setup path
    const int numRays = context.width * context.height;
    // Linear tile index for this thread; context.firstline offsets into the
    // frame (the >> 5 presumably converts pixels to tiles of 32 — TODO confirm
    // against the launch configuration).
    const int idx0 = threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y) + ((context.firstline * context.width) >> 5);
    // Threads cover 8x4 pixel tiles: tx/ty is the position inside the tile.
    const int tx = threadIdx.x & 7, ty = threadIdx.x >> 3, tilesperline = context.width >> 3;
    const int xt = idx0 % tilesperline, yt = idx0 / tilesperline, px = (xt << 3) + tx, py = (yt << 2) + ty;
    // NOTE(review): pixel index is mirrored (numRays - 1 - ...), which flips
    // the image; presumably compensated for at presentation time — verify.
    const int pidx = numRays - 1 - (px + py * context.width);
    // Per-pixel RNG, reseeded with clock() so every frame draws fresh samples.
    RNG genrand( pidx, (clock() * pidx * 8191) ^ 140167 );
    const int spp = context.SampleCount;
    // u,v: screen coordinates roughly in [-0.5, 0.5); the
    // (width - height) * 0.5 shift on y corrects for the aspect ratio.
    const float rcpw = 1.0f / context.width, u = (float)px * rcpw - 0.5f, v = (float)(py + (context.width - context.height) * 0.5f) * rcpw - 0.5f;
    float3 E = make_float3( 0, 0, 0 ); // accumulated radiance for this pixel
    // trace path
    for( int sample = 0; sample < spp; sample++ )
    {
        // construct primary ray
        // r1/r2 jitter the pixel (anti-aliasing), r3/r4 jitter the lens
        // position (depth of field via context.LensSize).
        const float r1 = genrand(), r2 = genrand(), r3 = genrand(), r4 = genrand();
        float3 O = context.Translation + context.Left * context.LensSize * (r3 - 0.5f) + context.Up   * context.LensSize * (r4 - 0.5f);
        // D is first the point on the focal plane, then normalized into a
        // direction from the (jittered) lens origin.
        float3 D = context.Translation + (context.Forward + context.Left * context.tanFOV2 * (u + rcpw * r1) + context.Up * context.tanFOV2 * (v + rcpw * r2)) * context.FocalDist;
        D = normalize( D - O );
        // trace path
        float3 throughput = make_float3( 1, 1, 1 );
        int depth = 0;
        while (1)
        {
            // BC: barycentrics filled in by intersect; prim == -1 signals a miss.
            int prim = 0; float2 BC, UV = make_float2( 0, 0 ); float dist = 1000000; bool backfaced = false;
            intersect<false,true>( O, D, dist, BC, prim, backfaced );
            O += D * dist; // advance origin to the hit point
            if (prim == -1)
            {
                // Ray left the scene: terminate on the sky.
                E += throughput * GetSkySample( D );
                break;
            }
            Triangle& tri = context.Triangles[prim];
            TracerMaterial mat = context.Materials[tri.GetMaterialIdx()];
            if (mat.flags & TracerMaterial::EMITTER) // light
            {
                // No NEE in this kernel, so emitters are only picked up by
                // randomly hitting them.
                E += throughput * mat.EmissiveColor;
                break;
            }
            else // diffuse reflection
            {
                const float3 matcol = tri.GetMaterialColor( mat, BC, UV );
                // Flip the shading normal when we hit the back side.
                const float3 N = tri.GetNormal( mat, BC, UV ) * (backfaced ? -1 : 1 );
                D = normalize( RandomReflection( genrand, N ) ); // don't even trust DiffuseReflection
                // NOTE(review): no 1/pdf or brdf normalization here — presumably
                // RandomReflection's distribution makes matcol * cos the correct
                // unbiased estimator; confirm against its implementation.
                throughput *= matcol * dot( D, N );
            }
            O += D * EPSILON; // prevent intersection at dist = 0
            depth++;
            // Russian roulette after 4 bounces: 50% survival, throughput
            // doubled on survival to keep the estimator unbiased.
            if (depth > 3) { if (genrand() > 0.5f) break; throughput *= 2.0f; }
        }
    }
    // Average the spp samples into the render target.
    context.RenderTarget[pidx] = make_float4( E / (float)spp, 1 );
}
I am pasting this as-is; there's some Brigade-specific stuff in there as well as some dependencies, so ask if anything is unclear.
Then, we have a loop based on Novak's ideas. Instead of a path budget, it has a 'segment budget'. Paths are restarted when terminated, to keep the SIMT lanes occupied. Restarting is cheap, but happens typically in only a few threads in a warp. Here's the code:
Code: [LINK # Select all]#define TERMINATE { restart = true; continue; }
extern "C" __global__ void TracePixelSegment()
{
    // Segment-budget path tracer (Novak-style path regeneration): instead of
    // tracing spp complete paths, each thread spends a budget of spp*2 path
    // SEGMENTS and restarts a fresh primary ray whenever its current path
    // terminates, keeping SIMT lanes occupied. Adds next event estimation.
    // setup path
    const int numRays = context.width * context.height;
    const int idx0 = threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y) + ((context.firstline * context.width) >> 5);
    // 8x4 pixel tiles; same thread-to-pixel mapping as the reference kernel.
    const int tx = threadIdx.x & 7, ty = threadIdx.x >> 3, tilesperline = context.width >> 3;
    const int xt = idx0 % tilesperline, yt = idx0 / tilesperline, px = (xt << 3) + tx, py = (yt << 2) + ty;
    const int pidx = numRays - 1 - (px + py * context.width); // mirrored pixel index
    RNG genrand( pidx, (clock() * pidx * 8191) ^ 140167 );
    const int spp = context.SampleCount;
    const float rcpw = 1.0f / context.width, u = (float)px * rcpw - 0.5f, v = (float)(py + (context.width - context.height) * 0.5f) * rcpw - 0.5f;
    float3 E = make_float3( 0, 0, 0 ), throughput, O, D;
    // restart: request a new primary ray; firsthit: first segment of a path,
    // the only case in which emitters contribute directly (NEE covers the rest).
    bool restart = true, firsthit = true;
    int paths = 0, curdepth = 0;
    // trace path
#ifdef PURIST
    // Unbiased variant: run past the budget until the last started path ends.
#else
    // Default: hard budget; the last path may be cut short (biased, ~30% faster).
#endif
#ifdef PURIST
    for( int segment = 0; ((segment < spp * 2) || (!restart)); segment++ )
#else
    for( int segment = 0; (segment < spp * 2); segment++ )
#endif
    {
        if (restart)
        {
            // construct primary ray
            // r1/r2: pixel jitter (AA); r3/r4: lens jitter (depth of field).
            const float r1 = genrand(), r2 = genrand(), r3 = genrand(), r4 = genrand();
            O = context.Translation + context.Left * context.LensSize * (r3 - 0.5f) + context.Up   * context.LensSize * (r4 - 0.5f);
            D = context.Translation + (context.Forward + context.Left * context.tanFOV2 * (u + rcpw * r1) + context.Up * context.tanFOV2 * (v + rcpw * r2)) * context.FocalDist;
            D = normalize( D - O );
            firsthit = true, restart = false, throughput = make_float3( 1, 1, 1 ), curdepth = 0, paths++;
        }
        // trace path segment
        // NOTE(review): UV is never written before GetMaterialColor/GetNormal
        // read it below; the reference kernel zero-initializes it. Confirm the
        // getters treat it as an output (computed from BC), else this reads
        // an uninitialized value.
        int prim = 0; float2 UV, BC; float dist = 1000000; bool backfaced = false;
        O += D * EPSILON; // prevent intersection at dist = 0
        intersect<false,true>( O, D, dist, BC, prim, backfaced );
        O += D * dist; // advance to the hit point
        if (prim == -1)
        {
            // path left scene
            E += throughput * GetSkySample( D );
            TERMINATE;
        }
        Triangle& tri = context.Triangles[prim];
        TracerMaterial mat = context.Materials[tri.GetMaterialIdx()];
        if (mat.flags & TracerMaterial::EMITTER)
        {
            // path arrived at light
            // Only counted on the first hit (and front face) to avoid double
            // counting with the NEE contribution added below.
            if (firsthit & (!backfaced)) E += throughput * mat.EmissiveColor;
            TERMINATE;
        }
        const float3 matcol = tri.GetMaterialColor( mat, BC, UV );
        const float3 N = tri.GetNormal( mat, BC, UV ) * (backfaced ? -1 : 1 );
        const float3 wo = D * -1.0f;
        // sample direct lighting using next event estimation (FLAWLESS)
        float3 L, LN, LColor;
        const float r8 = genrand();
        float area;
        // L: point on light (becomes direction below), LN: light normal,
        // area: area of the sampled light.
        RandomPointOnLight( L, LN, LColor, r8, genrand, area );
        L -= O; // vector from hit point to light sample
        float sqdist = dot( L, L ), ldist = sqrtf( sqdist );
        L *= 1.0f / ldist;
        const float NdotL = dot( N, L ), LNdotL = -dot( LN, L );
        if ((NdotL > 0) && (LNdotL > 0))
        {
            // Shadow ray, shortened at both ends to dodge self-intersection.
            bool backface; int sprim; float2 SBC; ldist -= 2 * EPSILON;
            intersect<true,false>( O + L * EPSILON, L, ldist, SBC, sprim, backface );
            if (sprim == -1)
            {
                // Area pdf converted to a solid-angle pdf; lightcount folds in
                // the uniform choice of light.
                const float lightPdf = (LNdotL > EPSILON) ? (sqdist / (LNdotL * area * context.lightcount)) : 0.0f;
                // NOTE(review): matcol * INVPI * 0.5f is used as the diffuse
                // brdf here and below — confirm the 0.5f matches the LambertPdf
                // convention used in this codebase.
                if (lightPdf > 0) E += throughput * matcol * INVPI * 0.5f * LColor * NdotL / lightPdf;
            }
        }
        // russian roulette
        if (curdepth > 1)
        {
            // Survival probability from throughput luminance, clamped to
            // [EPSILON, 0.5]; 1/p compensation keeps the estimator unbiased.
            const float p = max( EPSILON, min( 0.5f, (throughput.x + throughput.y + throughput.z) * 0.333f ) ); // condition taken from pbrt
            if (genrand() > p) TERMINATE;
            throughput /= p;
        }
        // do a lambert reflection (FLAWLESS)
        const float r6 = genrand(), r7 = genrand();
        D = DiffuseReflection( r6, r7, genrand, N );
        const float bsdfPdf = LambertPdf( D, N );
        const float3 f = matcol * INVPI * 0.5f;
        if (bsdfPdf < EPSILON) TERMINATE; // avoid division blow-up below
        throughput *= f * dot( D, N ) / bsdfPdf;
        firsthit = false;
        curdepth++;
    }
    // Average over the number of paths actually started (not spp).
    context.RenderTarget[pidx] = make_float4( E * (1.0f / (float)paths), 1.0f );
}
Not much to explain, but I would like to point out how simple an actual implementation of Novak's ideas can be. There's no admin code here, and this runs very efficiently. The only problem is that segments of different depths are traced simultaneously, which reduces 'ray coherence', which is something you ideally want to have even on the GPU. The last started path is completed even when the segment budget is depleted; this is to prevent bias. If we don't care about bias, we can make the path tracer 30% or so faster by skipping this. ('Purist').
Then, the MIS code. Not much new here, except that direct light is now sampled by two rays: one directly to the light, and a 'random bounce' based on the brdf. Because the second ray tends to hit other geometry than the light, we reuse this ray for the diffuse bounce. MIS is thus spread over two loop iterations. This complicates the loop somewhat.
Code: [LINK # Select all]extern "C" __global__ void TracePixelMIS()
{
    // Segment-budget path tracer with multiple importance sampling. Direct
    // light is sampled two ways: an explicit shadow ray (NEE), and the brdf
    // bounce ray itself. Since the bounce ray is reused as the extension ray,
    // its MIS contribution is deferred ('postponed') to the NEXT loop
    // iteration, when we learn what it hit. Also adds specular/dielectric
    // materials with Beer-Lambert absorption. Requires the TERMINATE macro.
    // setup path
    const int idx0 = threadIdx.y + blockDim.y * (blockIdx.x + gridDim.x * blockIdx.y) + ((context.firstline * context.width) >> 5);
    const int tx = threadIdx.x & 7, ty = threadIdx.x >> 3, tilesperline = context.width >> 3;
    const int xt = idx0 % tilesperline, yt = idx0 / tilesperline;
    int px = (xt << 3) + tx, py = (yt << 2) + ty;
    const int pidx =(px + py * context.width);
    // Mirror the pixel AFTER computing pidx, so u/v sample the flipped image.
    // NOTE(review): width - px (not width - 1 - px) yields 1..width, a
    // one-pixel shift vs. the other kernels' mirroring — confirm intended.
    px = context.width - px;
    py = context.height - py;
    RNG genrand( pidx, (clock() * pidx * 8191) ^ 140167 );
    const int spp = context.SampleCount;
    const float rcpw = 1.0f / context.width, u = (float)px * rcpw - 0.5f, v = (float)(py + (context.width - context.height) * 0.5f) * rcpw - 0.5f;
    // postComp/postThrough/postPdf: state for the deferred brdf-sampling MIS
    // term, consumed on the next segment if the bounce ray lands on a light.
    float3 E = make_float3( 0, 0, 0 ), throughput, O, D, postComp, postThrough;
    bool restart = true, firsthit = true, postponed = false;
    float postPdf;
    int rays = 0; // ray counter, reported via the alpha channel (see bottom)
    // Beer-Lambert coefficients of the dielectric the ray is currently inside.
    float3 lastabsorbance;
    int paths = 0, curdepth = 0;
    // trace path
#ifdef PURIST
    for( int segment = 0; ((segment < spp * 2) || (!restart)); segment++ )
#else
    for( int segment = 0; segment < (spp * 2); segment++ )
#endif
    {
        if (restart)
        {
            // construct primary ray
            // r1/r2: pixel jitter (AA); r3/r4: lens jitter (depth of field).
            const float r1 = genrand(), r2 = genrand(), r3 = genrand(), r4 = genrand();
            O = context.Translation + context.Left * context.LensSize * (r3 - 0.5f) + context.Up   * context.LensSize * (r4 - 0.5f);
            D = context.Translation + (context.Forward + context.Left * context.tanFOV2 * (u + rcpw * r1) + context.Up * context.tanFOV2 * (v + rcpw * r2)) * context.FocalDist;
            D = normalize( D - O );
            lastabsorbance = make_float3(0, 0, 0);
            firsthit = true, restart = false, postponed = false, throughput = make_float3( 1, 1, 1 ), curdepth = 0, paths++;
        }
        // trace path segment
        // NOTE(review): UV is read by GetMaterialColor/GetNormal below but
        // never written here (the reference kernel zero-inits it) — confirm
        // the getters treat it as an output.
        int prim = 0; float2 UV, BC; float dist = 1000000; bool backfaced = false;
        O += D * EPSILON; // prevent intersection at dist = 0
        intersect<false,true>( O, D, dist, BC, prim, backfaced ); rays++;
        O += D * dist; // advance to the hit point
        if (prim == -1)
        {
            // path left scene
            E += throughput * GetSkySample( D );
            TERMINATE;
        }
        // Beer-Lambert absorption over the distance traveled inside the
        // dielectric entered on the previous segment.
        if (lastabsorbance.x || lastabsorbance.y || lastabsorbance.z)
        {
            throughput *= make_float3(
                __expf(lastabsorbance.x * -dist),
                __expf(lastabsorbance.y * -dist),
                __expf(lastabsorbance.z * -dist));
            lastabsorbance = make_float3(0, 0, 0);
        }
        Triangle& tri = context.Triangles[prim];
        const TracerMaterial mat = context.Materials[tri.GetMaterialIdx()];
        if (mat.flags & TracerMaterial::EMITTER)
        {
            // path arrived at light
            if (postponed)
            {
                // Deferred MIS term: the previous segment's brdf-sampled bounce
                // ray hit a light. NOTE(review): the light is identified by
                // comparing its emissive color to postComp (the NEE-sampled
                // light's color) — this misbehaves when distinct lights share
                // a color; confirm acceptable.
                if ((mat.EmissiveColor.x == postComp.x) && (mat.EmissiveColor.y == postComp.y) && (mat.EmissiveColor.z == postComp.z))
                {
                    // Light pdf over solid angle for the hit point we arrived at.
                    const float den = (tri.area * context.lightcount * -dot( tri.GetNormal(), D ));
                    const float lightPdf = (den > 0) ? ((dist * dist) / den) : 0.0f;
                    if (lightPdf > 0)
                    {
                        // Power heuristic balances this against the NEE sample.
                        const float weight = PowerHeuristic( postPdf, lightPdf );
                        E += postThrough * mat.EmissiveColor * weight / postPdf;
                    }
                }
                postponed = false;
            }
            // Direct emitter hit only counts on the first path segment; later
            // hits are covered by the NEE + deferred-MIS machinery.
            if (firsthit & (!backfaced)) E += throughput * mat.EmissiveColor;
            TERMINATE;
        }
        const float3 matcol = tri.GetMaterialColor( mat, BC, UV );
        const float3 N = tri.GetNormal( mat, BC, UV ) * (backfaced ? -1 : 1 );
        if (mat.Specularity > 0)
        {
            // When the material absorbs, color comes from the Beer term
            // instead of a surface tint.
            if (!mat.Absorbance) throughput *= matcol;
            // handle pure specular materials and dielectrics
            if (mat.Transparency > genrand())
            {
                // dielectric
                float nt = mat.RefractionIndex;
                if (backfaced) nt = 1.0f / nt; // leaving the medium
                const float nnt = 1.0f / nt, ddn = dot( D, N );
                const float cos2t = 1 - nnt * nnt * (1 - ddn * ddn);
                if (cos2t < 0) D = reflect( D, N ); /* TIR */ else
                {
                    const float3 R = normalize( D * nnt - N * (ddn * nnt + sqrtf( cos2t )) );
                    // Schlick's approximation of the Fresnel reflectance Re.
                    const float a = nt - 1, b = nt + 1, R0 = (a * a) / (b * b);
                    const float c = 1 + ddn, Re = R0 + (1 - R0) * c * c * c * c * c;
                    // Stochastically pick reflection vs refraction; P biased
                    // toward 0.5 to keep variance in check, compensated below.
                    const float P = .25f + .5f * Re;
                    const bool pick = genrand() < P;
                    throughput *= pick ? (Re / P) : ((1 - Re) / (1 - P));
                    D = pick ? reflect( D, N ) : R;

                    // Entering the medium: arm Beer absorption for the next segment.
                    if (mat.Absorbance && !backfaced) lastabsorbance = (make_float3(1, 1, 1) - matcol) * mat.Absorbance;
                }
            }
            else D = reflect( D, N ); // specular bounce
        }
        else
        {
            // handle diffuse materials
            const float3 wo = D * -1.0f;
            // sample direct lighting using next event estimation and MIS (FLAWLESS)
            float3 L, LN, LColor;
            const float r8 = genrand();
            float area;
            RandomPointOnLight( L, LN, LColor, r8, genrand, area );
            L -= O; // vector from hit point to light sample
            float sqdist = dot( L, L ), ldist = sqrtf( sqdist );
            L *= 1.0f / ldist;
            const float NdotL = dot( N, L ), LNdotL = -dot( LN, L );
            if ((NdotL > 0) && (LNdotL > 0))
            {
                // Shadow ray, shortened 1% to avoid hitting the light itself.
                bool backface; int sprim; float2 SBC; ldist *= 0.99f;
                intersect<true,false>( O + L * EPSILON, L, ldist, SBC, sprim, backface ); rays++;
                if (sprim == -1)
                {
                    // Area pdf -> solid-angle pdf; lightcount folds in the
                    // uniform choice of light.
                    const float lightPdf = (LNdotL > EPSILON) ? (sqdist / (LNdotL * area * context.lightcount)) : 0.0f;
                    float bsdfPdf;
                    bsdfPdf = LambertPdf( L, N );
                    if ((lightPdf > 0) && (bsdfPdf > 0))
                    {
                        const float3 f = matcol * INVPI * 0.5f; // diffuse brdf
                        const float weight = PowerHeuristic( lightPdf, bsdfPdf );
                        E += throughput * f * LColor * weight * NdotL / lightPdf;
                    }
                }
            }
            // bsdf sampling with MIS (FLAWLESS)
            const float r3 = genrand(), r4 = genrand();
            float3 f;
            D = DiffuseReflection( r3, r4, genrand, N ), postPdf = LambertPdf( D, N ), f = matcol * INVPI * 0.5f;
            // NOTE(review): 0.01f pdf cutoff (vs. EPSILON elsewhere) clips
            // grazing samples and the author flagged it — confirm.
            if (postPdf <= 0.01f) TERMINATE; // hmm
            // State for the deferred MIS term, resolved on the next segment if
            // this bounce ray lands on the light sampled above.
            postThrough = throughput * dot( D, N ) * matcol * INVPI * 0.5f;
            // russian roulette
            if (curdepth > 1)
            {
                // Survival probability from throughput, clamped to [EPSILON, 0.5].
                float p = max( EPSILON, min( 0.5f, (throughput.x + throughput.y + throughput.z) * 0.333f ) ); // condition taken from pbrt
                if (genrand() > p) TERMINATE;
                throughput /= p;
            }
            postponed = true, postComp = LColor;
            throughput *= f * dot( D, N ) / postPdf;
            firsthit = false;
        }
        curdepth++;
    }
    // Average over paths started; the alpha channel carries the raw bit
    // pattern of the ray counter (type pun) for statistics.
    context.RenderTarget[pidx] = make_float4( E * (1.0f / (float)paths), *(float*)&rays );
}
In all three kernels, Aila and Laine code is used for actual ray / scene intersection. We also use their BVH node layout, as well as Woop's triangle layout. The BVH is stored in texture memory. Triangle data and materials are stored in global memory. We do not use any advanced CUDA features, and no inter-thread communication, so this is pretty straight-forward code for a GPU and should port to ATI easily.
There is some more stuff, like the handling of light sources, but I discussed that before.
I have a question about the MIS, but I will post that in a separate thread.
- Jacco.