smallpt: Global Illumination in 99 lines of C++

(L) [2008/11/08] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Finally releasing a tiny renderer I've been working off and on over the past year and half: [LINK http://kevinbeason.com/smallpt/ smallpt]. It renders this image (click for full size):
[IMG #1 Image]
* 99 lines of C++ (open source)
* Path tracing (unbiased)
* Full multi-threading via GOMP
* Renders a slightly modified Cornell box (made from spheres)
* Compiles to 4049 bytes with some minor changes
* Takes one command line argument, the number of samples per pixel
* Writes the resulting image to "image.ppm"
* [LINK http://www.kevinbeason.com/smallpt/ smallpt web site]
Hopefully this can serve as a minimal example of a global illumination renderer to anyone just starting off.
Full source including scene description:

#include <math.h>   // smallpt, a Path Tracer by Kevin Beason, 2008
#include <stdlib.h> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <stdio.h>  //        Remove "-fopenmp" for g++ version < 4.2
struct Vec {        // Usage: time ./smallpt 5000 && xv image.ppm
  double x, y, z;                  // position, also color (r,g,b)
  Vec(double x_=0, double y_=0, double z_=0){ x=x_; y=y_; z=z_; }
  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
  Vec& norm(){ return *this = *this * (1/sqrt(x*x+y*y+z*z)); }
  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // cross:
  Vec operator%(Vec&b){return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x);}
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR };  // material types, used in radiance()
struct Sphere {
  double rad;       // radius
  Vec p, e, c;      // position, emission, color
  Refl_t refl;      // reflection type (DIFFuse, SPECular, REFRactive)
  Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
    rad(rad_), p(p_), e(e_), c(c_), refl(refl_) {}
  double intersect(const Ray &r) const { // returns distance, 0 if nohit
    Vec op = p-r.o; // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
    double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
    if (det<0) return 0; else det=sqrt(det);
    return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
  }
};
Sphere spheres[] = {//Scene: radius, position, emission, color, material
  Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
  Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
  Sphere(1e5, Vec(50,40.8, 1e5),     Vec(),Vec(.75,.75,.75),DIFF),//Back
  Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(),           DIFF),//Frnt
  Sphere(1e5, Vec(50, 1e5, 81.6),    Vec(),Vec(.75,.75,.75),DIFF),//Botm
  Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
  Sphere(16.5,Vec(27,16.5,47),       Vec(),Vec(1,1,1)*.999, SPEC),//Mirr
  Sphere(16.5,Vec(73,16.5,78),       Vec(),Vec(1,1,1)*.999, REFR),//Glas
  Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12),  Vec(), DIFF) //Lite
};
inline double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; }
inline int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); }
inline bool intersect(const Ray &r, double &t, int &id){
  double n=sizeof(spheres)/sizeof(Sphere), d, inf=t=1e20;
  for(int i=int(n);i--;) if((d=spheres[i].intersect(r))&&d<t){t=d;id=i;}
  return t<inf;
}
Vec radiance(const Ray &r, int depth, unsigned short *Xi){
  double t;                               // distance to intersection
  int id=0;                               // id of intersected object
  if (!intersect(r, t, id)) return Vec(); // if miss, return black
  const Sphere &obj = spheres[id];        // the hit object
  Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
  double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
  if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return obj.e; //R.R.
  if (obj.refl == DIFF){                  // Ideal DIFFUSE reflection
    double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
    Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
    Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();
    return obj.e + f.mult(radiance(Ray(x,d),depth,Xi));
  } else if (obj.refl == SPEC)            // Ideal SPECULAR reflection
    return obj.e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
  Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
  bool into = n.dot(nl)>0;                // Ray from outside going in?
  double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
  if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
    return obj.e + f.mult(radiance(reflRay,depth,Xi));
  Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
  double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
  double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
  return obj.e + f.mult(depth>2 ? (erand48(Xi)<P ?   // Russian roulette
    radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
    radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
}
int main(int argc, char *argv[]){
  int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples
  Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
  Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP
  for (int y=0; y<h; y++){                       // Loop over image rows
    fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
    for (unsigned short x=0, Xi[3]={0,0,y*y*y}; x<w; x++)   // Loop cols
      for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++)     // 2x2 subpixel rows
        for (int sx=0; sx<2; sx++, r=Vec()){        // 2x2 subpixel cols
          for (int s=0; s<samps; s++){
            double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
            double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
            Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
                    cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
            r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
          } // Camera rays are pushed ^^^^^ forward to start in interior
          c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
        }
  }
  FILE *f = fopen("image.ppm", "w");         // Write image to PPM file.
  fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
  for (int i=0; i<w*h; i++)
    fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

More info, scenes, and syntax-highlighted source are available on the [LINK http://www.kevinbeason.com/smallpt/ smallpt site].
Comments welcome! I'm particularly interested in code fixes and optimization. Also I'm curious if this is really "unbiased".
Despite it's diminutive size it was a lot of work! Thank you to everyone on ompf.org, you have all been an inspiration.
And lastly, the 1024x768 image shown was computed with 25000 samples per pixel in 10.3 hours using 4 threads on an Intel Q6600 Core 2 Quad box running 32-bit Ubuntu Linux (gutsy) and GCC 4.2. So, obviously, it's not real time [SMILEY :wink:]
Update: switched bbcodes to get syntax highlighting

[IMG #1]:Not scraped: https://web.archive.org/web/20110212052444im_/http://kevinbeason.com/smallpt/result_25k_half.jpg

(L) [2008/11/09] [lomidrevo] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

nice stuff [SMILEY ;)] especially i like those sphere walls [SMILEY :)]

(L) [2008/11/09] [pseudo-tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Neat, really really neat.
Since you've asked for (random) comments, on first glance, two things comes to mind:
Since you have those externally visible functions hanging there, and better than spuriously sprinkle some 'static' throughout why don't you wrap them all in an anonymous namespace; should help. In my experience it pays to use more 'functional' idioms - in particular to avoid explicit struct-to-struct assignments: initialize via ctor and rewrite norm() so it's no more in-place; code shouldn't be any longer and i expect the codegen to be slightly better.

(L) [2008/11/09] [Phantom] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Nice contribution, should be very useful to newbies indeed!

(L) [2008/11/09] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Bah. Even if i got slightly better code from
#include <cmath>   // smallpt, a Path Tracer by Kevin Beason, 2008
#include <cstdlib> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <cstdio>  //        Remove "-fopenmp" for g++ version < 4.2
struct Vec {        // Usage: time ./smallpt 5000 && xv image.ppm
  double x, y, z;                  // position, also color (r,g,b)
  Vec(double x_=0, double y_=0, double z_=0) : x(x_),y(y_),z(z_) {}
  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
  Vec norm() const { return *this * (1/sqrt(dot(*this))); }
  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; }
  Vec operator%(const Vec &b) const  // cross:
    { return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x); }
};
struct Ray { Vec o, d; Ray(Vec o_, Vec d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR };  // material types, used in radiance()
struct Sphere {
  Vec p, e, c;      // position, emission, color
  double rad;       // radius
  Refl_t refl;      // reflection type (DIFFuse, SPECular, REFRactive)
  Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
    p(p_), e(e_), c(c_), rad(rad_), refl(refl_) {}
  double intersect(const Ray &r) const { // returns distance, 0 if nohit
    Vec op(p-r.o); // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
    double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
    if (det<0) return 0; else det=sqrt(det);
    return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
  }
};
namespace {
const Sphere spheres[] = {//Scene: radius, position, emission, color, material
  Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
  Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
  Sphere(1e5, Vec(50,40.8, 1e5),     Vec(),Vec(.75,.75,.75),DIFF),//Back
  Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(),           DIFF),//Frnt
  Sphere(1e5, Vec(50, 1e5, 81.6),    Vec(),Vec(.75,.75,.75),DIFF),//Botm
  Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
  Sphere(16.5,Vec(27,16.5,47),       Vec(),Vec(1,1,1)*.999, SPEC),//Mirr
  Sphere(16.5,Vec(73,16.5,78),       Vec(),Vec(1,1,1)*.999, REFR),//Glas
  Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12),  Vec(), DIFF) //Lite
};
double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; }
int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); }
const Sphere *intersect(const Ray &r, double &t){
  const Sphere *id = 0; for(size_t i=sizeof(spheres)/sizeof(Sphere); i--;)
    if (double d = spheres[i].intersect(r)) if (d < t) { t=d; id=spheres+i; }
  return id;
}
Vec radiance(const Ray &r, unsigned depth, unsigned short *Xi){
  double t = 1e20;                        // distance to intersection
  if (const Sphere *obj = intersect(r, t)) { // the hit object
    Vec x(r.o+r.d*t), n((x-obj->p).norm()), nl(n.dot(r.d)<0?n:n*-1), f(obj->c);
    double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
    if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return obj->e; //R.R.
    if (obj->refl == DIFF){                  // Ideal DIFFUSE reflection
      double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
      Vec w(nl), u(((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm()), v(w%u),
        d((u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm());
      return obj->e + f.mult(radiance(Ray(x,d),depth,Xi));
    } else if (obj->refl == SPEC)            // Ideal SPECULAR reflection
      return obj->e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
    Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
    bool into = n.dot(nl)>0;                // Ray from outside going in?
    double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
    if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
      return obj->e + f.mult(radiance(reflRay,depth,Xi));
    Vec tdir((r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm());
    double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
    double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
    return obj->e + f.mult(depth>2 ? (erand48(Xi)<P ?   // Russian roulette
      radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
      radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
  } else return Vec(); // if miss, return black
}
} // anon namespace
int main(int argc, char *argv[]){
  int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples
  Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
  Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP
  for (int y=0; y<h; y++){                       // Loop over image rows
    // fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
    for (unsigned short x=0, Xi[3]={0,0,y*y*y}; x<w; x++)   // Loop cols
      for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++)     // 2x2 subpixel rows
        for (int sx=0; sx<2; sx++, r=Vec()){        // 2x2 subpixel cols
          for (int s=0; s<samps; s++){
            double r1=2*erand48(Xi), dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
            double r2=2*erand48(Xi), dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
            Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
                    cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
            r = r + radiance(Ray(cam.o+d*140,d.norm()),0,Xi)*(1./samps);
          } // Camera rays are pushed ^^^^^ forward to start in interior
          c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
        }
  }
  FILE *f = fopen("image.ppm", "w");         // Write image to PPM file.
  fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
  for (int i=0; i<w*h; i++)
    fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}
valgrind made it clear i was stupid to expect any practical improvement when rand48 calls amount to about 20% - with some of the I/O culled out - of the run-time.

(L) [2008/11/10] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> lomidrevo wrote:nice stuff especially i like those sphere walls
Thanks! I forget where I got the idea from but there has been similar sphere-approximation stuff here on ompf, like [LINK http://ompf.org/forum/viewtopic.php?f=4&t=944 this thread].
>> pseudo-tbp wrote:Neat, really really neat.
Thanks!
>> Phantom wrote:Nice contribution, should be very useful to newbies indeed!
Thank you! I thought I knew everything when I started but while ironing everything out I definitely learned some things.

(L) [2008/11/10] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> pseudo-tbp wrote:Neat, really really neat.
Since you've asked for (random) comments, on first glance, two things comes to mind:
Since you have those externally visible functions hanging there, and better than spuriously sprinkle some 'static' throughout why don't you wrap them all in an anonymous namespace; should help. In my experience it pays to use more 'functional' idioms - in particular to avoid explicit struct-to-struct assignments: initialize via ctor and rewrite norm() so it's no more in-place; code shouldn't be any longer and i expect the codegen to be slightly better.
You know you were very influential in the whole idea, so hearing your comments is much appreciated (as always [SMILEY :D]).
I was going to say I would try it out but looks like you did already, so thank you! Just glancing over your code I see somethings I didn't know were possible, like declaring variable in an if (). I will look at your code more in depth and reply later.

(L) [2008/11/10] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> beason wrote:I was going to say I would try it out but looks like you did already, so thank you!
Well, i had to take my own medicine at some point. In fact i should have done that before posting [SMILEY ;)]
>> beason wrote:Just glancing over your code I see somethings I didn't know were possible, like declaring variable in an if (). I will look at your code more in depth and reply later.
Can't say those are the wisest use of...wait a sec... [LINK http://dev.feuvan.net/docs/isocpp/stmt.html stmt 6.4.0.3/6.4.0.4] but i had to, you know, get back some precious lines. Anyway, don't look at it too closely, that was just quickly put together so i could, again, make a fool of myself in public (the whole exercise was useless). Hopefully i'm out of dumb ideas for now.

(L) [2008/11/10] [phresnel] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Very very neat, Beason! Congratulations [SMILEY :)]
I played with the idea of doing GI-RT in 100 lines, too, but I was too lazy. Glad somebody else did the job!
I really love that christmas/tree-image on your site, and your images really prove that spheres are the ultimate primitive. I think that sphere-thing would have been the point where I would have failed, I would have done it with a AA-Plane + Sphere combination.
There's also an image showing some gradient in the sky, how (and where in your code?) have you implemented that?
edit: I mean this one: [IMG #1 Image]

[IMG #1]:

(L) [2008/11/10] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> phresnel wrote:Very very neat, Beason! Congratulations
I played with the idea of doing GI-RT in 100 lines, too, but I was too lazy. Glad somebody else did the job!
I really love that christmas/tree-image on your site, and your images really prove that spheres are the ultimate primitive. I think that sphere-thing would have been the point where I would have failed, I would have done it with a AA-Plane + Sphere combination.
There's also an image showing some gradient in the sky, how (and where in your code?) have you implemented that?
edit: I mean this one:
Hey phresnel, thank you!! [SMILEY :)]
I should have given you props for the idea for that image! It is based on the picogen.org header. The gradient is a hack.
It's an emissive ring near the horizon. The ground is like the snow on the christmas tree... there is a larger sphere below it which it peeks out from. That larger sphere is emissive and only visible near the edge where it touches the sky sphere, and it lights the sky sphere up.
Here is the code ("horizon brightener"):
Code: [LINK # Select all]Vec Cen(50,40.8,-860);
Sphere spheres[] = {//Scene: radius, position, emission, color, material
Sphere(1600, Vec(1,0,2)*3000, Vec(1,.9,.8)*1.2e1*1.56*2,Vec(), DIFF), // sun
Sphere(1560, Vec(1,0,2)*3500,Vec(1,.5,.05)*4.8e1*1.56*2, Vec(), DIFF), // horizon sun2
Sphere(10000,Cen+Vec(0,0,-200), Vec(0.00063842, 0.02001478, 0.28923243)*6e-2*8, Vec(.7,.7,1)*.25, DIFF), // sky

Sphere(100000, Vec(50, -100000, 0), Vec(),Vec(.3,.3,.3),DIFF), // grnd
Sphere(110000, Vec(50, -110048.5, 0), Vec(.9,.5,.05)*4,Vec(),DIFF),// horizon brightener
Sphere(4e4, Vec(50, -4e4-30, -3000), Vec(),Vec(.2,.2,.2),DIFF),// mountains
Sphere(26.5,Vec(22,26.5,42), Vec(),Vec(1,1,1)*.596, SPEC), // white Mirr
Sphere(13,Vec(75,13,82), Vec(),Vec(.96,.96,.96)*.96, REFR),// Glas
Sphere(22,Vec(87,22,24), Vec(),Vec(.6,.6,.6)*.696, REFR) // Glas2
};
The Cen thing is probably a mistake and not important. All the colors are hand picked, nothing physically-based.
Since you mentioned it, here is the tree scene:
Code: [LINK # Select all]Vec tc(0.0588, 0.361, 0.0941); // tree color
Vec sc = Vec(1,1,1)*.7; // snow color
Sphere spheres[] = {//Scene: radius, position, emission, color, material
Sphere(1e5, Vec(50, 1e5+130, 0), Vec(1,1,1)*1.3,Vec(),DIFF), //lite
Sphere(1e2, Vec(50, -1e2+2, 47), Vec(),Vec(1,1,1)*.7,DIFF), //grnd

Sphere(1e4, Vec(50, -30, 300)+Vec(-sin(50*M_PI/180),0,cos(50*M_PI/180))*1e4, Vec(), Vec(1,1,1)*.99,SPEC),// mirr L
Sphere(1e4, Vec(50, -30, 300)+Vec(sin(50*M_PI/180),0,cos(50*M_PI/180))*1e4, Vec(), Vec(1,1,1)*.99,SPEC),// mirr R
Sphere(1e4, Vec(50, -30, -50)+Vec(-sin(30*M_PI/180),0,-cos(30*M_PI/180))*1e4,Vec(), Vec(1,1,1)*.99,SPEC),// mirr FL
Sphere(1e4, Vec(50, -30, -50)+Vec(sin(30*M_PI/180),0,-cos(30*M_PI/180))*1e4, Vec(), Vec(1,1,1)*.99,SPEC),// mirr FR

Sphere(4, Vec(50,6*.6,47), Vec(),Vec(.13,.066,.033), DIFF),//"tree" trunk
Sphere(16,Vec(50,6*2+16*.6,47), Vec(), tc, DIFF),//"tree"
Sphere(11,Vec(50,6*2+16*.6*2+11*.6,47), Vec(), tc, DIFF),//"tree"
Sphere(7, Vec(50,6*2+16*.6*2+11*.6*2+7*.6,47), Vec(), tc, DIFF),//"tree"
Sphere(15.5,Vec(50,1.8+6*2+16*.6,47), Vec(), sc, DIFF),//"tree" snow
Sphere(10.5,Vec(50,1.8+6*2+16*.6*2+11*.6,47), Vec(), sc, DIFF),//"tree" snow
Sphere(6.5, Vec(50,1.8+6*2+16*.6*2+11*.6*2+7*.6,47), Vec(), sc, DIFF),//"tree" snow
};
As for plane-test, yeah, size requirements pretty much rule it out (maybe not... some lines can be freed up by manually inlining the intersect() routine into radiance()).

(L) [2008/11/10] [Screeb] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Really cool, but I'm sure anyone would have given you slack if you'd had empty lines in there to make it easier to read and still called it "99 lines" [SMILEY :wink:] Especially if you want to help people who are just starting off. Nothing worse than trying to understand code that's "badly" formatted.

(L) [2008/11/11] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Nothing's like a template formulation to get on the good foot early in the morning:
struct intersector_t {
  const Sphere *obj; double t;
  intersector_t(const Ray &r) : obj(0), t(1e20) { next<sizeof(spheres)/sizeof(Sphere)>(r); }
  template<size_t num> void next(const Ray &r) {
    enum { id = num - 1 }; const Sphere &s(spheres[id]);
    Vec op(s.p-r.o); // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
    double eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+s.rad*s.rad, d=sqrt(det);
    if (det>=0 && (d=b-copysign(d,b-d))>eps && d<t) { t=d; obj=spheres+id; }
    next<id>(r);
  }
  operator bool() const { return obj != 0; }
  const Sphere *operator ->() const { return obj; }
};
template<> void intersector_t::next<0>(const Ray &r) { (void) r; }
Vec radiance(const Ray &r, unsigned depth, unsigned short *Xi){
  if (intersector_t hit = intersector_t(r)) { // the hit object
    Vec x(r.o+r.d*hit.t), n((x-hit->p).norm()), nl(n.dot(r.d)<0?n:n*-1), f(hit->c);
    double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
    if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return hit->e; //R.R.
    if (hit->refl == DIFF){                  // Ideal DIFFUSE reflection
      double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
      Vec w(nl), u(((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm()), v(w%u),
        d((u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm());
      return hit->e + f.mult(radiance(Ray(x,d),depth,Xi));
    } else if (hit->refl == SPEC)            // Ideal SPECULAR reflection
      return hit->e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,Xi));
    Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
    bool into = n.dot(nl)>0;                // Ray from outside going in?
    double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
    if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
      return hit->e + f.mult(radiance(reflRay,depth,Xi));
    Vec tdir((r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm());
    double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
    double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
    return hit->e + f.mult(depth>2 ? (erand48(Xi)<P ?   // Russian roulette
      radiance(reflRay,depth,Xi)*RP:radiance(Ray(x,tdir),depth,Xi)*TP) :
      radiance(reflRay,depth,Xi)*Re+radiance(Ray(x,tdir),depth,Xi)*Tr);
  } else return Vec(); // if miss, return black
}
It's, again, rather useless being a draw LOC (hmm, not sure) and run-time wise... but then if (yeah, right) you pay to expand that copysign and force inlining, it's a tad faster [SMILEY :P]
Initially i was trying to make some space to fit in a RNG (so among other things it could be ported to msvc), but apparently that's not the right angle...

(L) [2008/11/12] [Ono-Sendai] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Very cool demo, well done [SMILEY :)]

(L) [2008/12/02] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Keeping up with the silly experiment tradition, i thought i could test msvc9.sp1 codegen some more (in a nutshell, it sucks) with this thing, so i first had to graft some small & decent RNG onto it. I made no attempt to obfuscate compact it any more because - euphemism ahead - i'm not quite sure of the soundness of what i did, even if it Seems-To-Work-Just-Fine[tm]:
  1 is excluded from possible values, but does it really matter?  i'm uncertain what's the proper way to compose those 2*random 32bits into 53.  all that decorrelation business. I'd appreciate an authoritative answer for either [SMILEY ;)]
Anyway,
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
template<typename T, typename U> static T bit_cast(const U &val) { union { U u; T t; } bits = { val }; return bits.t; }
// Marsaglia KISS RNG (combines 3 simpler RNG to get a decent period).
struct kiss_t {
  uint32_t x, y, z, c;
  // c doesn't need to be re-seeded but must be < 698769069; y must be non-zero; so...
  kiss_t(uint32_t seed1=123456789, uint32_t seed2=362436000) : x(seed1), y(362436000), z(seed2), c(7654321) {}
  uint32_t next() {
    x = 69069*x + 12345; y ^= y<<13; y ^= y>>17; y ^= y<<5;
    uint64_t t = 698769069ull*z + c;
    c = t>>32; z = t;
    return x + y + z;
  }
  operator double() {
    // not quite sure that's the right way to mix to get 53 bits worth of randomness
    // and unlike erand48, we return [0, 1)
    return bit_cast<double>((uint64_t(next())^(uint64_t(next())<<(53-32))) | bit_cast<uint64_t>(1.)) - 1.;
  }
};
struct drand48_t {
  unsigned short Xi[3];
  drand48_t(uint32_t seed) { Xi[0] = 0; Xi[1] = 0; Xi[2] = seed; }
  operator double() { return erand48(Xi); }
};
typedef kiss_t rng_t;
//typedef drand48_t rng_t;

#define _CRT_SECURE_NO_WARNINGS // pff
#define _USE_MATH_DEFINES       // M_PI
#include <math.h>
//#include <cmath>   // smallpt, a Path Tracer by Kevin Beason, 2008
#include <cstdlib> // Make : g++ -O3 -fopenmp smallpt.cpp -o smallpt
#include <cstdio>  //        Remove "-fopenmp" for g++ version < 4.2
struct Vec {        // Usage: time ./smallpt 5000 && xv image.ppm
  double x, y, z;                  // position, also color (r,g,b)
  Vec(double x_=0, double y_=0, double z_=0) : x(x_),y(y_),z(z_) {}
  Vec operator+(const Vec &b) const { return Vec(x+b.x,y+b.y,z+b.z); }
  Vec operator-(const Vec &b) const { return Vec(x-b.x,y-b.y,z-b.z); }
  Vec operator*(double b) const { return Vec(x*b,y*b,z*b); }
  Vec mult(const Vec &b) const { return Vec(x*b.x,y*b.y,z*b.z); }
  Vec norm() const { return *this * (1/sqrt(dot(*this))); }
  double dot(const Vec &b) const { return x*b.x+y*b.y+z*b.z; }
  Vec operator%(const Vec &b) const  // cross:
    { return Vec(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x); }
};
struct Ray { Vec o, d; Ray(const Vec &o_, const Vec &d_) : o(o_), d(d_) {} };
enum Refl_t { DIFF, SPEC, REFR };  // material types, used in radiance()
struct Sphere {
  Vec p, e, c;      // position, emission, color
  double rad;       // radius
  Refl_t refl;      // reflection type (DIFFuse, SPECular, REFRactive)
  Sphere(double rad_, Vec p_, Vec e_, Vec c_, Refl_t refl_):
    p(p_), e(e_), c(c_), rad(rad_), refl(refl_) {}
  double intersect(const Ray &r) const { // returns distance, 0 if nohit
    Vec op(p-r.o); // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
    double t, eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+rad*rad;
    if (det<0) return 0; else det=sqrt(det);
    return (t=b-det)>eps ? t : ((t=b+det)>eps ? t : 0);
  }
};
const Sphere spheres[] = {//Scene: radius, position, emission, color, material
  Sphere(1e5, Vec( 1e5+1,40.8,81.6), Vec(),Vec(.75,.25,.25),DIFF),//Left
  Sphere(1e5, Vec(-1e5+99,40.8,81.6),Vec(),Vec(.25,.25,.75),DIFF),//Rght
  Sphere(1e5, Vec(50,40.8, 1e5),     Vec(),Vec(.75,.75,.75),DIFF),//Back
  Sphere(1e5, Vec(50,40.8,-1e5+170), Vec(),Vec(),           DIFF),//Frnt
  Sphere(1e5, Vec(50, 1e5, 81.6),    Vec(),Vec(.75,.75,.75),DIFF),//Botm
  Sphere(1e5, Vec(50,-1e5+81.6,81.6),Vec(),Vec(.75,.75,.75),DIFF),//Top
  Sphere(16.5,Vec(27,16.5,47),       Vec(),Vec(1,1,1)*.999, SPEC),//Mirr
  Sphere(16.5,Vec(73,16.5,78),       Vec(),Vec(1,1,1)*.999, REFR),//Glas
  Sphere(600, Vec(50,681.6-.27,81.6),Vec(12,12,12),  Vec(), DIFF) //Lite
};
double clamp(double x){ return x<0 ? 0 : x>1 ? 1 : x; }
int toInt(double x){ return int(pow(clamp(x),1/2.2)*255+.5); }
struct intersector_t {
  const Sphere *obj; double t;
  intersector_t(const Ray &r) : obj(0), t(1e20) { next<sizeof(spheres)/sizeof(Sphere)>(r); }
  template<size_t num> void next(const Ray &r) {
    enum { id = num - 1 }; const Sphere &s(spheres[id]);
    Vec op(s.p-r.o); // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
    double eps=1e-4, b=op.dot(r.d), det=b*b-op.dot(op)+s.rad*s.rad, d=sqrt(det);
    if (det>=0 && (d=b-copysign(d,b-d))>eps && d<t) { t=d; obj=spheres+id; }
    next<id>(r);
  }
  operator bool() const { return obj != 0; }
  const Sphere *operator ->() const { return obj; }
};
template<> void intersector_t::next<0>(const Ray &r) { (void) r; }
Vec radiance(const Ray &r, unsigned depth, rng_t &rng){
  if (intersector_t hit = intersector_t(r)) { // the hit object
    Vec x(r.o+r.d*hit.t), n((x-hit->p).norm()), nl(n.dot(r.d)<0?n:n*-1), f(hit->c);
    double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
    if (++depth>5) if (rng<p) f=f*(1/p); else return hit->e; //R.R.
    if (hit->refl == DIFF){                  // Ideal DIFFUSE reflection
      double r1=2*M_PI*rng, r2=rng, r2s=sqrt(r2);
      Vec w(nl), u(((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm()), v(w%u),
        d((u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm());
      return hit->e + f.mult(radiance(Ray(x,d),depth,rng));
    } else if (hit->refl == SPEC)            // Ideal SPECULAR reflection
      return hit->e + f.mult(radiance(Ray(x,r.d-n*2*n.dot(r.d)),depth,rng));
    Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
    bool into = n.dot(nl)>0;                // Ray from outside going in?
    double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
    if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0)    // Total internal reflection
      return hit->e + f.mult(radiance(reflRay,depth,rng));
    Vec tdir((r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm());
    double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
    double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
    return hit->e + f.mult(depth>2 ? (rng<P ?   // Russian roulette
      radiance(reflRay,depth,rng)*RP:radiance(Ray(x,tdir),depth,rng)*TP) :
      radiance(reflRay,depth,rng)*Re+radiance(Ray(x,tdir),depth,rng)*Tr);
  } else return Vec(); // if miss, return black
}
int main(int argc, char *argv[]){
  int w=1024, h=768, samps = argc==2 ? atoi(argv[1])/4 : 1; // # samples
  Ray cam(Vec(50,52,295.6), Vec(0,-0.042612,-1).norm()); // cam pos, dir
  Vec cx=Vec(w*.5135/h), cy=(cx%cam.d).norm()*.5135, r, *c=new Vec[w*h];
#pragma omp parallel for schedule(dynamic, 1) private(r)       // OpenMP
  for (int y=0; y<h; y++){                       // Loop over image rows
    // fprintf(stderr,"\rRendering (%d spp) %5.2f%%",samps*4,100.*y/(h-1));
    rng_t rng(y*y*y);
    for (unsigned short x=0; x<w; x++)   // Loop cols
      for (int sy=0, i=(h-y-1)*w+x; sy<2; sy++)     // 2x2 subpixel rows
        for (int sx=0; sx<2; sx++, r=Vec()){        // 2x2 subpixel cols
          for (int s=0; s<samps; s++){
            double r1=2*rng, dx=r1<1 ? sqrt(r1)-1: 1-sqrt(2-r1);
            double r2=2*rng, dy=r2<1 ? sqrt(r2)-1: 1-sqrt(2-r2);
            Vec d = cx*( ( (sx+.5 + dx)/2 + x)/w - .5) +
                    cy*( ( (sy+.5 + dy)/2 + y)/h - .5) + cam.d;
            r = r + radiance(Ray(cam.o+d*140,d.norm()),0,rng)*(1./samps);
          } // Camera rays are pushed ^^^^^ forward to start in interior
          c[i] = c[i] + Vec(clamp(r.x),clamp(r.y),clamp(r.z))*.25;
        }
  }
  FILE *f = fopen("image.ppm", "w");         // Write image to PPM file.
  fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
  for (int i=0; i<w*h; i++)
    fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
}

(L) [2008/12/02] [Ben] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> [*] 1 is excluded from possible values, but does it really matter?
Assuming you want [0,1] then mathematically yes, but realistically no.
>> [*] i'm uncertain what's the proper way to compose those 2*random 32bits into 53.
Doesn't matter, a mask would be fine, you lose 11 random bits anyway. If you assume you rngs are truely random then xor'ing the result should also be truely random. You could base the method you use to combine the bits on any downfalls of the rng.

(L) [2008/12/02] [phresnel] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Code: [LINK # Select all]int main(int argc, char *argv[]){
[...]
FILE *f = fopen("image.ppm", "w"); // Write image to PPM file.
fprintf(f, "P3\n%d %d\n%d\n", w, h, 255);
for (int i=0; i<w*h; i++)
fprintf(f,"%d %d %d ", toInt(c[i].x), toInt(c[i].y), toInt(c[i].z));
fclose (f); // [-X
}
Someone forgot to fclose the file.
Then, nitpicking, but I think this how Stroustroup would do it:
Code: [LINK # Select all]int main(int argc, char *argv[]){
[...]
ofstream f ("image.ppm");
f << "P3\n" << w << '\n' << h << '\n' << 255 << '\n';
for (int i=0; i<w*h; ++i)
f << toInt (c [i].x) << ' ' << toInt (c [i].y) << ' ' << toInt (c [i].z) << ' ';
f.close(); // << Hmm, is ~ofstream() defined to close the file? Could save a line here;
}

edit: Cannot argue about the Marsaglia-Implementation though. I am not so random.

(L) [2008/12/02] [gfyffe] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> f.close(); // << Hmm, is ~ofstream() defined to close the file? Could save a line here;
Yes, the ofstream destructor closes the file. This is C++, after all.

(L) [2008/12/02] [phresnel !logged in] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Beason: Why not save one more line and simply write to stdout? [SMILEY :)]
>> gfyffe wrote:f.close(); // << Hmm, is ~ofstream() defined to close the file? Could save a line here;
Yes, the ofstream destructor closes the file. This is C++, after all.
I was unsure (and to be honest, to lazy to look it up) if it is *guaranteed* by stdcxx to flush and close properly. Simply putting "This is C++" is not an argument as ofstream is not part of the core language [SMILEY :)]

(L) [2008/12/03] [Ben] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> phresnel !logged in wrote:Beason: Why not save one more line and simply write to stdout?
gfyffe wrote:...
I was unsure (and to be honest, to lazy to look it up) if it is *guaranteed* by stdcxx to flush and close properly. Simply putting "This is C++" is not an argument as ofstream is not part of the core language
'core language'?
The stl classes follow RAII so all opened/allocated by the class should be closed/deallocated in the deconstructor. I would consider the stl very much part of the core language.

(L) [2008/12/03] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> phresnel wrote:Someone forgot to fclose the file.

I didn't forget. I'm exploiting the C++ standard which calls return(0) implicitly, which in turn calls exit(0), which flushes and closes open files. Woo! [SMILEY :)]
From the [LINK http://dev.feuvan.net/docs/isocpp/basic.html#basic.start.main C++ standard]:
>> A return statement in main has the effect of leaving the main function (destroying any objects with automatic storage duration) and calling exit with the return value as the argument. If control reaches the end of main without encountering a return statement, the effect is that of executing
Code: [LINK # Select all]return 0;

From the exit(3) man page:
>> All open stdio(3) streams are flushed and closed.

Maybe not a good practice, but (hopefully) still valid. Let me know if it caused you an error.

(L) [2008/12/03] [phresnel] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> I would consider the stl very much part of the core language.
Sorry, I was unprecise here. With "core language" I meant the level below stdlib+language, i.e. the bare programming language. So my phrase goes "not an argument, as ofstream is not part of the C++ programming language".
>> The stl classes follow RAII so all opened/allocated by the class should be closed/deallocated in the deconstructor.
I am not sure how much RAII argues something about whether streams are flushed and closed. RAII is more about exception safetyness and leaving the program's/machine's state clean and undirty.

>> beason wrote:[...]
I didn't forget. I'm exploiting the C++ standard which calls return(0) implicitly, which in turn calls exit(0), which flushes and closes open files. Woo!
From the [LINK http://dev.feuvan.net/docs/isocpp/basic.html#basic.start.main C++ standard]:
[...]
From the exit(3) man page:
[...]
Maybe not a good practice, but (hopefully) still valid. Let me know if it caused you an error.
You Win [SMILEY :D]
And thanks for being unlazy and checking out! But then I have experienced that when I do ...
Code: [LINK # Select all]#include <iostream>
void raboof() {
std::cout << "foo!" << std::flush;
std::cout << "bar!";
exit (033653337357);
}
... most of the time "bar!" wouldn't be printed; this is where my confusion came from. The reason might be that ~cout() won't be called, or more generally, stack unwinding is not done (as no scope is left).
So it seems that cout does not necessarily keep streams open all the time, maybe it has it's own buffer; while with FILE*-functions you definitely put stuff on streams.
Still, would writing to stdout not save a line? [SMILEY :)]
edit: Ben, as you are not using a user account, sending you a P.M. is not possible, so this is the only place to ask. Are you bouliiiiiiiiiiiiiii? Just curious.

(L) [2008/12/03] [bouliiii] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> phresnel wrote:edit: Ben, as you are not using a user account, sending you a P.M. is not possible, so this is the only place to ask. Are you bouliiiiiiiiiiiiiii? Just curious.
Ben is not bouliiii [SMILEY :)]

(L) [2008/12/03] [Ben] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> phresnel wrote:The stl classes follow RAII so all opened/allocated by the class should be closed/deallocated in the deconstructor.
I am not sure how much RAII argues something about whether streams are flushed and closed. RAII is more about exception safetyness and leaving the program's/machine's state clean and undirty.
RAII [Resource Acquisition Is Initialization] can be applied to any resource, and simply means that resources will be acquired and released by tying them to objects. A stream generally has an underlying resource (e.g. a file handle or allocated memory) and so RAII can be applied to this.
>> phresnel wrote:Code: [LINK # Select all]#include <iostream>
void raboof() {
std::cout << "foo!" << std::flush;
std::cout << "bar!";
exit (033653337357);
}
... most of the time "bar!" wouldn't be printed;
I don't seem to be able to reproduce this.

(L) [2008/12/03] [phresnel] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> Ben wrote:RAII [Resource Acquisition Is Initialization] can be applied to any resource, and simply means that resources will be acquired and released by tying them to objects. A stream generally has an underlying resource (e.g. a file handle or allocated memory) and so RAII can be applied to this.
RAII doesn't require streams to flush, as RAII doesn't require transactions to commit (catastrophe). And please stop teaching me about RAII, this is insulting. Everybody can google it.
>> phresnel wrote:... most of the time "bar!" wouldn't be printed;
I don't seem to be able to reproduce this.
Hard to reproduce in this case, indeed. Similar:
Code: [LINK # Select all]smach@seb-debian:~/Desktop$ echo "#include <fstream>
int main () {
std::ofstream f (\"/tmp/hmmm\");
f << \"foo\\n\" << std::flush;
f << \"bar\\n\";
exit (033653337357);
}
" > /tmp/reproduce.cc && g++ /tmp/reproduce.cc && ./a.out
smach@seb-debian:~/Desktop$ cat /tmp/hmmm
foo
smach@seb-debian:~/Desktop$

(L) [2008/12/03] [Ben] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> Similar:
Happens in GCC but not MSVC (I had only tried msvc.) GCC seems to skip calling the destructors after calling exit()
>> And please stop teaching me about RAII, this is insulting.
I wasn't, I was checking my own knowledge and trying to understand your view.

(L) [2008/12/03] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> phresnel wrote:Still, would writing to stdout not save a line?

You are absolutely right, I could save a line. I just thought it would be nicer if it didn't spam a ~million colors to anyone who forgot to redirect [SMILEY :)]

(L) [2008/12/11] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

I think enduring 3 BSOD, and later a few alt-sysreq s/b, gives me the right to flood this thread again with yet more crap. Only this time it's fashionable GPU crap.
mouarf64.png
I know, it's wrong, but it's kinda pretty and i'm done trying to a) fix my bugs b) hack around CUDA c) fit that numerical nightmare into (non IEEE compliant) floats for today; it's hopelessly slow and already takes way too much code to get there. But, hey, that's the wave of the future.

(L) [2008/12/11] [phresnel !!logged-in] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

A) I guess the only thing that lets you really hack proprietary CUDA is
A.0) Stay your man and grunt a bit, just showing that you have done because you can (absolutely okay).
A.1) The hope it will be liberated once.
B) Good Work!
C) (Assuming that by "wrong" you mean the result and not the fact you are hacking in paled technology) Why do you think it's wrong?

(L) [2008/12/11] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

It's wrong because moments before posting i thought i could shave some registers and only got a screwed ray tree as a result, hence the not so refractive sphere on the right.
Foolishly my initial intention was to shoehorn everything into, say, 200 lines, but that was of course without taking account of various subtleties. So, with a bit more work, it's going to more or less match the original output but with some severe limitations and bloatage: for once, it's rather slow (~25s to produce that image on my cheapo 8600gts) and really fixing that would require a more radical approach. And much much code.
I'll post the code when i'll get my caustics back.

(L) [2008/12/11] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> tbp wrote:I think enduring 3 BSOD, and later a few alt-sysreq s/b, gives me the right to flood this thread again with yet more crap. Only this time it's fashionable GPU crap.
This looks great!
25s, eh? For how many paths and at what resolution? These stats would be interesting whenever you get it working again to compare against the Q6600.
From my limited understanding of cuda, stacks and memory are precious. For the glass shader you can ditch the ray tree forking and just go with a single ray using russian roulette by changing the depth>2 test to depth>0 and culling the dead code and test. Then there is no more ray "tree", rather it is a single path at all times. Only drawback is this adds noise. (I apologize if this is all obvious.)
I'd love to try it out whenever... It will be interesting to see if my 8400gs (slow pos) can beat out the q6600 its paired with!

(L) [2008/12/11] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> beason wrote:tbp wrote:I think enduring 3 BSOD, and later a few alt-sysreq s/b, gives me the right to flood this thread again with yet more crap. Only this time it's fashionable GPU crap.
This looks great!
25s, eh? For how many paths and at what resolution? These stats would be interesting whenever you get it working again to compare against the Q6600.
From my limited understanding of cuda, stacks and memory are precious. For the glass shader you can ditch the ray tree forking and just go with a single ray using russian roulette by changing the depth>2 test to depth>0 and culling the dead code and test. Then there is no more ray "tree", rather it is a single path at all times. Only drawback is this adds noise. (I apologize if this is all obvious.)
I'd love to try it out whenever... It will be interesting to see if my 8400gs (slow!) can beat out the q6600 its paired with!

(L) [2008/12/11] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

The ray tree is only there conceptually, it's a vague stack i avoid like the plague because it's in that sloooooow local memory. But i wish that was the biggest issue. I mean there's exactly 0 chance to get it to run decently unless you completely mangle it into tiny passes, because it's so register & memory starved - and annoyed by all that branching - it looks like a textbook example of what not to do; i mean, we all know the recipe: bake some randoms, shoot rays, gather, shoot, gather, shoot etc... Frankly, i'd have more fun breaking a leg.
I've only kludged around the most glaring 'features', like getting dumped if i dare to claim the damn thing for myself for more than 5s. I've gone as far as adding yet another level of indirection (to the ones you're forced to deal with) by tiling the render (and block sharing subpixels), but i refuse to top that with some fix for samples/pixels; unless everything's bounded (good luck with that), it will die eventually anyway.
Precision is also troublesome, to say the least.
Those 25s were for the image as presented (@ 64spp), no more no less, though it really doesn't mean much (beside that it's not lighting fast).
NB: It's the wave of the future, with that 90's flavor where process can take down a box, compilers are half assed, floats sorely miss some ULP and you get to play with kilobytes of memory [SMILEY :P]

(L) [2008/12/12] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

I recall CUDA does not allow recursion. So you already converted it to a "stack". It shouldn't really be a stack though since there is no back tracking, so rather it is a simple loop (which has an inner loop to test the sphere list). Is the KB of memory for the random numbers? It must be because the scene data should be very small. When you say slow local memory, are you referring to on the video card? Is it really that slow? It should be a lot better than accessing the host machine's main memory!
Please understand I am speaking from zero experience using CUDA [SMILEY :)]

(L) [2008/12/12] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> beason wrote:I recall CUDA does not allow recursion. So you already converted it to a "stack". It shouldn't really be a stack though since there is no back tracking, so rather it is a simple loop (which has an inner loop to test the sphere list). Is the KB of memory for the random numbers? It must be because the scene data should be very small. When you say slow local memory, are you referring to on the video card? Is it really that slow? It should be a lot better than accessing the host machine's main memory!
Please understand I am speaking from zero experience using CUDA

EDIT: Oops... There is back-tracking because you must use the returned color of child ray to compute returned color. I'm pretty sure this part of the algorithm could be "inverted" so you can compute the final returned color once you reach the light, with no back-tracking, by just carrying forward the BRDF-color-multiplier f and doing something with the emission component...

(L) [2008/12/12] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

If you grab that [LINK http://developer.download.nvidia.com/compute/cuda/CUDA_Occupancy_calculator.xls CUDA occupancy calculator] you'll see what i'm talking about.
Briefly there's a bunch of distinct memory pools, with that slow local memory attached to threads, those fast shared 16k (in compute 1.1) to blocks (of threads) and then the rest. How you affect that shared memory and register usage are the main bottleneck for how many threads you can put in a block.
I map a grid of blocks of threads to a tile of super sampled pixels, so i can accumulate back to a pixel within a block (quickly via shared memory); that should really be extended to per pixel samples, spread on blocks and coalesced back, for reasons evoked before.
So, i need a bit of shared memory already, but since i can't spawn too many threads, because of the horrible register usage (48, haha), not that much [SMILEY ;)]
Right now scene data, and some other baked quantities, live in constant memory (slow, but cached, global mem) but they could also use that neat shared memory.
And then there's that fat ray tree. I could not reduce storage requirement further than having a ray, an accumulated radiance, a pre multiplied f and a depth while still matching exactly what the original smallpt did. Already way too much for that prized shared memory.
I'd need a compute >= 1.2 card to spawn more threads to get a better theoretical occupancy, or on the software side drastically reduce register usage; for that, baking randoms would obviously be a low hanging fruit (i do Marsaglia's KISS on the spot and those GPU don't quite like integer ops anyway). Then, i'd need to address that hairy branching, but that would require to turn it all upside down and to march rays one step at a time. Meh.
Exec summary: it runs like shit and that was expected.
Now, removing that stack would help a bit (i don't quite grok what you've proposed [SMILEY :P]) but that's, again, only part of the problematic: it's only touched when required.
PS: Once in flight, everything's on the GPU and the only thing to get out is a tile (copied and collated back on host).
PS²: Theoretically, the occupancy being much better on compute >= 1.2 cards, it could be a tad faster there. Theorically.

(L) [2008/12/14] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Nearing a nervous breakdown, i'm going to end today's WTF session and simply dump it all there. It's still quite borked, now even more on linux (WTF!), and obviously the PRNG seeding could use a bit more tinkering, but i give up for now. I know i said i wouldn't decouple sampling, but i did because a) what's the use if you can't waste more than 64spp b) other ways to get back some registers were even more annoying; of course that made proper seeding that more tricky and vital (register usage 39).
I've tried to see if throwing way more kernels at the problem at the same time (and collating samples/tiles on the GPU too) would bring anything, and the answer was a resounding no; i just ping pong 2 streams, and collate on the host.
1024x768x200spp done in 2mn04s.
smallpt_cuda-1024x768x200spp.jpg

// nvcc: --ptxas-options=-v
//
// tweaks
enum {
    SS = 2,                // super sampling (don't touch it for now).
    max_traces  = 32,    // arbitrary max ray tree depth.
    max_depth   = 64,    // also arbitrary, maximum number of paths (we got to release that GPU or else...)
    num_threads = 32,    // as many as can fit.
    // now let's devise what kind of grid we can throw at it.
    tile_size_x = 128, tile_size_y = tile_size_x, tile_area = tile_size_x*tile_size_y,
    block_size_y = SS, block_size_x = num_threads/block_size_y, // num_threads per block; super sampling happens within a block.
    grid_size_x = (tile_size_x*SS)/block_size_x, grid_size_y = (tile_size_y*SS)/block_size_y
};
#ifdef NO_CUDA
    #define _CRT_SECURE_NO_WARNINGS  // pff
    #define __host__
    #define __device__
    #define __constant__
    #define __shared__
    #define __local__
    #define __global__
    #define ALIGN(x)
    #define BREAKPOINT()
    #define FINLINE __attribute__((always_inline))
    #define NOINLINE __attribute__((noinline))
    #pragma warning(disable : 4244) // 'return' : conversion from 'double' to 'real', possible loss of data
    #pragma warning(disable : 4305) // 'initializing' : truncation from 'double' to 'real'
    #pragma warning(disable : 4146) // unary minus operator applied to unsigned type, result still unsigned
    struct fake_t { int x, y; };
    static fake_t threadIdx, blockIdx;
    // includes, system
    #include <cstdlib>
    #include <cstdio>
    #include <cstring>
    #include <cmath>
    // so that silly msvc can track those.
    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <cutil.h>
    #include <math_constants.h>
    enum { no_cuda = 1 };
#else
    #include <cstdlib>
    #include <cstdio>
    #include <cstring>
    #include <cmath>
    #include <cutil.h>
    #include <math_constants.h>
    //#include <cutil_inline.h>
    // #include <sm_13_double_functions.h> // ok, that's not enough to get __double_as_longlong & co.
    // #include <math_functions_dbl_ptx3.h>
    #define ALIGN(x) __align__(x)
    #define BREAKPOINT()
    #define FINLINE inline
    #define NOINLINE __noinline__
    enum { no_cuda = 0 };
#endif
typedef unsigned char uchar_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
//
// float or double, that is the question
// at some point you could switch between floats & doubles, but not anymore (useless and a pain).
#if 1
    #define F(x)    x ## f
    typedef float real;
#else
    #define F(x)    x
    typedef double real;
#endif
// sorry for the use of macros, but nvcc is getting on my nerves.
#define PI            F(3.14159265358979323846)
#define LARGE_R        F(1e3) //F(1e3)            // used to refurbish that scene for floats.
#define REFIT(x)    ((x)/F(128.)) // ((x)*(1/F(128.)))

// CUDA can't deal with arrays of structs.
// Then you can't __align__(__alignof(T)).
// Since we only care about floats and NVidia should fix their compiler we'll go for...
template<typename T, size_t num>
    struct aligned_storage_t {
        char ALIGN(4) raw[sizeof(T)*num];
        __device__ T *index(const size_t i) { return reinterpret_cast<T*>(raw) + i; }
        __device__ const T *index(const size_t i) const { return reinterpret_cast<const T*>(raw) + i; }
    };
#ifdef NO_CUDA
    template<typename T, typename U>  static T bit_cast(const U val) { union { U u; T t; } bits = { val }; return bits.t; }
#else
    template<typename T, typename U>  static __device__ T bit_cast(const U val);
    template<> static __device__ uint32_t bit_cast<uint32_t, float>(const float val) { return __float_as_int(val); }
    template<> static __device__ float bit_cast<float, uint32_t>(const uint32_t val) { return __int_as_float(val); }
    template<> static __device__ uint64_t bit_cast<uint64_t, double>(const double val) { return 0; /*__double_as_longlong(val); */ }
    template<> static __device__ double bit_cast<double, uint64_t>(const uint64_t val) { return 0; /* __longlong_as_double(val);*/ }
#endif
// We need to be supah strict about exactly which function gets pulled; many env, much pain.
// Plus CUDA will force ie sin -> __sinf transformation if fast-math is on.
namespace math {
    #ifdef NO_CUDA
        static __host__ __device__ real sin(real x) { return std::sin(x); }
        static __host__ __device__ real cos(real x) { return std::cos(x); }
        static __host__ __device__ real abs(real x) { return std::abs(x); }
        static __host__ __device__ real sqrt(real x) { return std::sqrt(x); }
        static __host__ __device__ real rsqrt(real x) { return real(1)/math::sqrt(x); }
        static __host__ __device__ real pow(real x, real y) { return std::pow(x, y); }
        static __host__ __device__ real max(real x, real y) { return x > y ? x : y; } //FIXME:
        static __host__ __device__ real min(real x, real y) { return x < y ? x : y; }
        static __host__ __device__ real clamp(real x){ return math::min(1, math::max(0, x)); }
        static __host__ __device__ real mul(real x, real y) { return x*y; } //FIXME:
        static __host__ __device__ real div(real x, real y) { return x/y; } //FIXME:
        // integer
        static __device__ int32_t mul24(int32_t x, int32_t y) { return x*y; } // pretend.
        static __device__ uint32_t umul24(uint32_t x, uint32_t y) { return x*y; } // pretend.
    #else
        #if 0
            static __host__ __device__ real sin(real x) { return sinf(x); }
            static __host__ __device__ real cos(real x) { return cosf(x); }
        #else
            static  __device__ real sin(real x) { return __sinf(x); }
            static  __device__ real cos(real x) { return __cosf(x); }
        #endif
        #if 1
            static __host__ __device__ real mul(real x, real y) { return x*y; } //FIXME:
            static __host__ __device__ real div(real x, real y) { return x/y; } //FIXME:
        #else
            // static __host__ real mul(real x, real y) { return x*y; } //FIXME:
            // static __host__ real div(real x, real y) { return x/y; } //FIXME:
            static __device__ real mul(real x, real y) { return __fmul_rn(x, y); } //FIXME:
            static __device__ real div(real x, real y) { return __fdividef(x, y); } //FIXME:
        #endif
        static __host__ __device__ real abs(real x) { return fabsf(x); }
        static __host__ __device__ real sqrt(real x) { return sqrtf(x); }
        static __host__ __device__ real rsqrt(real x) { return rsqrtf(x); }
        static __host__ __device__ real pow(real x, real y) { return powf(x, y); }
        static __host__ __device__ real max(real x, real y) { return fmaxf(x,y); } //FIXME:
        static __host__ __device__ real min(real x, real y) { return fminf(x,y); }
        static __device__ real clamp(real x) { return __saturatef(x); }
        // integer
        static __device__ uint32_t umul24(uint32_t x, uint32_t y) { return __umul24(x, y); }
        static __device__ int32_t mul24(int32_t x, int32_t y) { return __mul24(x, y); }
    #endif
    static __host__ real clamp2(real x){ return math::min(1, math::max(0, x)); }
}

// Marsaglia KISS RNG (combines 3 simpler RNG to get a decent period).
namespace RNG {
    struct kiss_t {
        uint32_t x, y, z, c;
        // c doesn't need to be re-seeded but must be < 698769069; y must be non-zero; so...
        __device__ kiss_t(uint32_t sx=123456789, uint32_t sy=362436, uint32_t sz=521288629, uint32_t sc =7654321) :  x(sx), y(sy), z(sz), c(sc) {}
        __device__ uint32_t next() {
            x = 69069*x + 12345; y ^= y<<13; y ^= y>>17; y ^= y<<5;
            uint64_t t = 698769069ull*z + c; //TODO: check how it's handled; perhaps split it to 24bit muls.
            c = t>>32; z = uint32_t(t);
            return x + y + z;
        }
        __device__ operator real() {
            switch(sizeof(real)) {
                case 4: return bit_cast<float>((next() >> (32-23)) | bit_cast<uint32_t>(1.f)) - 1.f;
                case 8: bit_cast<double>(uint64_t(next())^(uint64_t(next())<<(53-32)) | bit_cast<uint64_t>(1.)) - 1.;
                default: return 0;
            }
        }
    };
    struct rigged_t {
        __device__ rigged_t(uint32_t,uint32_t,uint32_t,uint32_t) {}
        __device__ operator real() { return F(.75); }
    };
}
typedef RNG::kiss_t rng_t;
// typedef RNG::rigged_t rng_t; // just to see how many registers we could shave [SMILEY ;)]

struct vec_t {
    real x, y, z;
    __host__ __device__ inline vec_t() {}
    __host__ __device__ inline vec_t(real a, real b, real c) : x(a),y(b),z(c) {}
    __host__ __device__ vec_t operator+(const vec_t &b) const { return vec_t(x+b.x,y+b.y,z+b.z); }
    __host__ __device__ vec_t operator-(const vec_t &b) const { return vec_t(x-b.x,y-b.y,z-b.z); }
    __host__ __device__ vec_t operator*(real b) const { return vec_t(math::mul(x,b), math::mul(y, b), math::mul(z, b)); }
    __host__ __device__ vec_t operator/(real b) const { return vec_t(math::div(x,b), math::div(y, b), math::div(z, b)); }
    __host__ __device__ vec_t mult(const vec_t &b) const { return vec_t(math::mul(x,b.x), math::mul(y, b.y), math::mul(z, b.z)); }
    __host__ __device__ vec_t norm() const { return *this * math::rsqrt(dot(*this)); }
    __host__ __device__ real dot(const vec_t &b) const { return x*b.x+y*b.y+z*b.z; }
    __host__ __device__ vec_t operator%(const vec_t &b) const { return vec_t(y*b.z-z*b.y,z*b.x-x*b.z,x*b.y-y*b.x); } // cross
    __host__ __device__ vec_t reflect(const vec_t &n) const { return *this - n*2*n.dot(*this); }
    __host__ __device__ real horizontal_max() const { return math::max(math::max(x, y), z); }
};
struct ray_t {
    vec_t o, d;
    __host__ __device__ ray_t() {}
    __host__ __device__ ray_t(const vec_t &pos, const vec_t &dir) : o(pos), d(dir) {}
    __host__ __device__ vec_t advance(real t) const { return o + d*t; }
};
enum refl_t { DIFF, SPEC, REFR };  // material types, used in radiance()
struct sphere_t {
    vec_t p, e, c;    // position, emission, color
    real rad;        // radius
    refl_t refl;    // reflection type (DIFFuse, SPECular, REFRactive)
    // init on host side only. pfff.
    // WTF. Those 2 are not equivalent, we get much better precision with the first than second on windows. WTF.
    #if 1
        __host__ sphere_t(real rad_, const vec_t &p_, const vec_t &e_, const vec_t &c_, refl_t refl_){
            p = REFIT(p_);
            e = e_;
            c = c_;
            rad = REFIT(rad_);
            refl = refl_;
        }
    #else
        __host__ sphere_t(real rad_, const vec_t &p_, const vec_t &e_, const vec_t &c_, refl_t refl_) :
            p(REFIT(p_)), e(e_), c(c_), rad(REFIT(rad_)), refl(refl_)
        {}
    #endif

    // returns distance, 0 if nohit
    // Solve t^2*d.d + 2*t*(o-p).d + (o-p).(o-p)-R^2 = 0
    // note: different flavors of wrong; rays get stuck. epsilons are the tool of Satan.
    __device__ real intersect(const ray_t &r) const {
        const vec_t op(p-r.o);
        const real eps = 1e-5;
        const real b = op.dot(r.d), det = b*b-op.dot(op)+rad*rad;
        #if 1
            const real d = math::sqrt(det), t1 = b - d, t2 = b + d;
            if (det < 0) return 0;
            if (t2 < eps) return 0;
            return t1 > eps ? t1 : t2;
        #elif 0
            if (det >= 0) {
                const real d = math::sqrt(det), t1 = b - d, t2 = b + d;
                if(t2 >= eps) return t1 > eps ? t1 : t2;
            }
            return 0;
        #else
            // original method, but the return sequence is bogus.
            if (det<0) return 0;
            const real d = math::sqrt(det);
            real t;
            return (t=b-d)>eps ? t : ((t=b+d)>eps ? t : 0);
        #endif
    }
};

#define V(a,b,c) vec_t(real(a),real(b),real(c))
static const sphere_t init_spheres[] = {//Scene: radius, position, emission, color, material
#if 1
  sphere_t(LARGE_R,    V( LARGE_R+1,40.8,81.6),    V(0.,0.,0.),    V(.75,.25,.25),DIFF),//Left
  sphere_t(LARGE_R,    V(-LARGE_R+99,40.8,81.6),    V(0.,0.,0.),    V(.25,.25,.75),DIFF),//Rght
  sphere_t(LARGE_R,    V(50,40.8, LARGE_R),        V(0.,0.,0.),    V(.75,.75,.75),DIFF),//Back
  sphere_t(LARGE_R,    V(50,40.8,-LARGE_R+170),    V(0.,0.,0.),    V(0.,0.,0.),   DIFF),//Frnt
  sphere_t(LARGE_R,    V(50, LARGE_R, 81.6),        V(0.,0.,0.),    V(.75,.75,.75),DIFF),//Botm
  sphere_t(LARGE_R,    V(50,-LARGE_R+81.6,81.6),    V(0.,0.,0.),    V(.75,.75,.75),DIFF),//Top
  sphere_t(16.5,    V(27,16.5,47),                V(0,0,0),        V(1,1,1)*F(.999), SPEC),//Mirr
  sphere_t(16.5,    V(73,16.5,78),                V(0,0,0),        V(1,1,1)*F(.999), REFR),//Glas
#endif
  sphere_t(600,        vec_t(50,681.6-.27,81.6),    vec_t(12,12,12),V(0, 0, 0), DIFF) //Lite
};
#undef V
enum {
    scene_num_spheres = sizeof(init_spheres)/sizeof(sphere_t),
    scene_size = sizeof(sphere_t)*scene_num_spheres
};
// baked values
__constant__ aligned_storage_t<sphere_t, scene_num_spheres> scene;
__constant__ vec_t cu_cam[2];
__constant__ vec_t cu_center[2];

struct hit_t {
    unsigned id;
    __device__ hit_t(unsigned sphere_id) : id(sphere_id) {}
    __device__ operator bool() const { return id < scene_num_spheres; }
    __device__ const sphere_t *operator ->() const { return scene.index(id); }
    // intersect and move ray origin (simplifies stuff).
    static __device__ hit_t intersect(ray_t &ray) {
        const sphere_t * const spheres = scene.index(0);
        real t = F(1e20);
        unsigned id = ~0u;
        for (unsigned i=0; i<unsigned(scene_num_spheres); ++i)
            if (real d = spheres[i].intersect(ray))
                if (d < t) {
                    t  = d;
                    id = i;
                }
        ray.o = ray.advance(t);
        return hit_t(id);
    }
};
struct trace_t {
    ray_t ray;
    vec_t rad;
    vec_t fac;
    unsigned dph;
};
// yay, can't inherit or it craps out in emu.
// struct traces_t : aligned_storage_t<trace_t, max_traces> {
struct traces_t {
    aligned_storage_t<trace_t, max_traces> store;
    __device__ void set(unsigned idx, const vec_t &ray_o, const vec_t &ray_d, const vec_t &new_rad, const vec_t &new_fac, const unsigned new_dph) {
        trace_t &t(*store.index(idx));
        t.ray = ray_t(ray_o, ray_d); // new_ray;
        t.rad = new_rad;
        t.fac = new_fac;
        t.dph = new_dph;
    }
    __device__ const trace_t &get(unsigned idx) const { return *store.index(idx); }
};

#define APPLY(nr, nf) rad = rad + fac.mult(nr); fac = fac.mult(nf);
static __device__ vec_t radiance(const ray_t &primary, rng_t &rng) {
    traces_t traces;
    ray_t ray(primary);    // ray being traced around.
    vec_t fac(1, 1, 1);    // f being carried over.
    vec_t rad(0, 0, 0);    // accumulated radiance.
    unsigned idx = 0 /* stack */, depth = 0 /* ray tree depth */;
    march:
        while(const hit_t hit = hit_t::intersect(ray)) {
            vec_t f(hit->c);
            const real p = hit->c.horizontal_max(); // max refl
            if (++depth > 5)
                if (depth > max_depth || rng >= p) { // R.R
                    APPLY(hit->e, vec_t(0,0,0)); // shunt
                    break;
                }
                else f = f/p;
            const vec_t n((ray.o - hit->p).norm()), nl(n.dot(ray.d)<0 ? n : n*-1);
            if (hit->refl == DIFF) {
                // Ideal DIFFUSE reflection
                const real r1=2*PI*rng, r2=rng, r2s=math::sqrt(r2);
                const vec_t &w(nl), u(((math::abs(w.x) > F(.1) ? vec_t(0,1,0) : vec_t(1,0,0)) % w).norm()), v(w % u);
                ray.d = (u*math::cos(r1)*r2s + v*math::sin(r1)*r2s + w*math::sqrt(1-r2)).norm();
            }
            else if (hit->refl == SPEC)
                // Ideal SPECULAR reflection
                ray.d = ray.d.reflect(n);
            else {
                bool into = n.dot(nl) > 0;                // Ray from outside going in?
                const real nc=1, nt=F(1.5), nnt=into?nc/nt:nt/nc, ddn=ray.d.dot(nl), cos2t = 1-nnt*nnt*(1-ddn*ddn);
                if (cos2t < 0)    // Total internal reflection
                    ray.d = ray.d.reflect(n);
                else {
                    const vec_t tdir((ray.d*nnt - n*((into?1:-1)*(ddn*nnt+math::sqrt(cos2t)))).norm());
                    const real
                        a = nt-nc, b = nt+nc,
                        R0 = math::div(a*a, b*b),
                        c = 1 - (into ? -ddn : tdir.dot(n)),
                        Re = R0+(1-R0)*c*c*c*c*c,
                        Tr = 1-Re, P = F(.25)+F(.5)*Re,
                        RP = math::div(Re, P),
                        TP = math::div(Tr, 1-P);
                    if (depth > 2)
                        if (rng < P)
                            { f = f*RP; ray.d = ray.d.reflect(n); } // Russian roulette
                        else
                            { f = f*TP; ray.d = tdir; }
                    else {
                        // PUSH: put one aside, continue with the other.
                        if (idx + 1 < max_traces)
                            traces.set(idx++, ray.o, tdir, vec_t(0, 0, 0), fac.mult(f*Tr),depth);

                        f = f*Re; ray.d = ray.d.reflect(n); // other ray
                    }
                }
            }
            APPLY(hit->e, f);
        } // intersection
        // POP: and mix back.
        if (--idx < max_traces) {
            const trace_t &s(traces.get(idx));
            ray = s.ray;
            fac = s.fac;
            rad = s.rad + rad;
            depth = s.dph;
            goto march;
        }
    return rad;
}

//TODO: bank conflicts (shouldn't matter that much tho)
#if 1
    vec_t __shared__ xxx_block_output[block_size_y][block_size_x];
    #define block_output(x, y) xxx_block_output[y][x]
#else
    vec_t __shared__ xxx_block_output[block_size_x][block_size_y];
    #define block_output(x, y) xxx_block_output[x][y]
#endif
static __global__
void sample(const uint2 res /* global resolution */, const uint2 tile /* tile coords */, const unsigned s /* sample # */, vec_t *tile_out) {
    const uint32_t
        bx = block_size_x*blockIdx.x + threadIdx.x,        // subpixel coords, within tile.
        by = block_size_y*blockIdx.y + threadIdx.y,
        pixel_x = bx / SS, pixel_y = by / SS;
    const real
        x  = tile.x + pixel_x,    y  = tile.y + pixel_y,    // pixel in global coordinates
        sx = threadIdx.x % SS, sy = threadIdx.y;         // and its subpixel coords.
    // try to prime our PRNG.
    // simply using x/y isn't enough.
    const uint32_t
        rx = 123456789, ry = 362436000, rz = 362436000, rc = 7654321,
        xx = 2*x + sx, yy = 2*y + sy;
        // ns = 2048, s1 = (((res.x*yy) + xx)*ns) + s;
    rng_t rng(rx + xx, ry + yy, rz + s, rc);
    // shoot some rays around.
    real r1 = 2*rng, dx = r1<1 ? math::sqrt(r1)-1: 1-math::sqrt(2-r1);
    real r2 = 2*rng, dy = r2<1 ? math::sqrt(r2)-1: 1-math::sqrt(2-r2);
    const vec_t
        d(
            cu_center[0]*( ( (F(.5) + sx + dx)/2 + x)/res.x - F(.5)) +
            cu_center[1]*( ( (F(.5) + sy + dy)/2 + y)/res.y - F(.5)) + cu_cam[1]),
        rad(radiance(ray_t(cu_cam[0] + d*REFIT(140), d.norm()), rng));
    block_output(threadIdx.x, threadIdx.y) = rad;
    // and now gather subpixels.
    __syncthreads();
    if (((threadIdx.x|threadIdx.y) & 1) == 0) {
        const vec_t acc(
                block_output(threadIdx.x+0, threadIdx.y+0) +
                block_output(threadIdx.x+0, threadIdx.y+1) +
                block_output(threadIdx.x+1, threadIdx.y+0) +
                block_output(threadIdx.x+1, threadIdx.y+1));
        tile_out[tile_size_x*pixel_y + pixel_x] = acc/(SS*SS);
    }
}
// post processing happens here; clamping done once, also here.
static int toInt(real x, real scale){ return int(std::pow(math::clamp2(x*scale),1/F(2.2))*255+F(.5)); }
template<bool flip>
    static void write(const char * const filename, const vec_t * const p, unsigned w, unsigned h, int num_samples) {
        if (FILE *f = fopen(filename, "wb")) {
            const real scale = F(1.)/num_samples;
            fprintf(f, "P6\n%d %d\n%d\n", w, h, 255);
            for (unsigned y=0; y<h; ++y)
                for (unsigned x=0; x<w; ++x) {
                    const size_t idx = (flip ? h - y - 1 : y)*w + x;
                    fprintf(f,"%c%c%c", toInt(p[idx].x, scale), toInt(p[idx].y, scale), toInt(p[idx].z, scale));
                }
            printf("written to %s\n", filename);
        }
    }

static size_t round_up(size_t unit, size_t val) { return unit*(val/unit + (val % unit ? 1 : 0)); }
struct launcher_t {
    enum {
        num_streams = 2,
        buffer_size = tile_area*sizeof(vec_t)
    };
    cudaStream_t streams[num_streams];
    vec_t *d_buffers[num_streams]; // device side
    vec_t *h_buffers[num_streams];
    size_t flip, mark;
    const size_t res_x, res_y, num_samples; // num_samples: # of supersampled samples.
    launcher_t(size_t w, size_t h, size_t spp) : res_x(round_up(tile_size_x, w)), res_y(round_up(tile_size_y, h)), num_samples(spp/(SS*SS) ? spp/(SS*SS) : 1) {}

    // bake & upload some stuff (camera, scene...).
    void bake_constants() {
        const ray_t cam(vec_t(50,52,295.6)*REFIT(1), vec_t(0,-0.042612,-1).norm()); // cam pos, dir
        const vec_t
            cx(res_x*F(.5135) / res_y, 0, 0),
            center[2] = { cx, (cx%cam.d).norm()*F(.5135) };
        #ifdef NO_CUDA
            std::memcpy(cu_cam, &cam, sizeof(cam));
            std::memcpy(cu_center, center, sizeof(center));
            std::memcpy(scene, init_spheres, sizeof(init_spheres));
        #else
            cudaMemcpyToSymbol(cu_cam, &cam, sizeof(cam));
            cudaMemcpyToSymbol(cu_center, &center, sizeof(center));
            // cudaMemcpyToSymbol(raw_scene, &init_spheres, sizeof(sphere_t)*scene_num_spheres);
            cudaMemcpyToSymbol(scene, &init_spheres, sizeof(sphere_t)*scene_num_spheres);
        #endif
    }
    // spawn a kernel again to sample that tile.
    size_t spawn(size_t tx, size_t ty, size_t s) {
        if (--s < num_samples) {
            dim3 block(block_size_x, block_size_y);
            dim3 grid(grid_size_x, grid_size_y);
            const uint2 res(make_uint2(res_x, res_y)), tile(make_uint2(tile_size_x*tx, tile_size_y*ty));
            sample<<<grid, block, 0, streams[flip]>>>(res, tile, s, d_buffers[flip]);
            cudaMemcpyAsync(h_buffers[flip], d_buffers[flip], buffer_size, cudaMemcpyDeviceToHost, streams[flip]);
            mark |= 1<<mark;
            flip ^= 1;
        }
        return s;
    }
    // accumulate 1 tile sample into the final picture, on the host side.
    // clamping & other post processing ops are done later when writing out.
    void collect(size_t tx, size_t ty, vec_t *pixels) {
        if (mark) {
            size_t pick = mark > 1 ? 1 : 0;
            cudaStreamSynchronize(streams[pick]); // barrier for that stream.
            vec_t *p = pixels + res_x*tile_size_y*ty + tile_size_x*tx; // tile corner, destination.
            const vec_t *q = h_buffers[pick];    // source.
            for (size_t y=0; y<tile_size_y; ++y) {
                for (size_t x=0; x<tile_size_x; ++x)
                    p[x] = p[x] + q[x];
                p += res_x;
                q += tile_size_x;
            }
            mark &= ~(1 << pick);
        }
    }
    void banzai() {
        vec_t *pixels = new vec_t[res_x*res_y];
        std::memset(pixels, 0, sizeof(vec_t)*res_x*res_y);
        bake_constants();
        for (size_t i=0; i<num_streams; ++i) cudaStreamCreate(streams + i);
        for (size_t i=0; i<num_streams; ++i) cudaMalloc((void**) d_buffers + i, buffer_size);
        for (size_t i=0; i<num_streams; ++i) cudaMallocHost((void**) h_buffers + i, buffer_size);
        const size_t
            num_tiles_x = res_x/tile_size_x, num_tiles_y = res_y/tile_size_y,
            num_tiles = num_tiles_x*num_tiles_y;
        printf("%d threads, tile(%3d,%3d) -> block(%d,%d), grid(%d,%d)\n", num_threads, tile_size_x, tile_size_y, block_size_x, block_size_y, grid_size_x, grid_size_y);
        printf("fasten your seatbelt, going for res(%d,%d) tiles(%d,%d)x(%d,%d)x%d....\n",
                res_x, res_y, num_tiles_x, num_tiles_y, SS, SS, num_samples);
        // for each tile
        for (size_t tile_num=0; tile_num<num_tiles; ++tile_num) {
            const size_t tx = tile_num % num_tiles_x,  ty = tile_num / num_tiles_x;
            char progress[] = "|                                                                |\r";
            std::memset(progress + 1, '>', (tile_num+1)*64/num_tiles);
            printf(progress);
            fflush(stdout);
            // for each sample, ping-pong between 2 streams.
            size_t s = num_samples;
            flip = 0;
            mark = 0;
            s = spawn(tx, ty, s);
            s = spawn(tx, ty, s);
            do {
                collect(tx, ty, pixels);
                s = spawn(tx, ty, s);
            } while(mark);
        }
        printf("\n");
        char filename[128];
        sprintf(filename, "image-%03dx%03dx%03dspp.ppm", res_x, res_y, num_samples*SS*SS);
        write<true>(filename, pixels, res_x, res_y, num_samples);
        for (size_t i=0; i<num_streams; ++i) cudaFree(h_buffers[i]);
        for (size_t i=0; i<num_streams; ++i) cudaFree(d_buffers[i]);
        for (size_t i=0; i<num_streams; ++i) cudaStreamDestroy(streams[i]);
        delete [] pixels;
    }
};
static int usage(const char * const s) {
    printf("%s -w width (1024) -h height (768) -s samples per pixel (4)\n", s);
    return 0;
}
int main( int argc, char** argv) {
    CUT_DEVICE_INIT(argc, argv);
    unsigned int timer = 0;
    cutCreateTimer(&timer);
    cutStartTimer(timer);
    // size_t w = 1024, h = 768, s = 4;
    size_t w = 512, h = 512, s = 64;
    for (int i=1; i<argc; ++i)
        if (argv[i][0] == '-')
            switch(argv[i][1]) {
                case 'w': if (++i < argc) w = atoi(argv[i]); break;
                case 'h': if (++i < argc) h = atoi(argv[i]); break;
                case 's': if (++i < argc) s = atoi(argv[i]); break;
                default: printf("confused by '%s'\n", argv[i]); return usage(argv[0]);
            }
    launcher_t(w, h, s).banzai();
    cutStopTimer(timer);
    float dt = cutGetTimerValue(timer) / 1000.f, mn = floorf(dt / 60), sec = dt - mn*60;
    cutDeleteTimer(timer);
    printf("done in %d:%06.3f (mn:s) %f\n", int(mn), sec, dt);
    return 0;
}

(L) [2008/12/15] [phresnel] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

I am still not in the position to argue about CUDA code, but what I see is that it's Real Verbose. Plus I am surprised about the sphere_t constructor and precision, this is really a Wtf. Anyways, great work [SMILEY :)]
Still I don't get why there is so much curvature in the image. Do you run on an older CRT screen and make your screenshots with a webcam?

(L) [2008/12/15] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Verbosity is partially my own fault: once you know you can't make it real tight there's no reason to obfuscate further, plus since the emulated mode doesn't match in any way what the hardware does i've put my own braindead layer to allow to compile it as is with a real compiler; that simplified debugging precision issues but the code isn't all there. If that wasn't enough nvcc, NVidia compiler based on Open64 which uses whatever native compiler you have as a frontend, bugs differently between platforms and modes (emu, non emu and so on).
That ctor WTF happens on windows, with msvc9 as the frontend, but i was disgusted enough not to even start looking into the why; instead i cursed. A lot. Did the trick for me.
Being lazy and wanting to match the output of compliant double precision as close to as possible (for comparison), i've just scaled back the whole scene/camera into something more reasonable and that made pseudo walls appear more like what they really are: huge spheres. Point is, you'd have to design scenes for floats from the start to get it right. I mean the whole thing is a numerical nightmare, there's really no need to screw the odds even more [SMILEY ;)]
The only redhibitory thing is, i think, my apparent lame (per sample) seeding. I've tinkered a bit, but it's not that easy, and since i couldn't come up with a proper fix in time i've left in the simplest one that looked almost right. See where it gets to with 5000spp (51mn to render).
image-1024x768x5000spp.jpg
PS: you could think of addressing precision issues more systematically, and i've had some success, but then you'd have to forget about matching the original result.

(L) [2008/12/18] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

tbp: Great work!! Sorry I took so long to respond, but here's a revised version of the radiance function that removes the stack and branching.

Vec radiance(const Ray &r_, int depth, unsigned short *Xi){
  double t;                               // distance to intersection
  int id=0;                               // id of intersected object
  Ray r=r_;
  // L0 = Le0 + f0*(L1)
  //    = Le0 + f0*(Le1 + f1*L2)
  //    = Le0 + f0*(Le1 + f1*(Le2 + f2*(L3))
  //    = Le0 + f0*(Le1 + f1*(Le2 + f2*(Le3 + f3*(L4)))
  //    = ...
  //    = Le0 + f0*Le1 + f0*f1*Le2 + f0*f1*f2*Le3 + f0*f1*f2*f3*Le4 + ...
  Vec cl(0,0,0);   // accumulated color
  Vec cf(1,1,1);  // accumulated reflectance
  while (1){
    if (!intersect(r, t, id)) return cl; // if miss, return black
    const Sphere &obj = spheres[id];        // the hit object
    cl = cl + cf.mult(obj.e);
    Vec x=r.o+r.d*t, n=(x-obj.p).norm(), nl=n.dot(r.d)<0?n:n*-1, f=obj.c;
    double p = f.x>f.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl
    if (++depth>5) if (erand48(Xi)<p) f=f*(1/p); else return cl; //R.R.
    cf = cf.mult(f);
    if (obj.refl == DIFF){                  // Ideal DIFFUSE reflection
      double r1=2*M_PI*erand48(Xi), r2=erand48(Xi), r2s=sqrt(r2);
      Vec w=nl, u=((fabs(w.x)>.1?Vec(0,1):Vec(1))%w).norm(), v=w%u;
      Vec d = (u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrt(1-r2)).norm();
      r = Ray(x,d);
      continue;
    } else if (obj.refl == SPEC){           // Ideal SPECULAR reflection
      r = Ray(x,r.d-n*2*n.dot(r.d));
      continue;
    }
    Ray reflRay(x, r.d-n*2*n.dot(r.d));     // Ideal dielectric REFRACTION
    bool into = n.dot(nl)>0;                // Ray from outside going in?
    double nc=1, nt=1.5, nnt=into?nc/nt:nt/nc, ddn=r.d.dot(nl), cos2t;
    if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0){    // Total internal reflection
      r = reflRay;
      continue;
    }
    Vec tdir = (r.d*nnt - n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();
    double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c = 1-(into?-ddn:tdir.dot(n));
    double Re=R0+(1-R0)*c*c*c*c*c,Tr=1-Re,P=.25+.5*Re,RP=Re/P,TP=Tr/(1-P);
    if (erand48(Xi)<P){      // reflection
      cf = cf*RP;
      r = reflRay;
    } else {                 // refraction
      cf = cf*TP;
      r = Ray(x,tdir);
    }
    continue;
  }
}

For reference here is the result, with 5000 samples (it's a few minutes faster):
[IMG #1 Image]
and full source:
[LINK http://kevinbeason.com/tmp/forward.cpp]
I'm still working on the float vs. double precision issues.

[IMG #1]:Not scraped: https://web.archive.org/web/20110212052444im_/http://kevinbeason.com/tmp/5k_t.jpg

(L) [2008/12/19] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Ok, i see (you have to talk to me slow and without big words you know).
But the branching issue i was talking about earlier... let's put it that way, somewhere you have x live threads (a warp in NVidia buzzword parlance, or in fact half warp for that matter) which gets fed the same instruction and everything's fine unless that instruction is a divergent branch with different outcome within the confine of that (half) warp; and then it's all serialized to hell & back.
Basically that means you have to reorganize paths to get around that crap (if that sounds familiar it's because that's exactly what you'd also do for a good old SIMD pipeline).
In fact, the other day i've started to code what i said i wouldn't code (yeah, again), that is decomposing it into chained ray-gen, intersect, paths reorg+tinkering passes. Stalled a bit for now, because it's a pain in the behindment.
Tho, not having to deal with a variable number of rays is definitely a plus [SMILEY ;)]

(L) [2008/12/20] [simongreen] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

I've been working on my own CUDA port of smallpt. I hadn't checked this thread in a while, and now I see somebody else has beaten me to it!
I'm still in the process of debugging my version - I ran into some precision problems too (and the refraction is broken, hence the diffuse sphere), but I figured out how to remove the recursion and it basically works. The below image runs at about 2 fps on a GTX 280 (512 x 512 pixels, 64 samples):
cuda_smallpt.jpg
I used the CUDA random number generator from here:
[LINK http://www.amolf.nl/~vanmeel/mdgpu/about.html]
and put the scene data in GPU constant memory (which is probably faster than doing global loads).
Anyway, thanks for posting the original smallpt code. I'll post the CUDA version here once I've cleaned it up a bit!

(L) [2008/12/29] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

@simongreen: Thanks for pointing to that mdgpu rng. Refurbished.
Since all my previous magnificent ideas (trade memory for many small passes, reduce branch divergence on the fly and so on) proved to be worth squat, i've sniffed some more glue & tried something else.
Now that i'm down to 24 registers and 1.4s to render @512x512x64spp on my cheapo 8600GTS, CUDA occupancy calculator & I both think it should run decently on non-gimped hardware - accurate forecast to follow... 8600GTS/4 multiprocessors, GTX280/30 MP(+nuclear power plant) => 1.4*4/30 = 5.3571428571428571428571428571429 fps.
But i'm a bit bothered that it may require some adjustments to keep such beast busy. Hence this post and the attached archive, in which you'll find two win32 executables; they run in about 11.3s here @512x512x512spp (default).
NB: you'll find a .ppm - irfanview can display them - in the current dir once done; usage: -w width -h height -s spp

(L) [2008/12/30] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> simongreen wrote:I've been working on my own CUDA port of smallpt. I hadn't checked this thread in a while, and now I see somebody else has beaten me to it!
This is great! Shame on me for not replying sooner. For reference my Q6600 takes 33s (0.03 fps) for about the same image, so that's quite an improvement!!! Congratulations. Perhaps my non-recursive code (previous post) may be of some help with the glass?
>> simongreen wrote:I'll post the CUDA version here once I've cleaned it up a bit!
Would love to see it. Thank you for sharing.
>> tbp wrote:@simongreen: Thanks for pointing to that mdgpu rng. Refurbished.
Since all my previous magnificent ideas (trade memory for many small passes, reduce branch divergence on the fly and so on) proved to be worth squat, i've sniffed some more glue & tried something else.
Now that i'm down to 24 registers and 1.4s to render @512x512x64spp on my cheapo 8600GTS, CUDA occupancy calculator & I both think it should run decently on non-gimped hardware - accurate forecast to follow... 8600GTS/4 multiprocessors, GTX280/30 MP(+nuclear power plant) => 1.4*4/30 = 5.3571428571428571428571428571429 fps.
But i'm a bit bothered that it may require some adjustments to keep such beast busy. Hence this post and the attached archive, in which you'll find two win32 executables; they run in about 11.3s here @512x512x512spp (default).
NB: you'll find a .ppm - irfanview can display them - in the current dir once done; usage: -w width -h height -s spp
Sounds like you've made a vast improvement in speed. Sweet! I tried your binaries on my WinXP (non-admin user) Core2duo / 2GB / 8600 GTS but I just get two black images in 0.07s. Any ideas?

(L) [2008/12/30] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

My bet is kernels aren't running because your driver is a tad too old.
Thing is, i couldn't convince a CUDA 2.0 nvcc to compile my code without crapping out. So it now requires a 2.1 beta to compile and apparently what gets produced doesn't run on 2.0 enabled drivers (meaning i can't test it on linux atm). Great feature.
You don't (well, shouldn't) need any special driver, just one that provides adequate CUDA features, but that's where it gets fuzzy. By version 178.something it was CUDA 2.0 and 180.something should be about right for 2.1. I suppose i should RTFM and add some code to check that req instead of wasting your time.
edit: NVidia, knowing your WHQL compliant 180.48 driver "Supports CUDA" makes me all warm inside, but what about stating precisely which [expletive] version somewhere, like, in your [expletive] release notes. [expletive]. Beason, if you feel reckless and want to be sure not to miss that glimpse into the future, [LINK http://www.nvidia.com/object/cuda_get.html click].

(L) [2008/12/30] [GPU sux] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

One more reason to stay away from gfx cards, the drivers alway suck ... sometimes less and most of the time more [SMILEY [-o<]

(L) [2008/12/30] [lycium] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

works with latest driver, second-to-latest makes the black image.
pretty nifty demo, now if only it would add together passes (dividing by the total number of passes for display) to progressively converge! [SMILEY :D] i wonder how fast it could get a 1200x1200 converged image on my 2/3 year old 8800...

(L) [2008/12/30] [tbp] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Thanks for pinpointing it. [fist shake]
Well, yes, you could accumulate them but then you'd quickly run into precisions issues; and trading memory to get around that isn't an option on my cheapo card (in fact i had to make some contortions so i could render @4096x4096 within 256M). So the solution would be a tad more involved. (cranking up the wazoo that -s parameter only get you that far for the same reason, so either way we're screwed).
Hmm. In fact i've started to look at the problem the other way around, because if you don't ask for too many samples at once, even here, it's mostly real time and i'm firmly convinced 2009 killer app is totally bound to be a real time ball editor.

(L) [2009/01/10] [Hou_Qiming] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

Can't resist the urge of a BSGP port...
105 lines, 4096 samples at 8min26s on a GTX 280.
image.JPG
There are still some precision related artifacts, though:(
Code: [LINK # Select all]uses cubsgp;const{DIFF=0;SPEC=1;REFR=2;maxdep=200;BIG=2e3f;BIG2=60.f;}
#define Vec make_float3
#define Z Vec(0.f,0.f,0.f)
__both__ Sphere(&i,o,d,&tx, rad,p,e,c,refl)
eps=1e-4f;op=p-o;b=dot(op,d);det=b*b-sqr(op)+rad*rad
if det<0.f:return
det=sqrt(det)
t0=b-det-eps;t1=b+det-eps
t2=*(f32*)&min(*(u32*)&t0,*(u32*)&t1)
if *(u32*)&tx>*(u32*)&t2:
   tx=t2
   i.p=p;i.e=e;i.c=c;i.refl=refl
//Scene: radius, position, emission, color, material
__both__ Scene(&i,o,d)
tx=1e20f
Sphere(i,o,d,tx, BIG, Vec( BIG+1.f,40.8f,81.6f), Z,Vec(0.75f,0.25f,0.25f),DIFF)//Left
Sphere(i,o,d,tx, BIG, Vec(-BIG+99.f,40.8f,81.6f),Z,Vec(0.25f,0.25f,0.75f),DIFF)//Rght
Sphere(i,o,d,tx, BIG, Vec(50.f,40.8f, BIG),    Z,Vec(0.75f,0.75f,0.75f),DIFF)//Back
Sphere(i,o,d,tx, BIG, Vec(50.f,40.8f,-BIG+170.f), Z,Z,             DIFF)//Frnt
Sphere(i,o,d,tx, BIG, Vec(50.f, BIG, 81.6f),    Z,Vec(0.75f,0.75f,0.75f),DIFF)//Botm
Sphere(i,o,d,tx, BIG, Vec(50.f,-BIG+81.6f,81.6f),Z,Vec(0.75f,0.75f,0.75f),DIFF)//Top
Sphere(i,o,d,tx, 16.5f,Vec(27.f,16.5f,47.f),       Z,Vec(1.f,1.f,1.f)*0.999f, SPEC)//Mirr
Sphere(i,o,d,tx, 16.5f,Vec(73.f,16.5f,78.f),       Z,Vec(1.f,1.f,1.f)*0.999f, REFR)//Glas
Sphere(i,o,d,tx, BIG2, Vec(50.f,81.6f+BIG2-0.27f,81.6f),Vec(12.f,12.f,12.f),Z, DIFF) //Lite
return tx
__both__ toInt(x)=int(pow(max(min(x,1.f),0.f),1.f/2.2f)*255.f+0.5f)
__both__ erand48(&seed)
seed=seed*6364136223846793005Lu+1Lu
return *(f32*)&((((i32)(u32)(seed>>16))&0x3fffffff)|0x3f800000)-1.f
struct xsect
float3 p,e,c
int refl
__both__ radiance(o,d,Xi)
xsect obj
ret=Z;depth=0;mult=Vec(1.f,1.f,1.f)
for(;;)
   t=Scene(obj,o,d)
   if depth>=maxdep:t=1e20f
   if t>1e19f:return ret
   x=o+d*t;n=normalize(x-obj.p);nl=dot(n,d)<0.f?n:-n;f=obj.c;o=x
   p=max(max(f.x,f.y),f.z); ret+=mult*obj.e
   if ++depth>5:if erand48(Xi)<p:f/=p;else
      mult=Z
      depth=maxdep
   mult*=f
   if obj.refl==DIFF:
      r1=2.f*PI*erand48(Xi);r2=erand48(Xi);r2s=sqrt(r2);
      w=nl;u=normalize(cross((abs(w.x)>0.1f?Vec(0.f,1.f,0.f):Vec(1.f,0.f,0.f)),w))
      v=cross(w,u);
      d=normalize(cos(r1)*r2s*u+sin(r1)*r2s*v+sqrt(1.f-r2)*w);
   else
      drefl=d-2.f*dot(n,d)*n
      if obj.refl == SPEC:d=drefl;else
         into = dot(n,nl)>0.f;
         nc=1.f; nt=1.5f; nnt=into?nc/nt:nt/nc; ddn=dot(d,nl);
         cos2t=1.f-nnt*nnt*(1.f-ddn*ddn)
         if cos2t<0.f:d=drefl;else
            tdir=normalize(d*nnt-n*(f32(into*2-1)*(ddn*nnt+sqrt(cos2t))));
            a=nt-nc; b=nt+nc; R0=a*a/(b*b); c=1.f-(into?-ddn:dot(tdir,n));
            Re=R0+(1.f-R0)*c*c*c*c*c;Tr=1.f-Re;P=0.25f+0.5f*Re;RP=Re/P;TP=Tr/(1.f-P);
            if depth>2:
               Re=RP;Tr=TP
            if erand48(Xi)<P:{d=drefl;mult*=Re}else{d=tdir;mult*=Tr}
            o+=tdir*5.f
+
w=1024;h=768;samps=1;sampsi=1
if system.argc==2:
   read-sscanf(system.argv[1];samps);samps/=4 // # samples
if samps>512:{sampsi=samps/512;samps=512}
o=Vec(50.f,52.f,295.6f);d0=normalize(Vec(0.f,-0.042612f,-1.f)); // cam pos, dir
cx=Vec((float)w*0.4535f/(float)h,0.f,0.f)
cy=normalize(cross(cx,d0))*0.4535f
f=fopen("image.pfm","wb")
fprintf(f,"PF\n%d %d\n-1.000\n", w, h);
honce=max(4194304/(w*4*samps),1)
a=new dlist(float3)(honce*w*4*samps)
b=new dlist(int)(honce*w)
bc=new int[honce*w]
for y0=0:honce:h-1
   hi=min(honce,h-y0)
   spawn hi*w*4*samps
      require{thread.block.size=128}
      s=thread.rank;y=s/(w*4*samps);s-=y*(w*4*samps);y+=y0
      x=s/(4*samps);s-=x*(4*samps)
      sxy=s/samps;s-=sxy*samps;sx=f32(sxy&1);sy=f32((sxy>>1)&1)
      Xi=(u64)(u32)thread.rank
      r1=2.f*erand48(Xi);dx=r1<1.f?sqrt(r1)-1.f:1.f-sqrt(2.f-r1);
      r2=2.f*erand48(Xi);dy=r2<1.f?sqrt(r2)-1.f:1.f-sqrt(2.f-r2);
      d=(cx*(((sx+0.5f+dx)*0.5f+(f32)x)/(f32)w-0.5f)+
         cy*(((sy+0.5f+dy)*0.5f+(f32)y)/(f32)h-0.5f)+d0);
      r=Z;for j=0:sampsi-1{r+=radiance(o+d*140.f,normalize(d),Xi)}
      a[thread.rank]=r/(f32)sampsi
   spawn hi*w
      pt=thread.rank*(4*samps);ci=Z;r=Z
      for i=0:3
         for j=0:samps-1{r+=a[pt++]}
         r*=1.f/(float)samps
         For j=0:2{ci[j]+=max(min(r[j],1.f),0.f)*0.25f}
      cii=0;For j=0:2{cii+=toInt(ci[j])<<(j*8)}
      b[thread.rank]=cii
   cpyd2h(bc,b.d,hi*w*sizeof(int))
   for i=0:hi*w-1
      For j=0:2
         fwrite(&((float)((bc[i]>>j*8)&0xff)/255.f),sizeof(float),1,f)
fclose(f);csystem("image.pfm")

(L) [2009/01/13] [beason] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

>> Hou_Qiming wrote:Can't resist the urge of a BSGP port...
105 lines, 4096 samples at 8min26s on a GTX 280.

Very cool! Thank you for sharing. This is the first time I've seen BSGP. Very succinct and still very fast. 8m is a lot better than 120m. [SMILEY =D>]
>> Hou_Qiming wrote:There are still some precision related artifacts, though:(

This is a recurring problem... I guess I'll take another stab at converting to floats to see what can be done.
>> Hou_Qiming wrote:Code: [LINK # Select all] o+=tdir*5.f
+

What are these lines?

(L) [2009/01/14] [fpsunflower] [ smallpt: Global Illumination in 99 lines of C++] Wayback!

I tried out the code with all floats. Naively search/replacing double to float (and switching sqrt to sqrtf, etc ..) gives this:
[IMG #1 Image]
a few minor edits (no extra lines - though some did overflow the column boundary) gives the correct result (16spp only).
[IMG #2 Image]
This is with eps=0 and no scene scale adjustments.

[IMG #1]:Not scraped: https://web.archive.org/web/20110212052444im_/http://img211.imageshack.us/img211/100/naiveld8.png
[IMG #2]:Not scraped: https://web.archive.org/web/20110212052444im_/http://img239.imageshack.us/img239/7492/robustlu2.png

smallpt: Global Illumination in 99 lines of C++ back

Board: Board index ‹ Ray tracing ‹ Tools, demos & sources