From 583cd9b417bbbda308205cc54237d18e97fde558 Mon Sep 17 00:00:00 2001
From: Kevin Trogant <kevin@trogant.de>
Date: Tue, 9 May 2023 13:04:14 +0200
Subject: [PATCH] test SIMD raytracing

---
 bin/defocus.c                |  25 ++++-
 include/defocus/base.h       |  56 ++++++++----
 include/defocus/camera.h     |  28 +-----
 include/defocus/image.h      |  14 +--
 include/defocus/models.h     |   4 +-
 include/defocus/raytracing.h |  29 ++++++
 lib/camera.c                 | 109 +++++++++++++++++++++-
 lib/image.c                  |  14 +--
 lib/log.c                    |   2 +-
 lib/math.c                   | 173 ++++++++++++++++++++++++++---------
 lib/raytracing.c             |  51 +++++++++++
 meson.build                  |  21 +++--
 12 files changed, 410 insertions(+), 116 deletions(-)
 create mode 100644 include/defocus/raytracing.h
 create mode 100644 lib/raytracing.c

diff --git a/bin/defocus.c b/bin/defocus.c
index 23cbe67..07e2ad2 100644
--- a/bin/defocus.c
+++ b/bin/defocus.c
@@ -3,15 +3,19 @@
 #include <stdlib.h>
 #include <defocus/defocus.h>
 
+#include <defocus/camera.h>
+
 int pinhole_fn(int argc, char **argv);
 int thin_lense_fn(int argc, char **argv);
+int test_fn(int argc, char **argv);
 
-static const char *_model_names[] = {"pinhole", "thin_lense"};
+static const char *_model_names[] = {"pinhole", "thin_lense", "test"};
 typedef int (*model_fn)(int argc, char **argv);
 
 static model_fn _model_fns[] = {
     pinhole_fn,
     thin_lense_fn,
+    test_fn,
 };
 
 void usage(const char *pname)
@@ -25,7 +29,6 @@ void usage(const char *pname)
 
 int main(int argc, char **argv)
 {
-
     if (argc < 2) {
         fprintf(stderr, "Missing model name!\n");
         usage(argv[0]);
@@ -218,4 +221,22 @@ int thin_lense_fn(int argc, char **argv)
     df_release_image(out_image);
 
     return error_code;
+}
+
+int test_fn( int argc, char **argv ) {
+
+    df_v3 v = {1.f, 2.f, 3.f};
+    df_m4 t = df_translate(-1.f, 2.f, 1.5f);
+    t = df_inverse_transform(t);
+
+    df_v3 tv = df_transform_v3(t, v);
+
+    df_camera_i cam = df_create_perspective_camera(1024.0, 1024.0, 1024.0, 1024.0, 2.e-4f, 1000.f, 350.0, 21.0);
+    df_ray_packet packet = cam.build_ray_packet(cam.o);
+    df_evaluate_ray_packet(&packet);
+
+    df_release_ray_packet(&packet);
+    
+    cam.release(cam.o);
+    return 0;
 }
\ No newline at end of file
diff --git a/include/defocus/base.h b/include/defocus/base.h
index e8fa3fb..dafec37 100644
--- a/include/defocus/base.h
+++ b/include/defocus/base.h
@@ -38,6 +38,20 @@ typedef enum
 /** @brief Zero an array */
 #define DF_ZERO_ARRAY(a, n) DF_ZERO_MEMORY(a, sizeof((a)[0]) * (n))
 
+#ifdef _MSC_VER
+#ifdef __cplusplus__
+#define DF_API extern "C" __declspec(dllexport)
+#else
+#define DF_API __declspec(dllexport)
+#endif /* _MSC_VER && __cplusplus__ */
+#else
+#ifdef __cplusplus__
+#define DF_API extern "C"
+#else
+#define DF_API
+#endif
+#endif
+
 /* Simple logging function */
 
 #ifndef DF_ENABLE_LOGGING
@@ -69,7 +83,7 @@ enum
  * @param line line number of the log location
  * @param fmt format string of the message.
  */
-void df_log_impl(int level, const char *file, int line, const char *fmt, ...);
+DF_API void df_log_impl(int level, const char *file, int line, const char *fmt, ...);
 
 #define df_log_verbose(...) df_log_impl(df_log_level_verbose, __FILE__, __LINE__, __VA_ARGS__)
 #define df_log_info(...) df_log_impl(df_log_level_info, __FILE__, __LINE__, __VA_ARGS__)
@@ -112,7 +126,7 @@ typedef union {
  * @param t interpolation factor
  * @return a + (b - a) * t
  */
-df_color df_lerp_color(df_color a, df_color b, double t);
+DF_API df_color df_lerp_color(df_color a, df_color b, double t);
 
 /** @brief 2d vector */
 typedef union {
@@ -130,22 +144,22 @@ typedef union {
 } df_v2;
 
 /** @brief Add two 2d vectors */
-df_v2 df_add_v2(df_v2 a, df_v2 b);
+DF_API df_v2 df_add_v2(df_v2 a, df_v2 b);
 
 /** @brief Subtract two 2d vectors */
-df_v2 df_sub_v2(df_v2 a, df_v2 b);
+DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b);
 
 /** @brief Calculate the dot product of 2d vectors a and b.
  */
-float df_dot_v2(df_v2 a, df_v2 b);
+DF_API float df_dot_v2(df_v2 a, df_v2 b);
 
 /** @brief Multiply a 2d vector with a scalar.
  */
-df_v2 df_mul_v2(float t, df_v2 v);
+DF_API df_v2 df_mul_v2(float t, df_v2 v);
 
 /** @brief Returns the normalized version of a 2d vector v
  */
-df_v2 df_normalize_v2(df_v2 v);
+DF_API df_v2 df_normalize_v2(df_v2 v);
 
 /** @brief 3d vector */
 typedef union {
@@ -159,22 +173,22 @@ typedef union {
 } df_v3;
 
 /** @brief Add two 3d vectors */
-df_v3 df_add_v3(df_v3 a, df_v3 b);
+DF_API df_v3 df_add_v3(df_v3 a, df_v3 b);
 
 /** @brief Subtract two 3d vectors */
-df_v3 df_sub_v3(df_v3 a, df_v3 b);
+DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b);
 
 /** @brief Calculate the dot product of 3d vectors a and b.
  */
-float df_dot_v3(df_v3 a, df_v3 b);
+DF_API float df_dot_v3(df_v3 a, df_v3 b);
 
 /** @brief Multiply a 3d vector with a scalar.
  */
-df_v3 df_mul_v3(float t, df_v3 v);
+DF_API df_v3 df_mul_v3(float t, df_v3 v);
 
 /** @brief Returns the normalized version of a 3d vector v
  */
-df_v3 df_normalize_v3(df_v3 v);
+DF_API df_v3 df_normalize_v3(df_v3 v);
 
 /** @brief A plane in 3d space.
  *
@@ -221,7 +235,7 @@ typedef struct
 } df_line_plane_intersection;
 
 /** @brief Calculate the intersection between a line and a plane in 3d space */
-df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane);
+DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane);
 
 /** @brief A 4x4 matrix.
  *
@@ -239,22 +253,26 @@ typedef struct df_m4
 #define DF_M4_AT(m, row, col) ((m).e[(row)*4 + (col)])
 
 /** @brief Matrix multiply 4x4 matrix a and b */
-df_m4 df_mul_m4(df_m4 a, df_m4 b);
+DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b);
 
 /** @brief Get a scale matrix */
-df_m4 df_scale(float x, float y, float z);
+DF_API df_m4 df_scale(float x, float y, float z);
 
-df_m4 df_translate(float x, float y, float z);
+DF_API df_m4 df_translate(float x, float y, float z);
 
 /** @brief Transform (i.e. multiply) a 3d vector v by the transformation matrix T */
-df_v3 df_transform_v3(df_m4 T, df_v3 v);
+DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v);
 
 /** @brief Calculate the inverse of a non-scaling transform matrix.
  *
  * Special fast case.
  */
-df_m4 df_inverse_transform_no_scale(df_m4 M);
+DF_API df_m4 df_inverse_transform_no_scale(df_m4 M);
 
 /** @brief Calculate the inverse of a transform matrix */
-df_m4 df_inverse_transform(df_m4 M);
+DF_API df_m4 df_inverse_transform(df_m4 M);
+
+/** @brief Calculate the inverse of a general (invertible) 4x4 matrix */
+DF_API df_m4 df_inverse(df_m4 M);
+
 #endif
\ No newline at end of file
diff --git a/include/defocus/camera.h b/include/defocus/camera.h
index 21a0657..7980cd5 100644
--- a/include/defocus/camera.h
+++ b/include/defocus/camera.h
@@ -2,34 +2,12 @@
 #define DEFOCUS_CAMERA_H
 
 #include "base.h"
+#include "raytracing.h"
 
 /** @file camera.h
  * @brief basic camera functions
  */
 
-/** @brief Stores information for rays in a SIMD friendly way */
-typedef struct
-{
-    /** Packs all SIMD data into a single buffer */
-    float *simd_mem;
-
-    /** Source uvs for rays */
-    df_v2 *ray_uvs;
-
-    /* Points into simd_mem */
-    float *base_x;
-    float *base_y;
-    float *base_z;
-    float *dir_x;
-    float *dir_y;
-    float *dir_z;
-
-    size_t ray_count;
-} df_ray_packet;
-
-/** @brief Free ray packet memory */
-void df_release_ray_packet(df_ray_packet *rays);
-
 /** @brief Interface for cameras. */
 typedef struct
 {
@@ -44,7 +22,7 @@ typedef struct
     void *o;
 } df_camera_i;
 
-df_camera_i
-df_create_perspective_camera(float image_width, float image_height, float raster_width, float raster_height);
+DF_API df_camera_i df_create_perspective_camera(
+    float image_width, float image_height, float raster_width, float raster_height, float far_dist, float near_dist, float focal_dist, float lens_radius);
 
 #endif
\ No newline at end of file
diff --git a/include/defocus/image.h b/include/defocus/image.h
index 216ed68..72ce747 100644
--- a/include/defocus/image.h
+++ b/include/defocus/image.h
@@ -18,7 +18,7 @@ typedef struct df_image df_image;
  * @param out_image receives the image object
  * @return error code
  */
-df_result df_create_image(int w, int h, df_image **out_image);
+DF_API df_result df_create_image(int w, int h, df_image **out_image);
 
 /** @brief load an image file
  *
@@ -29,34 +29,34 @@ df_result df_create_image(int w, int h, df_image **out_image);
  * @param out_image receives the image object
  * @return error code
  */
-df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image);
+DF_API df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image);
 
 /** @brief Write an image to a PNG file
  * @param img the image
  * @param path the path
  */
-df_result df_write_image(df_image *img, const char *path);
+DF_API df_result df_write_image(df_image *img, const char *path);
 
 /** @brief Free an image.
  *
  * Any pointer to the image will be invalid after this.
  * @param img the image
  */
-void df_release_image(df_image *img);
+DF_API void df_release_image(df_image *img);
 
 /** @brief Returns the dimensions of the image */
-void df_get_image_size(const df_image *image, int *w, int *h);
+DF_API void df_get_image_size(const df_image *image, int *w, int *h);
 
 /** @brief Returns the color value at pixel coordinates x, y
  *
  * Returns black for coordinates outside the image.
  */
-df_color df_get_image_pixel(const df_image *image, int x, int y);
+DF_API df_color df_get_image_pixel(const df_image *image, int x, int y);
 
 /** @brief Set the color value at pixel coordinates x, y.
  *
  * Does nothing for coordinates outside the image.
  */
-void df_set_image_pixel(df_image *image, int x, int y, df_color c);
+DF_API void df_set_image_pixel(df_image *image, int x, int y, df_color c);
 
 #endif
diff --git a/include/defocus/models.h b/include/defocus/models.h
index 6f7320a..b172c73 100644
--- a/include/defocus/models.h
+++ b/include/defocus/models.h
@@ -44,7 +44,7 @@ typedef struct df_pinhole_params
  * @param in_image input image.
  * @param out_image the output image.
  */
-void df_pinhole(df_pinhole_params params, const df_image *in_image, df_image *out_image);
+DF_API void df_pinhole(df_pinhole_params params, const df_image *in_image, df_image *out_image);
 
 /** Parameters for the thin lense model.
  */
@@ -72,6 +72,6 @@ typedef struct df_thin_lense_params
  * @param in_image input image
  * @param out_image the output image.
  */
-void df_thin_lense(df_thin_lense_params params, const df_image *in_image, df_image *out_image);
+DF_API void df_thin_lense(df_thin_lense_params params, const df_image *in_image, df_image *out_image);
 
 #endif
diff --git a/include/defocus/raytracing.h b/include/defocus/raytracing.h
new file mode 100644
index 0000000..9406173
--- /dev/null
+++ b/include/defocus/raytracing.h
@@ -0,0 +1,29 @@
+#ifndef DF_RAYTRACING_H
+#define DF_RAYTRACING_H
+
+/** @brief Stores information for rays in a SIMD friendly way */
+typedef struct
+{
+    /** Packs all SIMD data into a single buffer */
+    float *simd_mem;
+
+    /** Source uvs for rays */
+    df_v2 *ray_uvs;
+
+    /* Points into simd_mem */
+    float *base_x;
+    float *base_y;
+    float *base_z;
+    float *dir_x;
+    float *dir_y;
+    float *dir_z;
+
+    size_t ray_count;
+} df_ray_packet;
+
+/** @brief Free ray packet memory */
+DF_API void df_release_ray_packet(df_ray_packet *rays);
+
+DF_API void df_evaluate_ray_packet(const df_ray_packet *rays);
+
+#endif
diff --git a/lib/camera.c b/lib/camera.c
index 988951e..6790920 100644
--- a/lib/camera.c
+++ b/lib/camera.c
@@ -1,6 +1,12 @@
 #include <defocus/camera.h>
 
 #include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
 
 /* ********************************************************
  *
@@ -8,7 +14,7 @@
  *
  * ********************************************************/
 
-void df_release_ray_packet(df_ray_packet *rays)
+DF_API void df_release_ray_packet(df_ray_packet *rays)
 {
     free(rays->simd_mem);
     free(rays->ray_uvs);
@@ -26,18 +32,95 @@ typedef struct
 {
     float focal_dist;
     float lens_radius;
+    
+    /* Raster space, i.e. the space for which we generate rays */
+    float raster_width;
+    float raster_height;
+
+    /* Image(=screen) space, i.e. the space in which we store pixels */
+    float image_width;
+    float image_height;
 
     df_m4 screen_to_raster;
     df_m4 raster_to_screen;
     df_m4 raster_to_camera;
-
 } df_perspective_camera;
 
 static void pc_release(void *o) { df_perspective_camera *camera = o; }
 
-static df_ray_packet pc_build_ray_packet(void *o) { df_perspective_camera *camera = o; }
+static df_ray_packet pc_build_ray_packet(void *o)
+{
+    df_perspective_camera *camera = o;
+    df_ray_packet packet;
+    memset(&packet, 0, sizeof(packet));
 
-df_camera_i df_create_perspective_camera(float image_width, float image_height, float raster_width, float raster_height)
+    /* Generate 1 ray per pixel (ignore lens for now) */
+    size_t count = (size_t)camera->raster_width * (size_t)camera->raster_height;
+
+    size_t alloc_count = count;
+    /* Round up to nearest multiple of 4, for SSE.
+     * Also satisfies 16 byte alignment (assuming simd_mem is 16 byte aligned)
+     * because 4 * sizeof(float) = 16.
+     */
+    if ((alloc_count % 4) != 0) {
+        alloc_count = ((alloc_count + 3) / 4) * 4;
+    }
+
+#ifdef _MSC_VER
+    packet.simd_mem = _aligned_malloc(sizeof(float) * alloc_count * 6, 16);
+#elif defined(_ISOC11_SOURCE) /* Feature test macro for GCC */
+    packet.simd_mem = aligned_alloc(16, sizeof(float) * alloc_count * 6);
+#else
+    /* Fall back to regular malloc and hope for the best */
+    packet.simd_mem = malloc(sizeof(float) * alloc_count * 6);
+#endif
+
+    packet.base_x = packet.simd_mem;
+    packet.base_y = packet.base_x + alloc_count;
+    packet.base_z = packet.base_y + alloc_count;
+    packet.dir_x = packet.base_z + alloc_count;
+    packet.dir_y = packet.dir_x + alloc_count;
+    packet.dir_z = packet.dir_y + alloc_count;
+
+    packet.ray_uvs = malloc(sizeof(df_v2) * count);
+
+    packet.ray_count = count;
+
+    size_t i = 0;
+    for (float y = 0; y < camera->raster_height; y += 1.f) {
+        for (float x = 0; x < camera->raster_width; x += 1.f) {
+            packet.base_x[i] = 0.f;
+            packet.base_y[i] = 0.f;
+            packet.base_z[i] = 0.f;
+
+            df_v3 raster_p = {x, y, 0.f};
+            df_v3 camera_p = df_transform_v3(camera->raster_to_camera, raster_p);
+            df_v3 dir = df_normalize_v3(camera_p);
+
+            packet.dir_x[i] = dir.x;
+            packet.dir_y[i] = dir.y;
+            packet.dir_z[i] = dir.z;
+
+            df_v3 img_p = df_transform_v3(camera->raster_to_screen, raster_p);
+            packet.ray_uvs[i].u = img_p.x / camera->image_width;
+            packet.ray_uvs[i].v = img_p.y / camera->image_height;
+
+            ++i;
+            assert(i <= count);
+        }
+    }
+
+    return packet;
+}
+
+DF_API df_camera_i df_create_perspective_camera(float image_width,
+                                                float image_height,
+                                                float raster_width,
+                                                float raster_height,
+                                                float far_dist,
+                                                float near_dist,
+                                                float focal_dist,
+                                                float lens_radius)
 {
     df_perspective_camera *camera = malloc(sizeof(*camera));
 
@@ -45,9 +128,25 @@ df_camera_i df_create_perspective_camera(float image_width, float image_height,
     camera->screen_to_raster =
         df_mul_m4(camera->screen_to_raster, df_scale(1.f / image_width, -1.f / image_height, 1.f));
     camera->screen_to_raster = df_mul_m4(camera->screen_to_raster, df_translate(0.f, -image_height, 0.f));
-
     camera->raster_to_screen = df_inverse_transform(camera->screen_to_raster);
 
+    /* Perspective projection matrix */
+    df_m4 persp = {0.f};
+    DF_M4_AT(persp, 0, 0) = 1.f;
+    DF_M4_AT(persp, 1, 1) = 1.f;
+    DF_M4_AT(persp, 2, 2) = far_dist / (far_dist - near_dist);
+    DF_M4_AT(persp, 2, 3) = -1.f * far_dist * near_dist / (far_dist - near_dist);
+    DF_M4_AT(persp, 3, 2) = 1.f;
+
+    camera->raster_to_camera = df_mul_m4(df_inverse(persp), camera->raster_to_screen);
+
+    camera->focal_dist = focal_dist;
+    camera->lens_radius = lens_radius;
+    camera->raster_width = raster_width;
+    camera->raster_height = raster_height;
+    camera->image_width = image_width;
+    camera->image_height = image_height;
+
     df_camera_i iface = {.release = pc_release, .build_ray_packet = pc_build_ray_packet, .o = camera};
     return iface;
 }
diff --git a/lib/image.c b/lib/image.c
index 46570ed..84f59b2 100644
--- a/lib/image.c
+++ b/lib/image.c
@@ -17,7 +17,7 @@ struct df_image
     uint8_t *pixels;
 };
 
-df_result df_create_image(int w, int h, df_image **out_image)
+DF_API df_result df_create_image(int w, int h, df_image **out_image)
 {
     df_image *img = malloc(sizeof(df_image));
     if (!img)
@@ -37,7 +37,7 @@ df_result df_create_image(int w, int h, df_image **out_image)
     return df_result_success;
 }
 
-df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image)
+DF_API df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image)
 {
     int w, h, c;
     stbi_uc *pixels = stbi_load(path, &w, &h, &c, 4);
@@ -69,7 +69,7 @@ out:
     return res;
 }
 
-void df_release_image(df_image *img)
+DF_API void df_release_image(df_image *img)
 {
     if (img) {
         free(img->pixels);
@@ -77,7 +77,7 @@ void df_release_image(df_image *img)
     }
 }
 
-df_result df_write_image(df_image *image, const char *path)
+DF_API df_result df_write_image(df_image *image, const char *path)
 {
     df_result res = stbi_write_png(path, image->width, image->height, 4, image->pixels, image->width * 4)
                         ? df_result_success
@@ -85,7 +85,7 @@ df_result df_write_image(df_image *image, const char *path)
     return res;
 }
 
-void df_get_image_size(const df_image *image, int *w, int *h)
+DF_API void df_get_image_size(const df_image *image, int *w, int *h)
 {
     if (w)
         *w = image->width;
@@ -93,7 +93,7 @@ void df_get_image_size(const df_image *image, int *w, int *h)
         *h = image->height;
 }
 
-df_color df_get_image_pixel(const df_image *image, int x, int y)
+DF_API df_color df_get_image_pixel(const df_image *image, int x, int y)
 {
     df_color c = {0, 0, 0, 255};
     if (x >= 0 && x < image->width && y >= 0 && y < image->height) {
@@ -102,7 +102,7 @@ df_color df_get_image_pixel(const df_image *image, int x, int y)
     return c;
 }
 
-void df_set_image_pixel(df_image *image, int x, int y, df_color c)
+DF_API void df_set_image_pixel(df_image *image, int x, int y, df_color c)
 {
     if (x >= 0 && x < image->width && y >= 0 && y < image->height) {
         memcpy(&image->pixels[4 * (y * image->width + x)], &c.e[0], 4);
diff --git a/lib/log.c b/lib/log.c
index 8032c70..7d54533 100644
--- a/lib/log.c
+++ b/lib/log.c
@@ -4,7 +4,7 @@
 
 static const char *log_level_names[] = {"VERBOSE", "INFO", "WARN", "ERROR"};
 
-void df_log_impl(int level, const char *file, int line, const char *fmt, ...)
+DF_API void df_log_impl(int level, const char *file, int line, const char *fmt, ...)
 {
     va_list ap;
     va_start(ap, fmt);
diff --git a/lib/math.c b/lib/math.c
index 204d85b..5d1017d 100644
--- a/lib/math.c
+++ b/lib/math.c
@@ -5,27 +5,27 @@
 #include <immintrin.h>
 #include <pmmintrin.h>
 
-df_v2 df_add_v2(df_v2 a, df_v2 b)
+DF_API df_v2 df_add_v2(df_v2 a, df_v2 b)
 {
     df_v2 v = {a.x + b.x, a.y + b.y};
     return v;
 }
 
-df_v2 df_sub_v2(df_v2 a, df_v2 b)
+DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b)
 {
     df_v2 v = {a.x - b.x, a.y - b.y};
     return v;
 }
 
-float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; }
+DF_API float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; }
 
-df_v2 df_mul_v2(float t, df_v2 v)
+DF_API df_v2 df_mul_v2(float t, df_v2 v)
 {
     df_v2 r = {t * v.x, t * v.y};
     return r;
 }
 
-df_v2 df_normalize_v2(df_v2 v)
+DF_API df_v2 df_normalize_v2(df_v2 v)
 {
     float len_square = df_dot_v2(v, v);
     float len = sqrtf(len_square);
@@ -33,27 +33,27 @@ df_v2 df_normalize_v2(df_v2 v)
     return n;
 }
 
-df_v3 df_add_v3(df_v3 a, df_v3 b)
+DF_API df_v3 df_add_v3(df_v3 a, df_v3 b)
 {
     df_v3 v = {a.x + b.x, a.y + b.y, a.z + b.z};
     return v;
 }
 
-df_v3 df_sub_v3(df_v3 a, df_v3 b)
+DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b)
 {
     df_v3 v = {a.x - b.x, a.y - b.y, a.z - b.z};
     return v;
 }
 
-float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
+DF_API float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
 
-df_v3 df_mul_v3(float t, df_v3 v)
+DF_API df_v3 df_mul_v3(float t, df_v3 v)
 {
     df_v3 r = {t * v.x, t * v.y, t * v.z};
     return r;
 }
 
-df_v3 df_normalize_v3(df_v3 v)
+DF_API df_v3 df_normalize_v3(df_v3 v)
 {
     float len_square = df_dot_v3(v, v);
     float len = sqrtf(len_square);
@@ -61,7 +61,7 @@ df_v3 df_normalize_v3(df_v3 v)
     return n;
 }
 
-df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)
+DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)
 {
     /* check case */
     float dot = df_dot_v3(line.direction, plane.normal);
@@ -90,7 +90,7 @@ df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plan
     }
 }
 
-df_m4 df_mul_m4(df_m4 a, df_m4 b)
+DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b)
 {
     /* Super simple, we could probably do it a lot better via SIMD. */
     df_m4 p;
@@ -105,7 +105,7 @@ df_m4 df_mul_m4(df_m4 a, df_m4 b)
     return p;
 }
 
-df_m4 df_scale(float x, float y, float z)
+DF_API df_m4 df_scale(float x, float y, float z)
 {
     /* clang-format off */
     df_m4 s = {{
@@ -118,7 +118,7 @@ df_m4 df_scale(float x, float y, float z)
     return s;
 }
 
-df_m4 df_translate(float x, float y, float z)
+DF_API df_m4 df_translate(float x, float y, float z)
 {
     /* clang-format off */
     df_m4 t = {{
@@ -153,7 +153,7 @@ static float hsum(__m128 v)
     return _mm_cvtss_f32(sum);
 }
 
-df_v3 df_transform_v3(df_m4 T, df_v3 v)
+DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v)
 {
     df_v3 transf;
     _Alignas(16) float tmp_v[4] = {v.x, v.y, v.z, 1.f};
@@ -182,56 +182,145 @@ df_v3 df_transform_v3(df_m4 T, df_v3 v)
  * | R T |
  * | 0 1 |
  */
-df_m4 df_inverse_transform_no_scale(df_m4 M)
+DF_API df_m4 df_inverse_transform_no_scale(df_m4 M)
 {
     df_m4 I = {0.f};
 
-    /* transpose 3x3, we know that m03 = m13 = m23 = 0 */
+    /* transpose 3x3 */
     __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
     __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
     I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */
     I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
     I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */
 
-    /* last */
-    I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));
-    I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));
-    I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));
-    I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);
+    __m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f);
+    DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t));
+
+    DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t));
+    DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t));
+    I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f);
 
     return I;
 }
 
-df_m4 df_inverse_transform(df_m4 M)
+DF_API df_m4 df_inverse_transform(df_m4 M)
 {
-#define SMALL_NUMBER (1.e-8f)
-
     df_m4 I = {0.f};
 
-    /* transpose 3x3, we know that m03 = m13 = m23 = 0 */
+    /* transpose 3x3 */
     __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
     __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
     I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */
     I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
     I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */
 
-    /* divide by the squared scale */
-    __m128 size_sqr = _mm_mul_ps(I.vec[0], I.vec[0]);
-    size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[1], I.vec[1]));
-    size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[2], I.vec[2]));
+    __m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f);
+    DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t));
 
-    __m128 r_size_sqr = _mm_div_ps(_mm_set1_ps(1.f), size_sqr);
-
-    I.vec[0] = _mm_mul_ps(I.vec[0], r_size_sqr);
-    I.vec[1] = _mm_mul_ps(I.vec[1], r_size_sqr);
-    I.vec[2] = _mm_mul_ps(I.vec[2], r_size_sqr);
-
-    /* last */
-    I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));
-    I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));
-    I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));
-    I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);
+    DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t));
+    DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t));
+    I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f);
 
     return I;
-#undef SMALL_NUMBER
+}
+
+
+
+// for row major matrix
+// we use __m128 to represent 2x2 matrix as A = | A0  A1 |
+//                                              | A2  A3 |
+// 2x2 row major Matrix multiply A*B
+static __forceinline __m128 mat2_mul(__m128 vec1, __m128 vec2)
+{
+    return _mm_add_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 0, 3, 0, 3)),
+                      _mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1)));
+}
+// 2x2 row major Matrix adjugate multiply (A#)*B
+static __forceinline __m128 mat2_adj_mul(__m128 vec1, __m128 vec2)
+{
+    return _mm_sub_ps(_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 3, 3, 0, 0), vec2),
+                      _mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 1, 2, 2), DF_VEC_SWIZZLE(vec2, 2, 3, 0, 1)));
+}
+// 2x2 row major Matrix multiply adjugate A*(B#)
+static __forceinline __m128 mat2_mul_adj(__m128 vec1, __m128 vec2)
+{
+    return _mm_sub_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 3, 0, 3, 0)),
+                      _mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1)));
+}
+
+DF_API df_m4 df_inverse( df_m4 M ) {
+    // use block matrix method
+    // A is a matrix, then i(A) or iA means inverse of A, A# (or A_ in code) means adjugate of A, |A| (or detA in code)
+    // is determinant, tr(A) is trace
+
+    // sub matrices
+    __m128 A = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]);
+    __m128 B = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]);
+    __m128 C = DF_VEC_SHUFFLE_0101(M.vec[2], M.vec[3]);
+    __m128 D = DF_VEC_SHUFFLE_2323(M.vec[2], M.vec[3]);
+
+#if 0
+	__m128 detA = _mm_set1_ps(M.m[0][0] * M.m[1][1] - M.m[0][1] * M.m[1][0]);
+	__m128 detB = _mm_set1_ps(M.m[0][2] * M.m[1][3] - M.m[0][3] * M.m[1][2]);
+	__m128 detC = _mm_set1_ps(M.m[2][0] * M.m[3][1] - M.m[2][1] * M.m[3][0]);
+	__m128 detD = _mm_set1_ps(M.m[2][2] * M.m[3][3] - M.m[2][3] * M.m[3][2]);
+#else
+    // determinant as (|A| |B| |C| |D|)
+    __m128 detSub = _mm_sub_ps(
+        _mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 0, 2, 0, 2), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 1, 3, 1, 3)),
+        _mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 1, 3, 1, 3), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 0, 2, 0, 2)));
+    __m128 detA = DF_VEC_SWIZZLE1(detSub, 0);
+    __m128 detB = DF_VEC_SWIZZLE1(detSub, 1);
+    __m128 detC = DF_VEC_SWIZZLE1(detSub, 2);
+    __m128 detD = DF_VEC_SWIZZLE1(detSub, 3);
+#endif
+
+    // let iM = 1/|M| * | X  Y |
+    //                  | Z  W |
+
+    // D#C
+    __m128 D_C = mat2_adj_mul(D, C);
+    // A#B
+    __m128 A_B = mat2_adj_mul(A, B);
+    // X# = |D|A - B(D#C)
+    __m128 X_ = _mm_sub_ps(_mm_mul_ps(detD, A), mat2_mul(B, D_C));
+    // W# = |A|D - C(A#B)
+    __m128 W_ = _mm_sub_ps(_mm_mul_ps(detA, D), mat2_mul(C, A_B));
+
+    // |M| = |A|*|D| + ... (continue later)
+    __m128 detM = _mm_mul_ps(detA, detD);
+
+    // Y# = |B|C - D(A#B)#
+    __m128 Y_ = _mm_sub_ps(_mm_mul_ps(detB, C), mat2_mul_adj(D, A_B));
+    // Z# = |C|B - A(D#C)#
+    __m128 Z_ = _mm_sub_ps(_mm_mul_ps(detC, B), mat2_mul_adj(A, D_C));
+
+    // |M| = |A|*|D| + |B|*|C| ... (continue later)
+    detM = _mm_add_ps(detM, _mm_mul_ps(detB, detC));
+
+    // tr((A#B)(D#C))
+    __m128 tr = _mm_mul_ps(A_B, DF_VEC_SWIZZLE(D_C, 0, 2, 1, 3));
+    tr = _mm_hadd_ps(tr, tr);
+    tr = _mm_hadd_ps(tr, tr);
+    // |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C)
+    detM = _mm_sub_ps(detM, tr);
+
+    const __m128 adjSignMask = _mm_setr_ps(1.f, -1.f, -1.f, 1.f);
+    // (1/|M|, -1/|M|, -1/|M|, 1/|M|)
+    __m128 rDetM = _mm_div_ps(adjSignMask, detM);
+
+    X_ = _mm_mul_ps(X_, rDetM);
+    Y_ = _mm_mul_ps(Y_, rDetM);
+    Z_ = _mm_mul_ps(Z_, rDetM);
+    W_ = _mm_mul_ps(W_, rDetM);
+
+    df_m4 r;
+
+    // apply adjugate and store, here we combine adjugate shuffle and store shuffle
+    r.vec[0] = DF_VEC_SHUFFLE(X_, Y_, 3, 1, 3, 1);
+    r.vec[1] = DF_VEC_SHUFFLE(X_, Y_, 2, 0, 2, 0);
+    r.vec[2] = DF_VEC_SHUFFLE(Z_, W_, 3, 1, 3, 1);
+    r.vec[3] = DF_VEC_SHUFFLE(Z_, W_, 2, 0, 2, 0);
+
+    return r;
 }
diff --git a/lib/raytracing.c b/lib/raytracing.c
new file mode 100644
index 0000000..79901d0
--- /dev/null
+++ b/lib/raytracing.c
@@ -0,0 +1,51 @@
+#include <defocus/camera.h>
+
+#include <immintrin.h>
+
+DF_API void df_evaluate_ray_packet(const df_ray_packet *rays) {
+    const __m128 *base_x = (const __m128 *)rays->base_x; 
+    const __m128 *base_y = (const __m128 *)rays->base_y;
+    const __m128 *base_z = (const __m128 *)rays->base_z;
+    
+    const __m128 *dir_x = (const __m128 *)rays->dir_x;
+    const __m128 *dir_y = (const __m128 *)rays->dir_y;
+    const __m128 *dir_z = (const __m128 *)rays->dir_z;
+
+    /* Simple test: Let rays intersect with plane at z = 350.0 */
+
+    float PLANE_Z = 350.0f;
+
+    __m128 plane_z = _mm_set1_ps(PLANE_Z);
+
+    size_t ray_count = rays->ray_count;
+
+    /* TODO(kevin): divide to multiple threads */
+    for (size_t i = 0; i < ray_count; i += 4) {
+        __m128 rays_base_x = base_x[i];
+        __m128 rays_base_y = base_y[i];
+        __m128 rays_base_z = base_z[i];
+        __m128 rays_dir_x = dir_x[i];
+        __m128 rays_dir_y = dir_y[i];
+        __m128 rays_dir_z = dir_z[i];
+
+        /* Solve for t: base.z + t * dir.z = plane_z 
+         *              t = (plane_z - base.z) / dir.z 
+         */
+        __m128 delta = _mm_sub_ps(plane_z, rays_base_z);
+        __m128 t = _mm_div_ps(delta, rays_dir_z);
+
+        /* Sample p = base.z + t * dir */
+        __m128 sample_p_x = _mm_mul_ps(t, rays_dir_x);
+        __m128 sample_p_y = _mm_mul_ps(t, rays_dir_y);
+        __m128 sample_p_z = _mm_mul_ps(t, rays_dir_z);
+        sample_p_x = _mm_add_ps(sample_p_x, rays_base_x);
+        sample_p_y = _mm_add_ps(sample_p_y, rays_base_y);
+        sample_p_z = _mm_add_ps(sample_p_z, rays_base_z);
+
+    }
+
+    /* Handle remaining (< 4) rays */
+    if ((ray_count % 4) != 0) {
+    
+    }
+}
\ No newline at end of file
diff --git a/meson.build b/meson.build
index 47460e2..c5dd12a 100644
--- a/meson.build
+++ b/meson.build
@@ -5,7 +5,9 @@ incdir = include_directories('include', '3p')
 cc = meson.get_compiler('c')
 m_dep = cc.find_library('m', required: false)
 
-add_project_arguments([ '-msse3', '-Wno-missing-braces' ], language: 'c')
+if cc.get_id() == 'gcc' or cc.get_id() == 'clang'
+  add_project_arguments([ '-msse3', '-msse4.1', '-Wno-missing-braces' ], language: 'c')
+endif
 
 lib = library('df', 
               'lib/log.c',
@@ -15,14 +17,21 @@ lib = library('df',
               'lib/image.c',
               'lib/color.c',
               'lib/thin_lense.c',
+              'lib/raytracing.c',
+              'include/defocus/base.h',
+              'include/defocus/camera.h',
+              'include/defocus/defocus.h',
+              'include/defocus/image.h',
+              'include/defocus/intrinsic_helper.h',
+              'include/defocus/models.h',
+              'include/defocus/scene.h',
+              'include/defocus/raytracing.h',
               include_directories: incdir,
-              dependencies: m_dep,
-              version: '0.1.0',
-              soversion: '0')
+              dependencies: m_dep)
 
 # Command Line Executable
 executable('defocus', 'bin/defocus.c', include_directories: incdir, link_with: lib)
 
 # Test driver
-munit_dep = dependency('munit', fallback: ['munit', 'munit_dep'])
-executable('tests', 'tests/tests.c', include_directories: incdir, link_with: lib, dependencies: munit_dep)
\ No newline at end of file
+#munit_dep = dependency('munit', fallback: ['munit', 'munit_dep'])
+#executable('tests', 'tests/tests.c', include_directories: incdir, link_with: lib, dependencies: munit_dep)
\ No newline at end of file