From 583cd9b417bbbda308205cc54237d18e97fde558 Mon Sep 17 00:00:00 2001 From: Kevin Trogant Date: Tue, 9 May 2023 13:04:14 +0200 Subject: [PATCH] test SIMD raytracing --- bin/defocus.c | 25 ++++- include/defocus/base.h | 56 ++++++++---- include/defocus/camera.h | 28 +----- include/defocus/image.h | 14 +-- include/defocus/models.h | 4 +- include/defocus/raytracing.h | 29 ++++++ lib/camera.c | 109 +++++++++++++++++++++- lib/image.c | 14 +-- lib/log.c | 2 +- lib/math.c | 173 ++++++++++++++++++++++++++--------- lib/raytracing.c | 51 +++++++++++ meson.build | 21 +++-- 12 files changed, 410 insertions(+), 116 deletions(-) create mode 100644 include/defocus/raytracing.h create mode 100644 lib/raytracing.c diff --git a/bin/defocus.c b/bin/defocus.c index 23cbe67..07e2ad2 100644 --- a/bin/defocus.c +++ b/bin/defocus.c @@ -3,15 +3,19 @@ #include #include +#include + int pinhole_fn(int argc, char **argv); int thin_lense_fn(int argc, char **argv); +int test_fn(int argc, char **argv); -static const char *_model_names[] = {"pinhole", "thin_lense"}; +static const char *_model_names[] = {"pinhole", "thin_lense", "test"}; typedef int (*model_fn)(int argc, char **argv); static model_fn _model_fns[] = { pinhole_fn, thin_lense_fn, + test_fn, }; void usage(const char *pname) @@ -25,7 +29,6 @@ void usage(const char *pname) int main(int argc, char **argv) { - if (argc < 2) { fprintf(stderr, "Missing model name!\n"); usage(argv[0]); @@ -218,4 +221,22 @@ int thin_lense_fn(int argc, char **argv) df_release_image(out_image); return error_code; +} + +int test_fn( int argc, char **argv ) { + + df_v3 v = {1.f, 2.f, 3.f}; + df_m4 t = df_translate(-1.f, 2.f, 1.5f); + t = df_inverse_transform(t); + + df_v3 tv = df_transform_v3(t, v); + + df_camera_i cam = df_create_perspective_camera(1024.0, 1024.0, 1024.0, 1024.0, 2.e-4f, 1000.f, 350.0, 21.0); + df_ray_packet packet = cam.build_ray_packet(cam.o); + df_evaluate_ray_packet(&packet); + + df_release_ray_packet(&packet); + + cam.release(cam.o); + return 0; } \ No newline at end of file diff --git a/include/defocus/base.h b/include/defocus/base.h index e8fa3fb..dafec37 100644 --- a/include/defocus/base.h +++ b/include/defocus/base.h @@ -38,6 +38,20 @@ typedef enum /** @brief Zero an array */ #define DF_ZERO_ARRAY(a, n) DF_ZERO_MEMORY(a, sizeof((a)[0]) * (n)) +#ifdef _MSC_VER +#ifdef __cplusplus__ +#define DF_API extern "C" __declspec(dllexport) +#else +#define DF_API __declspec(dllexport) +#endif /* _MSC_VER && __cplusplus__ */ +#else +#ifdef __cplusplus__ +#define DF_API extern "C" +#else +#define DF_API +#endif +#endif + /* Simple logging function */ #ifndef DF_ENABLE_LOGGING @@ -69,7 +83,7 @@ enum * @param line line number of the log location * @param fmt format string of the message. */ -void df_log_impl(int level, const char *file, int line, const char *fmt, ...); +DF_API void df_log_impl(int level, const char *file, int line, const char *fmt, ...); #define df_log_verbose(...) df_log_impl(df_log_level_verbose, __FILE__, __LINE__, __VA_ARGS__) #define df_log_info(...) df_log_impl(df_log_level_info, __FILE__, __LINE__, __VA_ARGS__) @@ -112,7 +126,7 @@ typedef union { * @param t interpolation factor * @return a + (b - a) * t */ -df_color df_lerp_color(df_color a, df_color b, double t); +DF_API df_color df_lerp_color(df_color a, df_color b, double t); /** @brief 2d vector */ typedef union { @@ -130,22 +144,22 @@ typedef union { } df_v2; /** @brief Add two 2d vectors */ -df_v2 df_add_v2(df_v2 a, df_v2 b); +DF_API df_v2 df_add_v2(df_v2 a, df_v2 b); /** @brief Subtract two 2d vectors */ -df_v2 df_sub_v2(df_v2 a, df_v2 b); +DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b); /** @brief Calculate the dot product of 2d vectors a and b. */ -float df_dot_v2(df_v2 a, df_v2 b); +DF_API float df_dot_v2(df_v2 a, df_v2 b); /** @brief Multiply a 2d vector with a scalar. */ -df_v2 df_mul_v2(float t, df_v2 v); +DF_API df_v2 df_mul_v2(float t, df_v2 v); /** @brief Returns the normalized version of a 2d vector v */ -df_v2 df_normalize_v2(df_v2 v); +DF_API df_v2 df_normalize_v2(df_v2 v); /** @brief 3d vector */ typedef union { @@ -159,22 +173,22 @@ typedef union { } df_v3; /** @brief Add two 3d vectors */ -df_v3 df_add_v3(df_v3 a, df_v3 b); +DF_API df_v3 df_add_v3(df_v3 a, df_v3 b); /** @brief Subtract two 3d vectors */ -df_v3 df_sub_v3(df_v3 a, df_v3 b); +DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b); /** @brief Calculate the dot product of 3d vectors a and b. */ -float df_dot_v3(df_v3 a, df_v3 b); +DF_API float df_dot_v3(df_v3 a, df_v3 b); /** @brief Multiply a 3d vector with a scalar. */ -df_v3 df_mul_v3(float t, df_v3 v); +DF_API df_v3 df_mul_v3(float t, df_v3 v); /** @brief Returns the normalized version of a 3d vector v */ -df_v3 df_normalize_v3(df_v3 v); +DF_API df_v3 df_normalize_v3(df_v3 v); /** @brief A plane in 3d space. * @@ -221,7 +235,7 @@ typedef struct } df_line_plane_intersection; /** @brief Calculate the intersection between a line and a plane in 3d space */ -df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane); +DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane); /** @brief A 4x4 matrix. * @@ -239,22 +253,26 @@ typedef struct df_m4 #define DF_M4_AT(m, row, col) ((m).e[(row)*4 + (col)]) /** @brief Matrix multiply 4x4 matrix a and b */ -df_m4 df_mul_m4(df_m4 a, df_m4 b); +DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b); /** @brief Get a scale matrix */ -df_m4 df_scale(float x, float y, float z); +DF_API df_m4 df_scale(float x, float y, float z); -df_m4 df_translate(float x, float y, float z); +DF_API df_m4 df_translate(float x, float y, float z); /** @brief Transform (i.e. multiply) a 3d vector v by the transformation matrix T */ -df_v3 df_transform_v3(df_m4 T, df_v3 v); +DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v); /** @brief Calculate the inverse of a non-scaling transform matrix. * * Special fast case. */ -df_m4 df_inverse_transform_no_scale(df_m4 M); +DF_API df_m4 df_inverse_transform_no_scale(df_m4 M); /** @brief Calculate the inverse of a transform matrix */ -df_m4 df_inverse_transform(df_m4 M); +DF_API df_m4 df_inverse_transform(df_m4 M); + +/** @brief Calculate the inverse of a general (invertible) 4x4 matrix */ +DF_API df_m4 df_inverse(df_m4 M); + #endif \ No newline at end of file diff --git a/include/defocus/camera.h b/include/defocus/camera.h index 21a0657..7980cd5 100644 --- a/include/defocus/camera.h +++ b/include/defocus/camera.h @@ -2,34 +2,12 @@ #define DEFOCUS_CAMERA_H #include "base.h" +#include "raytracing.h" /** @file camera.h * @brief basic camera functions */ -/** @brief Stores information for rays in a SIMD friendly way */ -typedef struct -{ - /** Packs all SIMD data into a single buffer */ - float *simd_mem; - - /** Source uvs for rays */ - df_v2 *ray_uvs; - - /* Points into simd_mem */ - float *base_x; - float *base_y; - float *base_z; - float *dir_x; - float *dir_y; - float *dir_z; - - size_t ray_count; -} df_ray_packet; - -/** @brief Free ray packet memory */ -void df_release_ray_packet(df_ray_packet *rays); - /** @brief Interface for cameras. */ typedef struct { @@ -44,7 +22,7 @@ typedef struct void *o; } df_camera_i; -df_camera_i -df_create_perspective_camera(float image_width, float image_height, float raster_width, float raster_height); +DF_API df_camera_i df_create_perspective_camera( + float image_width, float image_height, float raster_width, float raster_height, float far_dist, float near_dist, float focal_dist, float lens_radius); #endif \ No newline at end of file diff --git a/include/defocus/image.h b/include/defocus/image.h index 216ed68..72ce747 100644 --- a/include/defocus/image.h +++ b/include/defocus/image.h @@ -18,7 +18,7 @@ typedef struct df_image df_image; * @param out_image receives the image object * @return error code */ -df_result df_create_image(int w, int h, df_image **out_image); +DF_API df_result df_create_image(int w, int h, df_image **out_image); /** @brief load an image file * @@ -29,34 +29,34 @@ df_result df_create_image(int w, int h, df_image **out_image); * @param out_image receives the image object * @return error code */ -df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image); +DF_API df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image); /** @brief Write an image to a PNG file * @param img the image * @param path the path */ -df_result df_write_image(df_image *img, const char *path); +DF_API df_result df_write_image(df_image *img, const char *path); /** @brief Free an image. * * Any pointer to the image will be invalid after this. * @param img the image */ -void df_release_image(df_image *img); +DF_API void df_release_image(df_image *img); /** @brief Returns the dimensions of the image */ -void df_get_image_size(const df_image *image, int *w, int *h); +DF_API void df_get_image_size(const df_image *image, int *w, int *h); /** @brief Returns the color value at pixel coordinates x, y * * Returns black for coordinates outside the image. */ -df_color df_get_image_pixel(const df_image *image, int x, int y); +DF_API df_color df_get_image_pixel(const df_image *image, int x, int y); /** @brief Set the color value at pixel coordinates x, y. * * Does nothing for coordinates outside the image. */ -void df_set_image_pixel(df_image *image, int x, int y, df_color c); +DF_API void df_set_image_pixel(df_image *image, int x, int y, df_color c); #endif diff --git a/include/defocus/models.h b/include/defocus/models.h index 6f7320a..b172c73 100644 --- a/include/defocus/models.h +++ b/include/defocus/models.h @@ -44,7 +44,7 @@ typedef struct df_pinhole_params * @param in_image input image. * @param out_image the output image. */ -void df_pinhole(df_pinhole_params params, const df_image *in_image, df_image *out_image); +DF_API void df_pinhole(df_pinhole_params params, const df_image *in_image, df_image *out_image); /** Parameters for the thin lense model. */ @@ -72,6 +72,6 @@ typedef struct df_thin_lense_params * @param in_image input image * @param out_image the output image. */ -void df_thin_lense(df_thin_lense_params params, const df_image *in_image, df_image *out_image); +DF_API void df_thin_lense(df_thin_lense_params params, const df_image *in_image, df_image *out_image); #endif diff --git a/include/defocus/raytracing.h b/include/defocus/raytracing.h new file mode 100644 index 0000000..9406173 --- /dev/null +++ b/include/defocus/raytracing.h @@ -0,0 +1,29 @@ +#ifndef DF_RAYTRACING_H +#define DF_RAYTRACING_H + +/** @brief Stores information for rays in a SIMD friendly way */ +typedef struct +{ + /** Packs all SIMD data into a single buffer */ + float *simd_mem; + + /** Source uvs for rays */ + df_v2 *ray_uvs; + + /* Points into simd_mem */ + float *base_x; + float *base_y; + float *base_z; + float *dir_x; + float *dir_y; + float *dir_z; + + size_t ray_count; +} df_ray_packet; + +/** @brief Free ray packet memory */ +DF_API void df_release_ray_packet(df_ray_packet *rays); + +DF_API void df_evaluate_ray_packet(const df_ray_packet *rays); + +#endif diff --git a/lib/camera.c b/lib/camera.c index 988951e..6790920 100644 --- a/lib/camera.c +++ b/lib/camera.c @@ -1,6 +1,12 @@ #include #include +#include +#include + +#ifdef _MSC_VER +#include +#endif /* ******************************************************** * @@ -8,7 +14,7 @@ * * ********************************************************/ -void df_release_ray_packet(df_ray_packet *rays) +DF_API void df_release_ray_packet(df_ray_packet *rays) { free(rays->simd_mem); free(rays->ray_uvs); @@ -26,18 +32,95 @@ typedef struct { float focal_dist; float lens_radius; + + /* Raster space, i.e. the space for which we generate rays */ + float raster_width; + float raster_height; + + /* Image(=screen) space, i.e. the space in which we store pixels */ + float image_width; + float image_height; df_m4 screen_to_raster; df_m4 raster_to_screen; df_m4 raster_to_camera; - } df_perspective_camera; static void pc_release(void *o) { df_perspective_camera *camera = o; } -static df_ray_packet pc_build_ray_packet(void *o) { df_perspective_camera *camera = o; } +static df_ray_packet pc_build_ray_packet(void *o) +{ + df_perspective_camera *camera = o; + df_ray_packet packet; + memset(&packet, 0, sizeof(packet)); -df_camera_i df_create_perspective_camera(float image_width, float image_height, float raster_width, float raster_height) + /* Generate 1 ray per pixel (ignore lens for now) */ + size_t count = (size_t)camera->raster_width * (size_t)camera->raster_height; + + size_t alloc_count = count; + /* Round up to nearest multiple of 4, for SSE. + * Also satisfies 16 byte alignment (assuming simd_mem is 16 byte aligned) + * because 4 * sizeof(float) = 16. + */ + if ((alloc_count % 4) != 0) { + alloc_count = ((alloc_count + 3) / 4) * 4; + } + +#ifdef _MSC_VER + packet.simd_mem = _aligned_malloc(sizeof(float) * alloc_count * 6, 16); +#elif defined(_ISOC11_SOURCE) /* Feature test macro for GCC */ + packet.simd_mem = aligned_alloc(16, sizeof(float) * alloc_count * 6); +#else + /* Fall back to regular malloc and hope for the best */ + packet.simd_mem = malloc(sizeof(float) * alloc_count * 6); +#endif + + packet.base_x = packet.simd_mem; + packet.base_y = packet.base_x + alloc_count; + packet.base_z = packet.base_y + alloc_count; + packet.dir_x = packet.base_z + alloc_count; + packet.dir_y = packet.dir_x + alloc_count; + packet.dir_z = packet.dir_y + alloc_count; + + packet.ray_uvs = malloc(sizeof(df_v2) * count); + + packet.ray_count = count; + + size_t i = 0; + for (float y = 0; y < camera->raster_height; y += 1.f) { + for (float x = 0; x < camera->raster_width; x += 1.f) { + packet.base_x[i] = 0.f; + packet.base_y[i] = 0.f; + packet.base_z[i] = 0.f; + + df_v3 raster_p = {x, y, 0.f}; + df_v3 camera_p = df_transform_v3(camera->raster_to_camera, raster_p); + df_v3 dir = df_normalize_v3(camera_p); + + packet.dir_x[i] = dir.x; + packet.dir_y[i] = dir.y; + packet.dir_z[i] = dir.z; + + df_v3 img_p = df_transform_v3(camera->raster_to_screen, raster_p); + packet.ray_uvs[i].u = img_p.x / camera->image_width; + packet.ray_uvs[i].v = img_p.y / camera->image_height; + + ++i; + assert(i <= count); + } + } + + return packet; +} + +DF_API df_camera_i df_create_perspective_camera(float image_width, + float image_height, + float raster_width, + float raster_height, + float far_dist, + float near_dist, + float focal_dist, + float lens_radius) { df_perspective_camera *camera = malloc(sizeof(*camera)); @@ -45,9 +128,25 @@ df_camera_i df_create_perspective_camera(float image_width, float image_height, camera->screen_to_raster = df_mul_m4(camera->screen_to_raster, df_scale(1.f / image_width, -1.f / image_height, 1.f)); camera->screen_to_raster = df_mul_m4(camera->screen_to_raster, df_translate(0.f, -image_height, 0.f)); - camera->raster_to_screen = df_inverse_transform(camera->screen_to_raster); + /* Perspective projection matrix */ + df_m4 persp = {0.f}; + DF_M4_AT(persp, 0, 0) = 1.f; + DF_M4_AT(persp, 1, 1) = 1.f; + DF_M4_AT(persp, 2, 2) = far_dist / (far_dist - near_dist); + DF_M4_AT(persp, 2, 3) = -1.f * far_dist * near_dist / (far_dist - near_dist); + DF_M4_AT(persp, 3, 2) = 1.f; + + camera->raster_to_camera = df_mul_m4(df_inverse(persp), camera->raster_to_screen); + + camera->focal_dist = focal_dist; + camera->lens_radius = lens_radius; + camera->raster_width = raster_width; + camera->raster_height = raster_height; + camera->image_width = image_width; + camera->image_height = image_height; + df_camera_i iface = {.release = pc_release, .build_ray_packet = pc_build_ray_packet, .o = camera}; return iface; } diff --git a/lib/image.c b/lib/image.c index 46570ed..84f59b2 100644 --- a/lib/image.c +++ b/lib/image.c @@ -17,7 +17,7 @@ struct df_image uint8_t *pixels; }; -df_result df_create_image(int w, int h, df_image **out_image) +DF_API df_result df_create_image(int w, int h, df_image **out_image) { df_image *img = malloc(sizeof(df_image)); if (!img) @@ -37,7 +37,7 @@ df_result df_create_image(int w, int h, df_image **out_image) return df_result_success; } -df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image) +DF_API df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image) { int w, h, c; stbi_uc *pixels = stbi_load(path, &w, &h, &c, 4); @@ -69,7 +69,7 @@ out: return res; } -void df_release_image(df_image *img) +DF_API void df_release_image(df_image *img) { if (img) { free(img->pixels); @@ -77,7 +77,7 @@ void df_release_image(df_image *img) } } -df_result df_write_image(df_image *image, const char *path) +DF_API df_result df_write_image(df_image *image, const char *path) { df_result res = stbi_write_png(path, image->width, image->height, 4, image->pixels, image->width * 4) ? df_result_success @@ -85,7 +85,7 @@ df_result df_write_image(df_image *image, const char *path) return res; } -void df_get_image_size(const df_image *image, int *w, int *h) +DF_API void df_get_image_size(const df_image *image, int *w, int *h) { if (w) *w = image->width; @@ -93,7 +93,7 @@ void df_get_image_size(const df_image *image, int *w, int *h) *h = image->height; } -df_color df_get_image_pixel(const df_image *image, int x, int y) +DF_API df_color df_get_image_pixel(const df_image *image, int x, int y) { df_color c = {0, 0, 0, 255}; if (x >= 0 && x < image->width && y >= 0 && y < image->height) { @@ -102,7 +102,7 @@ df_color df_get_image_pixel(const df_image *image, int x, int y) return c; } -void df_set_image_pixel(df_image *image, int x, int y, df_color c) +DF_API void df_set_image_pixel(df_image *image, int x, int y, df_color c) { if (x >= 0 && x < image->width && y >= 0 && y < image->height) { memcpy(&image->pixels[4 * (y * image->width + x)], &c.e[0], 4); diff --git a/lib/log.c b/lib/log.c index 8032c70..7d54533 100644 --- a/lib/log.c +++ b/lib/log.c @@ -4,7 +4,7 @@ static const char *log_level_names[] = {"VERBOSE", "INFO", "WARN", "ERROR"}; -void df_log_impl(int level, const char *file, int line, const char *fmt, ...) +DF_API void df_log_impl(int level, const char *file, int line, const char *fmt, ...) { va_list ap; va_start(ap, fmt); diff --git a/lib/math.c b/lib/math.c index 204d85b..5d1017d 100644 --- a/lib/math.c +++ b/lib/math.c @@ -5,27 +5,27 @@ #include #include -df_v2 df_add_v2(df_v2 a, df_v2 b) +DF_API df_v2 df_add_v2(df_v2 a, df_v2 b) { df_v2 v = {a.x + b.x, a.y + b.y}; return v; } -df_v2 df_sub_v2(df_v2 a, df_v2 b) +DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b) { df_v2 v = {a.x - b.x, a.y - b.y}; return v; } -float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; } +DF_API float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; } -df_v2 df_mul_v2(float t, df_v2 v) +DF_API df_v2 df_mul_v2(float t, df_v2 v) { df_v2 r = {t * v.x, t * v.y}; return r; } -df_v2 df_normalize_v2(df_v2 v) +DF_API df_v2 df_normalize_v2(df_v2 v) { float len_square = df_dot_v2(v, v); float len = sqrtf(len_square); @@ -33,27 +33,27 @@ df_v2 df_normalize_v2(df_v2 v) return n; } -df_v3 df_add_v3(df_v3 a, df_v3 b) +DF_API df_v3 df_add_v3(df_v3 a, df_v3 b) { df_v3 v = {a.x + b.x, a.y + b.y, a.z + b.z}; return v; } -df_v3 df_sub_v3(df_v3 a, df_v3 b) +DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b) { df_v3 v = {a.x - b.x, a.y - b.y, a.z - b.z}; return v; } -float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } +DF_API float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } -df_v3 df_mul_v3(float t, df_v3 v) +DF_API df_v3 df_mul_v3(float t, df_v3 v) { df_v3 r = {t * v.x, t * v.y, t * v.z}; return r; } -df_v3 df_normalize_v3(df_v3 v) +DF_API df_v3 df_normalize_v3(df_v3 v) { float len_square = df_dot_v3(v, v); float len = sqrtf(len_square); @@ -61,7 +61,7 @@ df_v3 df_normalize_v3(df_v3 v) return n; } -df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane) +DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane) { /* check case */ float dot = df_dot_v3(line.direction, plane.normal); @@ -90,7 +90,7 @@ df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plan } } -df_m4 df_mul_m4(df_m4 a, df_m4 b) +DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b) { /* Super simple, we could probably do it a lot better via SIMD. */ df_m4 p; @@ -105,7 +105,7 @@ df_m4 df_mul_m4(df_m4 a, df_m4 b) return p; } -df_m4 df_scale(float x, float y, float z) +DF_API df_m4 df_scale(float x, float y, float z) { /* clang-format off */ df_m4 s = {{ @@ -118,7 +118,7 @@ df_m4 df_scale(float x, float y, float z) return s; } -df_m4 df_translate(float x, float y, float z) +DF_API df_m4 df_translate(float x, float y, float z) { /* clang-format off */ df_m4 t = {{ @@ -153,7 +153,7 @@ static float hsum(__m128 v) return _mm_cvtss_f32(sum); } -df_v3 df_transform_v3(df_m4 T, df_v3 v) +DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v) { df_v3 transf; _Alignas(16) float tmp_v[4] = {v.x, v.y, v.z, 1.f}; @@ -182,56 +182,145 @@ df_v3 df_transform_v3(df_m4 T, df_v3 v) * | R T | * | 0 1 | */ -df_m4 df_inverse_transform_no_scale(df_m4 M) +DF_API df_m4 df_inverse_transform_no_scale(df_m4 M) { df_m4 I = {0.f}; - /* transpose 3x3, we know that m03 = m13 = m23 = 0 */ + /* transpose 3x3 */ __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */ __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */ I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */ I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */ I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */ - /* last */ - I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0)); - I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1))); - I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2))); - I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]); + __m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f); + DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t)); + + DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t)); + DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t)); + I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f); return I; } -df_m4 df_inverse_transform(df_m4 M) +DF_API df_m4 df_inverse_transform(df_m4 M) { -#define SMALL_NUMBER (1.e-8f) - df_m4 I = {0.f}; - /* transpose 3x3, we know that m03 = m13 = m23 = 0 */ + /* transpose 3x3 */ __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */ __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */ I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */ I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */ I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */ - /* divide by the squared scale */ - __m128 size_sqr = _mm_mul_ps(I.vec[0], I.vec[0]); - size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[1], I.vec[1])); - size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[2], I.vec[2])); + __m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f); + DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t)); - __m128 r_size_sqr = _mm_div_ps(_mm_set1_ps(1.f), size_sqr); - - I.vec[0] = _mm_mul_ps(I.vec[0], r_size_sqr); - I.vec[1] = _mm_mul_ps(I.vec[1], r_size_sqr); - I.vec[2] = _mm_mul_ps(I.vec[2], r_size_sqr); - - /* last */ - I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0)); - I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1))); - I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2))); - I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]); + DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t)); + DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t)); + I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f); return I; -#undef SMALL_NUMBER +} + + + +// for row major matrix +// we use __m128 to represent 2x2 matrix as A = | A0 A1 | +// | A2 A3 | +// 2x2 row major Matrix multiply A*B +static __forceinline __m128 mat2_mul(__m128 vec1, __m128 vec2) +{ + return _mm_add_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 0, 3, 0, 3)), + _mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1))); +} +// 2x2 row major Matrix adjugate multiply (A#)*B +static __forceinline __m128 mat2_adj_mul(__m128 vec1, __m128 vec2) +{ + return _mm_sub_ps(_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 3, 3, 0, 0), vec2), + _mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 1, 2, 2), DF_VEC_SWIZZLE(vec2, 2, 3, 0, 1))); +} +// 2x2 row major Matrix multiply adjugate A*(B#) +static __forceinline __m128 mat2_mul_adj(__m128 vec1, __m128 vec2) +{ + return _mm_sub_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 3, 0, 3, 0)), + _mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1))); +} + +DF_API df_m4 df_inverse( df_m4 M ) { + // use block matrix method + // A is a matrix, then i(A) or iA means inverse of A, A# (or A_ in code) means adjugate of A, |A| (or detA in code) + // is determinant, tr(A) is trace + + // sub matrices + __m128 A = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); + __m128 B = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); + __m128 C = DF_VEC_SHUFFLE_0101(M.vec[2], M.vec[3]); + __m128 D = DF_VEC_SHUFFLE_2323(M.vec[2], M.vec[3]); + +#if 0 + __m128 detA = _mm_set1_ps(M.m[0][0] * M.m[1][1] - M.m[0][1] * M.m[1][0]); + __m128 detB = _mm_set1_ps(M.m[0][2] * M.m[1][3] - M.m[0][3] * M.m[1][2]); + __m128 detC = _mm_set1_ps(M.m[2][0] * M.m[3][1] - M.m[2][1] * M.m[3][0]); + __m128 detD = _mm_set1_ps(M.m[2][2] * M.m[3][3] - M.m[2][3] * M.m[3][2]); +#else + // determinant as (|A| |B| |C| |D|) + __m128 detSub = _mm_sub_ps( + _mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 0, 2, 0, 2), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 1, 3, 1, 3)), + _mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 1, 3, 1, 3), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 0, 2, 0, 2))); + __m128 detA = DF_VEC_SWIZZLE1(detSub, 0); + __m128 detB = DF_VEC_SWIZZLE1(detSub, 1); + __m128 detC = DF_VEC_SWIZZLE1(detSub, 2); + __m128 detD = DF_VEC_SWIZZLE1(detSub, 3); +#endif + + // let iM = 1/|M| * | X Y | + // | Z W | + + // D#C + __m128 D_C = mat2_adj_mul(D, C); + // A#B + __m128 A_B = mat2_adj_mul(A, B); + // X# = |D|A - B(D#C) + __m128 X_ = _mm_sub_ps(_mm_mul_ps(detD, A), mat2_mul(B, D_C)); + // W# = |A|D - C(A#B) + __m128 W_ = _mm_sub_ps(_mm_mul_ps(detA, D), mat2_mul(C, A_B)); + + // |M| = |A|*|D| + ... (continue later) + __m128 detM = _mm_mul_ps(detA, detD); + + // Y# = |B|C - D(A#B)# + __m128 Y_ = _mm_sub_ps(_mm_mul_ps(detB, C), mat2_mul_adj(D, A_B)); + // Z# = |C|B - A(D#C)# + __m128 Z_ = _mm_sub_ps(_mm_mul_ps(detC, B), mat2_mul_adj(A, D_C)); + + // |M| = |A|*|D| + |B|*|C| ... (continue later) + detM = _mm_add_ps(detM, _mm_mul_ps(detB, detC)); + + // tr((A#B)(D#C)) + __m128 tr = _mm_mul_ps(A_B, DF_VEC_SWIZZLE(D_C, 0, 2, 1, 3)); + tr = _mm_hadd_ps(tr, tr); + tr = _mm_hadd_ps(tr, tr); + // |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C) + detM = _mm_sub_ps(detM, tr); + + const __m128 adjSignMask = _mm_setr_ps(1.f, -1.f, -1.f, 1.f); + // (1/|M|, -1/|M|, -1/|M|, 1/|M|) + __m128 rDetM = _mm_div_ps(adjSignMask, detM); + + X_ = _mm_mul_ps(X_, rDetM); + Y_ = _mm_mul_ps(Y_, rDetM); + Z_ = _mm_mul_ps(Z_, rDetM); + W_ = _mm_mul_ps(W_, rDetM); + + df_m4 r; + + // apply adjugate and store, here we combine adjugate shuffle and store shuffle + r.vec[0] = DF_VEC_SHUFFLE(X_, Y_, 3, 1, 3, 1); + r.vec[1] = DF_VEC_SHUFFLE(X_, Y_, 2, 0, 2, 0); + r.vec[2] = DF_VEC_SHUFFLE(Z_, W_, 3, 1, 3, 1); + r.vec[3] = DF_VEC_SHUFFLE(Z_, W_, 2, 0, 2, 0); + + return r; } diff --git a/lib/raytracing.c b/lib/raytracing.c new file mode 100644 index 0000000..79901d0 --- /dev/null +++ b/lib/raytracing.c @@ -0,0 +1,51 @@ +#include + +#include + +DF_API void df_evaluate_ray_packet(const df_ray_packet *rays) { + const __m128 *base_x = (const __m128 *)rays->base_x; + const __m128 *base_y = (const __m128 *)rays->base_y; + const __m128 *base_z = (const __m128 *)rays->base_z; + + const __m128 *dir_x = (const __m128 *)rays->dir_x; + const __m128 *dir_y = (const __m128 *)rays->dir_y; + const __m128 *dir_z = (const __m128 *)rays->dir_z; + + /* Simple test: Let rays intersect with plane at z = 350.0 */ + + float PLANE_Z = 350.0f; + + __m128 plane_z = _mm_set1_ps(PLANE_Z); + + size_t ray_count = rays->ray_count; + + /* TODO(kevin): divide to multiple threads */ + for (size_t i = 0; i < ray_count; i += 4) { + __m128 rays_base_x = base_x[i]; + __m128 rays_base_y = base_y[i]; + __m128 rays_base_z = base_z[i]; + __m128 rays_dir_x = dir_x[i]; + __m128 rays_dir_y = dir_y[i]; + __m128 rays_dir_z = dir_z[i]; + + /* Solve for t: base.z + t * dir.z = plane_z + * t = (plane_z - base.z) / dir.z + */ + __m128 delta = _mm_sub_ps(plane_z, rays_base_z); + __m128 t = _mm_div_ps(delta, rays_dir_z); + + /* Sample p = base.z + t * dir */ + __m128 sample_p_x = _mm_mul_ps(t, rays_dir_x); + __m128 sample_p_y = _mm_mul_ps(t, rays_dir_y); + __m128 sample_p_z = _mm_mul_ps(t, rays_dir_z); + sample_p_x = _mm_add_ps(sample_p_x, rays_base_x); + sample_p_y = _mm_add_ps(sample_p_y, rays_base_y); + sample_p_z = _mm_add_ps(sample_p_z, rays_base_z); + + } + + /* Handle remaining (< 4) rays */ + if ((ray_count % 4) != 0) { + + } +} \ No newline at end of file diff --git a/meson.build b/meson.build index 47460e2..c5dd12a 100644 --- a/meson.build +++ b/meson.build @@ -5,7 +5,9 @@ incdir = include_directories('include', '3p') cc = meson.get_compiler('c') m_dep = cc.find_library('m', required: false) -add_project_arguments([ '-msse3', '-Wno-missing-braces' ], language: 'c') +if cc.get_id() == 'gcc' or cc.get_id() == 'clang' + add_project_arguments([ '-msse3', '-msse4.1', '-Wno-missing-braces' ], language: 'c') +endif lib = library('df', 'lib/log.c', @@ -15,14 +17,21 @@ lib = library('df', 'lib/image.c', 'lib/color.c', 'lib/thin_lense.c', + 'lib/raytracing.c', + 'include/defocus/base.h', + 'include/defocus/camera.h', + 'include/defocus/defocus.h', + 'include/defocus/image.h', + 'include/defocus/intrinsic_helper.h', + 'include/defocus/models.h', + 'include/defocus/scene.h', + 'include/defocus/raytracing.h', include_directories: incdir, - dependencies: m_dep, - version: '0.1.0', - soversion: '0') + dependencies: m_dep) # Command Line Executable executable('defocus', 'bin/defocus.c', include_directories: incdir, link_with: lib) # Test driver -munit_dep = dependency('munit', fallback: ['munit', 'munit_dep']) -executable('tests', 'tests/tests.c', include_directories: incdir, link_with: lib, dependencies: munit_dep) \ No newline at end of file +#munit_dep = dependency('munit', fallback: ['munit', 'munit_dep']) +#executable('tests', 'tests/tests.c', include_directories: incdir, link_with: lib, dependencies: munit_dep) \ No newline at end of file