test SIMD raytracing

This commit is contained in:
Kevin Trogant 2023-05-09 13:04:14 +02:00
parent 2c98040c42
commit 583cd9b417
12 changed files with 410 additions and 116 deletions

View File

@ -3,15 +3,19 @@
#include <stdlib.h>
#include <defocus/defocus.h>
#include <defocus/camera.h>
int pinhole_fn(int argc, char **argv);
int thin_lense_fn(int argc, char **argv);
int test_fn(int argc, char **argv);
static const char *_model_names[] = {"pinhole", "thin_lense"};
static const char *_model_names[] = {"pinhole", "thin_lense", "test"};
typedef int (*model_fn)(int argc, char **argv);
static model_fn _model_fns[] = {
pinhole_fn,
thin_lense_fn,
test_fn,
};
void usage(const char *pname)
@ -25,7 +29,6 @@ void usage(const char *pname)
int main(int argc, char **argv)
{
if (argc < 2) {
fprintf(stderr, "Missing model name!\n");
usage(argv[0]);
@ -219,3 +222,21 @@ int thin_lense_fn(int argc, char **argv)
return error_code;
}
/** @brief Smoke test for the transform helpers and the SIMD ray packet path.
 *
 * Builds a perspective camera, generates its ray packet, evaluates the packet
 * and releases everything again. Nothing is printed or written.
 *
 * @param argc unused
 * @param argv unused
 * @return always 0
 */
int test_fn(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    /* Exercise translate -> inverse -> transform; the result is only useful
     * when inspected in a debugger. */
    df_v3 v = {1.f, 2.f, 3.f};
    df_m4 t = df_translate(-1.f, 2.f, 1.5f);
    t = df_inverse_transform(t);
    df_v3 tv = df_transform_v3(t, v);
    (void)tv;

    /* df_create_perspective_camera takes far_dist BEFORE near_dist, so the
     * far plane (1000) has to be passed first and the near plane (2e-4) second. */
    const float far_dist = 1000.f;
    const float near_dist = 2.e-4f;
    df_camera_i cam =
        df_create_perspective_camera(1024.f, 1024.f, 1024.f, 1024.f, far_dist, near_dist, 350.f, 21.f);

    df_ray_packet packet = cam.build_ray_packet(cam.o);
    df_evaluate_ray_packet(&packet);
    df_release_ray_packet(&packet);
    cam.release(cam.o);

    return 0;
}

View File

@ -38,6 +38,20 @@ typedef enum
/** @brief Zero an array */
#define DF_ZERO_ARRAY(a, n) DF_ZERO_MEMORY(a, sizeof((a)[0]) * (n))
/* DF_API decorates every public function of the library:
 *  - C linkage when the header is consumed by a C++ compiler
 *    (the standard predefined macro is __cplusplus, without trailing underscores),
 *  - __declspec(dllexport) when compiling with MSVC.
 */
#ifdef _MSC_VER
#ifdef __cplusplus
#define DF_API extern "C" __declspec(dllexport)
#else
#define DF_API __declspec(dllexport)
#endif /* __cplusplus */
#else
#ifdef __cplusplus
#define DF_API extern "C"
#else
#define DF_API
#endif /* __cplusplus */
#endif /* _MSC_VER */
/* Simple logging function */
#ifndef DF_ENABLE_LOGGING
@ -69,7 +83,7 @@ enum
* @param line line number of the log location
* @param fmt format string of the message.
*/
void df_log_impl(int level, const char *file, int line, const char *fmt, ...);
DF_API void df_log_impl(int level, const char *file, int line, const char *fmt, ...);
#define df_log_verbose(...) df_log_impl(df_log_level_verbose, __FILE__, __LINE__, __VA_ARGS__)
#define df_log_info(...) df_log_impl(df_log_level_info, __FILE__, __LINE__, __VA_ARGS__)
@ -112,7 +126,7 @@ typedef union {
* @param t interpolation factor
* @return a + (b - a) * t
*/
df_color df_lerp_color(df_color a, df_color b, double t);
DF_API df_color df_lerp_color(df_color a, df_color b, double t);
/** @brief 2d vector */
typedef union {
@ -130,22 +144,22 @@ typedef union {
} df_v2;
/** @brief Add two 2d vectors */
df_v2 df_add_v2(df_v2 a, df_v2 b);
DF_API df_v2 df_add_v2(df_v2 a, df_v2 b);
/** @brief Subtract two 2d vectors */
df_v2 df_sub_v2(df_v2 a, df_v2 b);
DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b);
/** @brief Calculate the dot product of 2d vectors a and b.
*/
float df_dot_v2(df_v2 a, df_v2 b);
DF_API float df_dot_v2(df_v2 a, df_v2 b);
/** @brief Multiply a 2d vector with a scalar.
*/
df_v2 df_mul_v2(float t, df_v2 v);
DF_API df_v2 df_mul_v2(float t, df_v2 v);
/** @brief Returns the normalized version of a 2d vector v
*/
df_v2 df_normalize_v2(df_v2 v);
DF_API df_v2 df_normalize_v2(df_v2 v);
/** @brief 3d vector */
typedef union {
@ -159,22 +173,22 @@ typedef union {
} df_v3;
/** @brief Add two 3d vectors */
df_v3 df_add_v3(df_v3 a, df_v3 b);
DF_API df_v3 df_add_v3(df_v3 a, df_v3 b);
/** @brief Subtract two 3d vectors */
df_v3 df_sub_v3(df_v3 a, df_v3 b);
DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b);
/** @brief Calculate the dot product of 3d vectors a and b.
*/
float df_dot_v3(df_v3 a, df_v3 b);
DF_API float df_dot_v3(df_v3 a, df_v3 b);
/** @brief Multiply a 3d vector with a scalar.
*/
df_v3 df_mul_v3(float t, df_v3 v);
DF_API df_v3 df_mul_v3(float t, df_v3 v);
/** @brief Returns the normalized version of a 3d vector v
*/
df_v3 df_normalize_v3(df_v3 v);
DF_API df_v3 df_normalize_v3(df_v3 v);
/** @brief A plane in 3d space.
*
@ -221,7 +235,7 @@ typedef struct
} df_line_plane_intersection;
/** @brief Calculate the intersection between a line and a plane in 3d space */
df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane);
DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane);
/** @brief A 4x4 matrix.
*
@ -239,22 +253,26 @@ typedef struct df_m4
#define DF_M4_AT(m, row, col) ((m).e[(row)*4 + (col)])
/** @brief Matrix multiply 4x4 matrix a and b */
df_m4 df_mul_m4(df_m4 a, df_m4 b);
DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b);
/** @brief Get a scale matrix */
df_m4 df_scale(float x, float y, float z);
DF_API df_m4 df_scale(float x, float y, float z);
df_m4 df_translate(float x, float y, float z);
DF_API df_m4 df_translate(float x, float y, float z);
/** @brief Transform (i.e. multiply) a 3d vector v by the transformation matrix T */
df_v3 df_transform_v3(df_m4 T, df_v3 v);
DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v);
/** @brief Calculate the inverse of a non-scaling transform matrix.
*
* Special fast case.
*/
df_m4 df_inverse_transform_no_scale(df_m4 M);
DF_API df_m4 df_inverse_transform_no_scale(df_m4 M);
/** @brief Calculate the inverse of a transform matrix */
df_m4 df_inverse_transform(df_m4 M);
DF_API df_m4 df_inverse_transform(df_m4 M);
/** @brief Calculate the inverse of a general (invertible) 4x4 matrix */
DF_API df_m4 df_inverse(df_m4 M);
#endif

View File

@ -2,34 +2,12 @@
#define DEFOCUS_CAMERA_H
#include "base.h"
#include "raytracing.h"
/** @file camera.h
* @brief basic camera functions
*/
/** @brief Stores information for rays in a SIMD friendly way */
typedef struct
{
/** Packs all SIMD data into a single buffer */
float *simd_mem;
/** Source uvs for rays */
df_v2 *ray_uvs;
/* Points into simd_mem */
float *base_x;
float *base_y;
float *base_z;
float *dir_x;
float *dir_y;
float *dir_z;
size_t ray_count;
} df_ray_packet;
/** @brief Free ray packet memory */
void df_release_ray_packet(df_ray_packet *rays);
/** @brief Interface for cameras. */
typedef struct
{
@ -44,7 +22,7 @@ typedef struct
void *o;
} df_camera_i;
df_camera_i
df_create_perspective_camera(float image_width, float image_height, float raster_width, float raster_height);
DF_API df_camera_i df_create_perspective_camera(
float image_width, float image_height, float raster_width, float raster_height, float far_dist, float near_dist, float focal_dist, float lens_radius);
#endif

View File

@ -18,7 +18,7 @@ typedef struct df_image df_image;
* @param out_image receives the image object
* @return error code
*/
df_result df_create_image(int w, int h, df_image **out_image);
DF_API df_result df_create_image(int w, int h, df_image **out_image);
/** @brief load an image file
*
@ -29,34 +29,34 @@ df_result df_create_image(int w, int h, df_image **out_image);
* @param out_image receives the image object
* @return error code
*/
df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image);
DF_API df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image);
/** @brief Write an image to a PNG file
* @param img the image
* @param path the path
*/
df_result df_write_image(df_image *img, const char *path);
DF_API df_result df_write_image(df_image *img, const char *path);
/** @brief Free an image.
*
* Any pointer to the image will be invalid after this.
* @param img the image
*/
void df_release_image(df_image *img);
DF_API void df_release_image(df_image *img);
/** @brief Returns the dimensions of the image */
void df_get_image_size(const df_image *image, int *w, int *h);
DF_API void df_get_image_size(const df_image *image, int *w, int *h);
/** @brief Returns the color value at pixel coordinates x, y
*
* Returns black for coordinates outside the image.
*/
df_color df_get_image_pixel(const df_image *image, int x, int y);
DF_API df_color df_get_image_pixel(const df_image *image, int x, int y);
/** @brief Set the color value at pixel coordinates x, y.
*
* Does nothing for coordinates outside the image.
*/
void df_set_image_pixel(df_image *image, int x, int y, df_color c);
DF_API void df_set_image_pixel(df_image *image, int x, int y, df_color c);
#endif

View File

@ -44,7 +44,7 @@ typedef struct df_pinhole_params
* @param in_image input image.
* @param out_image the output image.
*/
void df_pinhole(df_pinhole_params params, const df_image *in_image, df_image *out_image);
DF_API void df_pinhole(df_pinhole_params params, const df_image *in_image, df_image *out_image);
/** Parameters for the thin lense model.
*/
@ -72,6 +72,6 @@ typedef struct df_thin_lense_params
* @param in_image input image
* @param out_image the output image.
*/
void df_thin_lense(df_thin_lense_params params, const df_image *in_image, df_image *out_image);
DF_API void df_thin_lense(df_thin_lense_params params, const df_image *in_image, df_image *out_image);
#endif

View File

@ -0,0 +1,29 @@
#ifndef DF_RAYTRACING_H
#define DF_RAYTRACING_H
/** @brief Stores information for rays in a SIMD friendly way */
typedef struct
{
/** Packs all SIMD data into a single buffer */
float *simd_mem;
/** Source uvs for rays */
df_v2 *ray_uvs;
/* Points into simd_mem */
float *base_x;
float *base_y;
float *base_z;
float *dir_x;
float *dir_y;
float *dir_z;
size_t ray_count;
} df_ray_packet;
/** @brief Free ray packet memory */
DF_API void df_release_ray_packet(df_ray_packet *rays);
DF_API void df_evaluate_ray_packet(const df_ray_packet *rays);
#endif

View File

@ -1,6 +1,12 @@
#include <defocus/camera.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#ifdef _MSC_VER
#include <malloc.h>
#endif
/* ********************************************************
*
@ -8,7 +14,7 @@
*
* ********************************************************/
void df_release_ray_packet(df_ray_packet *rays)
DF_API void df_release_ray_packet(df_ray_packet *rays)
{
free(rays->simd_mem);
free(rays->ray_uvs);
@ -27,17 +33,94 @@ typedef struct
float focal_dist;
float lens_radius;
/* Raster space, i.e. the space for which we generate rays */
float raster_width;
float raster_height;
/* Image(=screen) space, i.e. the space in which we store pixels */
float image_width;
float image_height;
df_m4 screen_to_raster;
df_m4 raster_to_screen;
df_m4 raster_to_camera;
} df_perspective_camera;
/** @brief Release a perspective camera.
 *
 * The camera object is a single heap block obtained from malloc() in
 * df_create_perspective_camera() and handed out as the `o` member of the
 * camera interface, so releasing it is a plain free().
 *
 * @param o the camera object; NULL is accepted and ignored
 */
static void pc_release(void *o)
{
    free(o);
}
static df_ray_packet pc_build_ray_packet(void *o) { df_perspective_camera *camera = o; }
static df_ray_packet pc_build_ray_packet(void *o)
{
df_perspective_camera *camera = o;
df_ray_packet packet;
memset(&packet, 0, sizeof(packet));
df_camera_i df_create_perspective_camera(float image_width, float image_height, float raster_width, float raster_height)
/* Generate 1 ray per pixel (ignore lens for now) */
size_t count = (size_t)camera->raster_width * (size_t)camera->raster_height;
size_t alloc_count = count;
/* Round up to nearest multiple of 4, for SSE.
* Also satisfies 16 byte alignment (assuming simd_mem is 16 byte aligned)
* because 4 * sizeof(float) = 16.
*/
if ((alloc_count % 4) != 0) {
alloc_count = ((alloc_count + 3) / 4) * 4;
}
#ifdef _MSC_VER
packet.simd_mem = _aligned_malloc(sizeof(float) * alloc_count * 6, 16);
#elif defined(_ISOC11_SOURCE) /* Feature test macro for GCC */
packet.simd_mem = aligned_alloc(16, sizeof(float) * alloc_count * 6);
#else
/* Fall back to regular malloc and hope for the best */
packet.simd_mem = malloc(sizeof(float) * alloc_count * 6);
#endif
packet.base_x = packet.simd_mem;
packet.base_y = packet.base_x + alloc_count;
packet.base_z = packet.base_y + alloc_count;
packet.dir_x = packet.base_z + alloc_count;
packet.dir_y = packet.dir_x + alloc_count;
packet.dir_z = packet.dir_y + alloc_count;
packet.ray_uvs = malloc(sizeof(df_v2) * count);
packet.ray_count = count;
size_t i = 0;
for (float y = 0; y < camera->raster_height; y += 1.f) {
for (float x = 0; x < camera->raster_width; x += 1.f) {
packet.base_x[i] = 0.f;
packet.base_y[i] = 0.f;
packet.base_z[i] = 0.f;
df_v3 raster_p = {x, y, 0.f};
df_v3 camera_p = df_transform_v3(camera->raster_to_camera, raster_p);
df_v3 dir = df_normalize_v3(camera_p);
packet.dir_x[i] = dir.x;
packet.dir_y[i] = dir.y;
packet.dir_z[i] = dir.z;
df_v3 img_p = df_transform_v3(camera->raster_to_screen, raster_p);
packet.ray_uvs[i].u = img_p.x / camera->image_width;
packet.ray_uvs[i].v = img_p.y / camera->image_height;
++i;
assert(i <= count);
}
}
return packet;
}
DF_API df_camera_i df_create_perspective_camera(float image_width,
float image_height,
float raster_width,
float raster_height,
float far_dist,
float near_dist,
float focal_dist,
float lens_radius)
{
df_perspective_camera *camera = malloc(sizeof(*camera));
@ -45,9 +128,25 @@ df_camera_i df_create_perspective_camera(float image_width, float image_height,
camera->screen_to_raster =
df_mul_m4(camera->screen_to_raster, df_scale(1.f / image_width, -1.f / image_height, 1.f));
camera->screen_to_raster = df_mul_m4(camera->screen_to_raster, df_translate(0.f, -image_height, 0.f));
camera->raster_to_screen = df_inverse_transform(camera->screen_to_raster);
/* Perspective projection matrix */
df_m4 persp = {0.f};
DF_M4_AT(persp, 0, 0) = 1.f;
DF_M4_AT(persp, 1, 1) = 1.f;
DF_M4_AT(persp, 2, 2) = far_dist / (far_dist - near_dist);
DF_M4_AT(persp, 2, 3) = -1.f * far_dist * near_dist / (far_dist - near_dist);
DF_M4_AT(persp, 3, 2) = 1.f;
camera->raster_to_camera = df_mul_m4(df_inverse(persp), camera->raster_to_screen);
camera->focal_dist = focal_dist;
camera->lens_radius = lens_radius;
camera->raster_width = raster_width;
camera->raster_height = raster_height;
camera->image_width = image_width;
camera->image_height = image_height;
df_camera_i iface = {.release = pc_release, .build_ray_packet = pc_build_ray_packet, .o = camera};
return iface;
}

View File

@ -17,7 +17,7 @@ struct df_image
uint8_t *pixels;
};
df_result df_create_image(int w, int h, df_image **out_image)
DF_API df_result df_create_image(int w, int h, df_image **out_image)
{
df_image *img = malloc(sizeof(df_image));
if (!img)
@ -37,7 +37,7 @@ df_result df_create_image(int w, int h, df_image **out_image)
return df_result_success;
}
df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image)
DF_API df_result df_load_image(const char *path, int *out_w, int *out_h, df_image **out_image)
{
int w, h, c;
stbi_uc *pixels = stbi_load(path, &w, &h, &c, 4);
@ -69,7 +69,7 @@ out:
return res;
}
void df_release_image(df_image *img)
DF_API void df_release_image(df_image *img)
{
if (img) {
free(img->pixels);
@ -77,7 +77,7 @@ void df_release_image(df_image *img)
}
}
df_result df_write_image(df_image *image, const char *path)
DF_API df_result df_write_image(df_image *image, const char *path)
{
df_result res = stbi_write_png(path, image->width, image->height, 4, image->pixels, image->width * 4)
? df_result_success
@ -85,7 +85,7 @@ df_result df_write_image(df_image *image, const char *path)
return res;
}
void df_get_image_size(const df_image *image, int *w, int *h)
DF_API void df_get_image_size(const df_image *image, int *w, int *h)
{
if (w)
*w = image->width;
@ -93,7 +93,7 @@ void df_get_image_size(const df_image *image, int *w, int *h)
*h = image->height;
}
df_color df_get_image_pixel(const df_image *image, int x, int y)
DF_API df_color df_get_image_pixel(const df_image *image, int x, int y)
{
df_color c = {0, 0, 0, 255};
if (x >= 0 && x < image->width && y >= 0 && y < image->height) {
@ -102,7 +102,7 @@ df_color df_get_image_pixel(const df_image *image, int x, int y)
return c;
}
void df_set_image_pixel(df_image *image, int x, int y, df_color c)
DF_API void df_set_image_pixel(df_image *image, int x, int y, df_color c)
{
if (x >= 0 && x < image->width && y >= 0 && y < image->height) {
memcpy(&image->pixels[4 * (y * image->width + x)], &c.e[0], 4);

View File

@ -4,7 +4,7 @@
static const char *log_level_names[] = {"VERBOSE", "INFO", "WARN", "ERROR"};
void df_log_impl(int level, const char *file, int line, const char *fmt, ...)
DF_API void df_log_impl(int level, const char *file, int line, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);

View File

@ -5,27 +5,27 @@
#include <immintrin.h>
#include <pmmintrin.h>
df_v2 df_add_v2(df_v2 a, df_v2 b)
DF_API df_v2 df_add_v2(df_v2 a, df_v2 b)
{
df_v2 v = {a.x + b.x, a.y + b.y};
return v;
}
df_v2 df_sub_v2(df_v2 a, df_v2 b)
DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b)
{
df_v2 v = {a.x - b.x, a.y - b.y};
return v;
}
float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; }
DF_API float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; }
df_v2 df_mul_v2(float t, df_v2 v)
DF_API df_v2 df_mul_v2(float t, df_v2 v)
{
df_v2 r = {t * v.x, t * v.y};
return r;
}
df_v2 df_normalize_v2(df_v2 v)
DF_API df_v2 df_normalize_v2(df_v2 v)
{
float len_square = df_dot_v2(v, v);
float len = sqrtf(len_square);
@ -33,27 +33,27 @@ df_v2 df_normalize_v2(df_v2 v)
return n;
}
df_v3 df_add_v3(df_v3 a, df_v3 b)
DF_API df_v3 df_add_v3(df_v3 a, df_v3 b)
{
df_v3 v = {a.x + b.x, a.y + b.y, a.z + b.z};
return v;
}
df_v3 df_sub_v3(df_v3 a, df_v3 b)
DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b)
{
df_v3 v = {a.x - b.x, a.y - b.y, a.z - b.z};
return v;
}
float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
DF_API float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
df_v3 df_mul_v3(float t, df_v3 v)
DF_API df_v3 df_mul_v3(float t, df_v3 v)
{
df_v3 r = {t * v.x, t * v.y, t * v.z};
return r;
}
df_v3 df_normalize_v3(df_v3 v)
DF_API df_v3 df_normalize_v3(df_v3 v)
{
float len_square = df_dot_v3(v, v);
float len = sqrtf(len_square);
@ -61,7 +61,7 @@ df_v3 df_normalize_v3(df_v3 v)
return n;
}
df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)
DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)
{
/* check case */
float dot = df_dot_v3(line.direction, plane.normal);
@ -90,7 +90,7 @@ df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plan
}
}
df_m4 df_mul_m4(df_m4 a, df_m4 b)
DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b)
{
/* Super simple, we could probably do it a lot better via SIMD. */
df_m4 p;
@ -105,7 +105,7 @@ df_m4 df_mul_m4(df_m4 a, df_m4 b)
return p;
}
df_m4 df_scale(float x, float y, float z)
DF_API df_m4 df_scale(float x, float y, float z)
{
/* clang-format off */
df_m4 s = {{
@ -118,7 +118,7 @@ df_m4 df_scale(float x, float y, float z)
return s;
}
df_m4 df_translate(float x, float y, float z)
DF_API df_m4 df_translate(float x, float y, float z)
{
/* clang-format off */
df_m4 t = {{
@ -153,7 +153,7 @@ static float hsum(__m128 v)
return _mm_cvtss_f32(sum);
}
df_v3 df_transform_v3(df_m4 T, df_v3 v)
DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v)
{
df_v3 transf;
_Alignas(16) float tmp_v[4] = {v.x, v.y, v.z, 1.f};
@ -182,56 +182,145 @@ df_v3 df_transform_v3(df_m4 T, df_v3 v)
* | R T |
* | 0 1 |
*/
df_m4 df_inverse_transform_no_scale(df_m4 M)
DF_API df_m4 df_inverse_transform_no_scale(df_m4 M)
{
df_m4 I = {0.f};
/* transpose 3x3, we know that m03 = m13 = m23 = 0 */
/* transpose 3x3 */
__m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
__m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */
I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */
/* last */
I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));
I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));
I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));
I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);
__m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f);
DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t));
DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t));
DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t));
I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f);
return I;
}
df_m4 df_inverse_transform(df_m4 M)
DF_API df_m4 df_inverse_transform(df_m4 M)
{
#define SMALL_NUMBER (1.e-8f)
df_m4 I = {0.f};
/* transpose 3x3, we know that m03 = m13 = m23 = 0 */
/* transpose 3x3 */
__m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
__m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */
I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */
/* divide by the squared scale */
__m128 size_sqr = _mm_mul_ps(I.vec[0], I.vec[0]);
size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[1], I.vec[1]));
size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[2], I.vec[2]));
__m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f);
DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t));
__m128 r_size_sqr = _mm_div_ps(_mm_set1_ps(1.f), size_sqr);
I.vec[0] = _mm_mul_ps(I.vec[0], r_size_sqr);
I.vec[1] = _mm_mul_ps(I.vec[1], r_size_sqr);
I.vec[2] = _mm_mul_ps(I.vec[2], r_size_sqr);
/* last */
I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));
I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));
I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));
I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);
DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t));
DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t));
I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f);
return I;
#undef SMALL_NUMBER
}
// for row major matrix
// we use __m128 to represent 2x2 matrix as A = | A0 A1 |
// | A2 A3 |
// 2x2 row major Matrix multiply A*B
static __forceinline __m128 mat2_mul(__m128 vec1, __m128 vec2)
{
return _mm_add_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 0, 3, 0, 3)),
_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1)));
}
// 2x2 row major Matrix adjugate multiply (A#)*B
static __forceinline __m128 mat2_adj_mul(__m128 vec1, __m128 vec2)
{
return _mm_sub_ps(_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 3, 3, 0, 0), vec2),
_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 1, 2, 2), DF_VEC_SWIZZLE(vec2, 2, 3, 0, 1)));
}
// 2x2 row major Matrix multiply adjugate A*(B#)
static __forceinline __m128 mat2_mul_adj(__m128 vec1, __m128 vec2)
{
return _mm_sub_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 3, 0, 3, 0)),
_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1)));
}
// Calculate the inverse of a general 4x4 matrix M.
//
// The matrix is split into four 2x2 blocks and the inverse is assembled from
// the adjugates of the blocks, scaled by 1/|M|.
// There is no check for a singular matrix: the result is divided by |M|
// unconditionally, so M has to be invertible.
DF_API df_m4 df_inverse( df_m4 M ) {
// use block matrix method
// A is a matrix, then i(A) or iA means inverse of A, A# (or A_ in code) means adjugate of A, |A| (or detA in code)
// is determinant, tr(A) is trace
// sub matrices
// (A and B are built from rows 0 and 1, C and D from rows 2 and 3)
__m128 A = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]);
__m128 B = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]);
__m128 C = DF_VEC_SHUFFLE_0101(M.vec[2], M.vec[3]);
__m128 D = DF_VEC_SHUFFLE_2323(M.vec[2], M.vec[3]);
// The disabled branch is the scalar formulation of the four 2x2 determinants,
// the active branch computes all four at once.
#if 0
__m128 detA = _mm_set1_ps(M.m[0][0] * M.m[1][1] - M.m[0][1] * M.m[1][0]);
__m128 detB = _mm_set1_ps(M.m[0][2] * M.m[1][3] - M.m[0][3] * M.m[1][2]);
__m128 detC = _mm_set1_ps(M.m[2][0] * M.m[3][1] - M.m[2][1] * M.m[3][0]);
__m128 detD = _mm_set1_ps(M.m[2][2] * M.m[3][3] - M.m[2][3] * M.m[3][2]);
#else
// determinant as (|A| |B| |C| |D|)
__m128 detSub = _mm_sub_ps(
_mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 0, 2, 0, 2), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 1, 3, 1, 3)),
_mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 1, 3, 1, 3), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 0, 2, 0, 2)));
// broadcast each determinant to all four lanes
__m128 detA = DF_VEC_SWIZZLE1(detSub, 0);
__m128 detB = DF_VEC_SWIZZLE1(detSub, 1);
__m128 detC = DF_VEC_SWIZZLE1(detSub, 2);
__m128 detD = DF_VEC_SWIZZLE1(detSub, 3);
#endif
// let iM = 1/|M| * | X Y |
// | Z W |
// D#C
__m128 D_C = mat2_adj_mul(D, C);
// A#B
__m128 A_B = mat2_adj_mul(A, B);
// X# = |D|A - B(D#C)
__m128 X_ = _mm_sub_ps(_mm_mul_ps(detD, A), mat2_mul(B, D_C));
// W# = |A|D - C(A#B)
__m128 W_ = _mm_sub_ps(_mm_mul_ps(detA, D), mat2_mul(C, A_B));
// |M| = |A|*|D| + ... (continue later)
__m128 detM = _mm_mul_ps(detA, detD);
// Y# = |B|C - D(A#B)#
__m128 Y_ = _mm_sub_ps(_mm_mul_ps(detB, C), mat2_mul_adj(D, A_B));
// Z# = |C|B - A(D#C)#
__m128 Z_ = _mm_sub_ps(_mm_mul_ps(detC, B), mat2_mul_adj(A, D_C));
// |M| = |A|*|D| + |B|*|C| ... (continue later)
detM = _mm_add_ps(detM, _mm_mul_ps(detB, detC));
// tr((A#B)(D#C))
__m128 tr = _mm_mul_ps(A_B, DF_VEC_SWIZZLE(D_C, 0, 2, 1, 3));
// two horizontal adds leave the sum of all four products in every lane
tr = _mm_hadd_ps(tr, tr);
tr = _mm_hadd_ps(tr, tr);
// |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C))
detM = _mm_sub_ps(detM, tr);
const __m128 adjSignMask = _mm_setr_ps(1.f, -1.f, -1.f, 1.f);
// (1/|M|, -1/|M|, -1/|M|, 1/|M|)
__m128 rDetM = _mm_div_ps(adjSignMask, detM);
X_ = _mm_mul_ps(X_, rDetM);
Y_ = _mm_mul_ps(Y_, rDetM);
Z_ = _mm_mul_ps(Z_, rDetM);
W_ = _mm_mul_ps(W_, rDetM);
df_m4 r;
// apply adjugate and store, here we combine adjugate shuffle and store shuffle
r.vec[0] = DF_VEC_SHUFFLE(X_, Y_, 3, 1, 3, 1);
r.vec[1] = DF_VEC_SHUFFLE(X_, Y_, 2, 0, 2, 0);
r.vec[2] = DF_VEC_SHUFFLE(Z_, W_, 3, 1, 3, 1);
r.vec[3] = DF_VEC_SHUFFLE(Z_, W_, 2, 0, 2, 0);
return r;
}

51
lib/raytracing.c Normal file
View File

@ -0,0 +1,51 @@
#include <defocus/camera.h>
#include <immintrin.h>
/** @brief Evaluate all rays of a packet.
 *
 * Simple test kernel: every ray is intersected with the plane z = 350.
 * The sample points are computed but neither stored nor returned.
 *
 * Full groups of four rays are processed with SSE, the remaining (< 4) rays
 * with scalar code, so the function never reads past ray_count elements.
 *
 * @param rays the ray packet; it is not modified
 */
DF_API void df_evaluate_ray_packet(const df_ray_packet *rays)
{
    /* Simple test: Let rays intersect with plane at z = 350.0 */
    const float PLANE_Z  = 350.0f;
    const __m128 plane_z = _mm_set1_ps(PLANE_Z);

    const size_t ray_count  = rays->ray_count;
    const size_t simd_count = ray_count - (ray_count % 4);

    /* TODO(kevin): divide to multiple threads */
    for (size_t i = 0; i < simd_count; i += 4) {
        /* i is a ray (= float) index, so the four lanes of this step start at
         * element i of each component array. Unaligned loads keep this
         * correct even if simd_mem came from a plain malloc. */
        __m128 rays_base_x = _mm_loadu_ps(rays->base_x + i);
        __m128 rays_base_y = _mm_loadu_ps(rays->base_y + i);
        __m128 rays_base_z = _mm_loadu_ps(rays->base_z + i);
        __m128 rays_dir_x  = _mm_loadu_ps(rays->dir_x + i);
        __m128 rays_dir_y  = _mm_loadu_ps(rays->dir_y + i);
        __m128 rays_dir_z  = _mm_loadu_ps(rays->dir_z + i);

        /* Solve for t: base.z + t * dir.z = plane_z
         * t = (plane_z - base.z) / dir.z
         */
        __m128 delta = _mm_sub_ps(plane_z, rays_base_z);
        __m128 t     = _mm_div_ps(delta, rays_dir_z);

        /* Sample p = base + t * dir */
        __m128 sample_p_x = _mm_mul_ps(t, rays_dir_x);
        __m128 sample_p_y = _mm_mul_ps(t, rays_dir_y);
        __m128 sample_p_z = _mm_mul_ps(t, rays_dir_z);
        sample_p_x        = _mm_add_ps(sample_p_x, rays_base_x);
        sample_p_y        = _mm_add_ps(sample_p_y, rays_base_y);
        sample_p_z        = _mm_add_ps(sample_p_z, rays_base_z);

        /* Results are not consumed yet. */
        (void)sample_p_x;
        (void)sample_p_y;
        (void)sample_p_z;
    }

    /* Handle remaining (< 4) rays */
    for (size_t i = simd_count; i < ray_count; ++i) {
        float t = (PLANE_Z - rays->base_z[i]) / rays->dir_z[i];

        float sample_p_x = rays->base_x[i] + t * rays->dir_x[i];
        float sample_p_y = rays->base_y[i] + t * rays->dir_y[i];
        float sample_p_z = rays->base_z[i] + t * rays->dir_z[i];

        /* Results are not consumed yet. */
        (void)sample_p_x;
        (void)sample_p_y;
        (void)sample_p_z;
    }
}

View File

@ -5,7 +5,9 @@ incdir = include_directories('include', '3p')
cc = meson.get_compiler('c')
m_dep = cc.find_library('m', required: false)
add_project_arguments([ '-msse3', '-Wno-missing-braces' ], language: 'c')
if cc.get_id() == 'gcc' or cc.get_id() == 'clang'
add_project_arguments([ '-msse3', '-msse4.1', '-Wno-missing-braces' ], language: 'c')
endif
lib = library('df',
'lib/log.c',
@ -15,14 +17,21 @@ lib = library('df',
'lib/image.c',
'lib/color.c',
'lib/thin_lense.c',
'lib/raytracing.c',
'include/defocus/base.h',
'include/defocus/camera.h',
'include/defocus/defocus.h',
'include/defocus/image.h',
'include/defocus/intrinsic_helper.h',
'include/defocus/models.h',
'include/defocus/scene.h',
'include/defocus/raytracing.h',
include_directories: incdir,
dependencies: m_dep,
version: '0.1.0',
soversion: '0')
dependencies: m_dep)
# Command Line Executable
executable('defocus', 'bin/defocus.c', include_directories: incdir, link_with: lib)
# Test driver
munit_dep = dependency('munit', fallback: ['munit', 'munit_dep'])
executable('tests', 'tests/tests.c', include_directories: incdir, link_with: lib, dependencies: munit_dep)
#munit_dep = dependency('munit', fallback: ['munit', 'munit_dep'])
#executable('tests', 'tests/tests.c', include_directories: incdir, link_with: lib, dependencies: munit_dep)