#include #include #include #include #include df_v2 df_add_v2(df_v2 a, df_v2 b) { df_v2 v = {a.x + b.x, a.y + b.y}; return v; } df_v2 df_sub_v2(df_v2 a, df_v2 b) { df_v2 v = {a.x - b.x, a.y - b.y}; return v; } float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; } df_v2 df_mul_v2(float t, df_v2 v) { df_v2 r = {t * v.x, t * v.y}; return r; } df_v2 df_normalize_v2(df_v2 v) { float len_square = df_dot_v2(v, v); float len = sqrtf(len_square); df_v2 n = {v.x / len, v.y / len}; return n; } df_v3 df_add_v3(df_v3 a, df_v3 b) { df_v3 v = {a.x + b.x, a.y + b.y, a.z + b.z}; return v; } df_v3 df_sub_v3(df_v3 a, df_v3 b) { df_v3 v = {a.x - b.x, a.y - b.y, a.z - b.z}; return v; } float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; } df_v3 df_mul_v3(float t, df_v3 v) { df_v3 r = {t * v.x, t * v.y, t * v.z}; return r; } df_v3 df_normalize_v3(df_v3 v) { float len_square = df_dot_v3(v, v); float len = sqrtf(len_square); df_v3 n = {v.x / len, v.y / len, v.z / len}; return n; } df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane) { /* check case */ float dot = df_dot_v3(line.direction, plane.normal); df_v3 base_diff = df_sub_v3(plane.base, line.base); float dot2 = df_dot_v3(base_diff, plane.normal); if (dot != 0) { float t = dot2 / dot; df_v3 point = df_add_v3(line.base, df_mul_v3(t, line.direction)); df_line_plane_intersection intersection = { .type = df_line_plane_intersection_vector, .vec = point, }; return intersection; } else { if (dot2 == 0) { df_line_plane_intersection intersection = { .type = df_line_plane_intersection_contained, }; return intersection; } else { df_line_plane_intersection intersection = { .type = df_line_plane_intersection_none, }; return intersection; } } } df_m4 df_mul_m4(df_m4 a, df_m4 b) { /* Super simple, we could probably do it a lot better via SIMD. */ df_m4 p; for (int row = 0; row < 4; ++row) { for (int col = 0; col < 4; ++col) { DF_M4_AT(p, row, col) = 0.f; for (int i = 0; i < 4; ++i) { DF_M4_AT(p, row, col) += DF_M4_AT(a, row, i) * DF_M4_AT(b, i, col); } } } return p; } df_m4 df_scale(float x, float y, float z) { /* clang-format off */ df_m4 s = {{ x, 0.f, 0.f, 0.f, 0.f, y, 0.f, 0.f, 0.f, 0.f, z, 0.f, 0.f, 0.f, 0.f, 1.f, }}; /* clang-format on */ return s; } df_m4 df_translate(float x, float y, float z) { /* clang-format off */ df_m4 t = {{ 1.f, 0.f, 0.f, x, 0.f, 1.f, 0.f, y, 0.f, 0.f, 1.f, z, 0.f, 0.f, 0.f, 1.f, }}; /* clang-format on */ return t; } /** SSE3 version of a horizontal sum: * computes the sum of 4 32bit floats inside v */ static float hsum(__m128 v) { /* v = [v0, v1, v2, v3] */ /* shuf = [v1, v1, v3, v3] */ __m128 shuf = _mm_movehdup_ps(v); /* sum = [v0 + v1, 2*v1, v2 + v3, 2*v3] */ __m128 sum = _mm_add_ps(v, shuf); /* shuf = [v2 + v3, 2*v3, v3, v3] */ shuf = _mm_movehl_ps(shuf, sum); /* sum = [v0 + v1 + v2 + v3, ...] */ sum = _mm_add_ss(sum, shuf); return _mm_cvtss_f32(sum); } df_v3 df_transform_v3(df_m4 T, df_v3 v) { df_v3 transf; _Alignas(16) float tmp_v[4] = {v.x, v.y, v.z, 1.f}; __m128 homov = _mm_load_ps(tmp_v); __m128 row0 = _mm_load_ps(&T.e[0]); __m128 row1 = _mm_load_ps(&T.e[4]); __m128 row2 = _mm_load_ps(&T.e[8]); __m128 prod = _mm_mul_ps(row0, homov); transf.x = hsum(prod); prod = _mm_mul_ps(row1, homov); transf.y = hsum(prod); prod = _mm_mul_ps(row2, homov); transf.z = hsum(prod); return transf; } /* Fast 4x4 matrix inverse via SIMD adapted from: * https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html */ /* Only works if M is a transform matrix without scale: * * | R T | * | 0 1 | */ df_m4 df_inverse_transform_no_scale(df_m4 M) { df_m4 I = {0.f}; /* transpose 3x3, we know that m03 = m13 = m23 = 0 */ __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */ __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */ I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */ I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */ I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */ /* last */ I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0)); I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1))); I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2))); I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]); return I; } df_m4 df_inverse_transform(df_m4 M) { #define SMALL_NUMBER (1.e-8f) df_m4 I = {0.f}; /* transpose 3x3, we know that m03 = m13 = m23 = 0 */ __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */ __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */ I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */ I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */ I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */ /* divide by the squared scale */ __m128 size_sqr = _mm_mul_ps(I.vec[0], I.vec[0]); size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[1], I.vec[1])); size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[2], I.vec[2])); __m128 r_size_sqr = _mm_div_ps(_mm_set1_ps(1.f), size_sqr); I.vec[0] = _mm_mul_ps(I.vec[0], r_size_sqr); I.vec[1] = _mm_mul_ps(I.vec[1], r_size_sqr); I.vec[2] = _mm_mul_ps(I.vec[2], r_size_sqr); /* last */ I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0)); I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1))); I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2))); I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]); return I; #undef SMALL_NUMBER }