327 lines
9.7 KiB
C
327 lines
9.7 KiB
C
#include <defocus/base.h>
|
|
#include <defocus/intrinsic_helper.h>
|
|
|
|
#include <math.h>
|
|
#include <immintrin.h>
|
|
#include <pmmintrin.h>
|
|
|
|
DF_API df_v2 df_add_v2(df_v2 a, df_v2 b)
|
|
{
|
|
df_v2 v = {a.x + b.x, a.y + b.y};
|
|
return v;
|
|
}
|
|
|
|
DF_API df_v2 df_sub_v2(df_v2 a, df_v2 b)
|
|
{
|
|
df_v2 v = {a.x - b.x, a.y - b.y};
|
|
return v;
|
|
}
|
|
|
|
DF_API float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; }
|
|
|
|
DF_API df_v2 df_mul_v2(float t, df_v2 v)
|
|
{
|
|
df_v2 r = {t * v.x, t * v.y};
|
|
return r;
|
|
}
|
|
|
|
DF_API df_v2 df_normalize_v2(df_v2 v)
|
|
{
|
|
float len_square = df_dot_v2(v, v);
|
|
float len = sqrtf(len_square);
|
|
df_v2 n = {v.x / len, v.y / len};
|
|
return n;
|
|
}
|
|
|
|
DF_API df_v3 df_add_v3(df_v3 a, df_v3 b)
|
|
{
|
|
df_v3 v = {a.x + b.x, a.y + b.y, a.z + b.z};
|
|
return v;
|
|
}
|
|
|
|
DF_API df_v3 df_sub_v3(df_v3 a, df_v3 b)
|
|
{
|
|
df_v3 v = {a.x - b.x, a.y - b.y, a.z - b.z};
|
|
return v;
|
|
}
|
|
|
|
DF_API float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }
|
|
|
|
DF_API df_v3 df_mul_v3(float t, df_v3 v)
|
|
{
|
|
df_v3 r = {t * v.x, t * v.y, t * v.z};
|
|
return r;
|
|
}
|
|
|
|
DF_API df_v3 df_normalize_v3(df_v3 v)
|
|
{
|
|
float len_square = df_dot_v3(v, v);
|
|
float len = sqrtf(len_square);
|
|
df_v3 n = {v.x / len, v.y / len, v.z / len};
|
|
return n;
|
|
}
|
|
|
|
DF_API df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)
|
|
{
|
|
/* check case */
|
|
float dot = df_dot_v3(line.direction, plane.normal);
|
|
df_v3 base_diff = df_sub_v3(plane.base, line.base);
|
|
float dot2 = df_dot_v3(base_diff, plane.normal);
|
|
if (dot != 0) {
|
|
float t = dot2 / dot;
|
|
df_v3 point = df_add_v3(line.base, df_mul_v3(t, line.direction));
|
|
df_line_plane_intersection intersection = {
|
|
.type = df_line_plane_intersection_vector,
|
|
.vec = point,
|
|
};
|
|
return intersection;
|
|
} else {
|
|
if (dot2 == 0) {
|
|
df_line_plane_intersection intersection = {
|
|
.type = df_line_plane_intersection_contained,
|
|
};
|
|
return intersection;
|
|
} else {
|
|
df_line_plane_intersection intersection = {
|
|
.type = df_line_plane_intersection_none,
|
|
};
|
|
return intersection;
|
|
}
|
|
}
|
|
}
|
|
|
|
DF_API df_m4 df_mul_m4(df_m4 a, df_m4 b)
|
|
{
|
|
/* Super simple, we could probably do it a lot better via SIMD. */
|
|
df_m4 p;
|
|
for (int row = 0; row < 4; ++row) {
|
|
for (int col = 0; col < 4; ++col) {
|
|
DF_M4_AT(p, row, col) = 0.f;
|
|
for (int i = 0; i < 4; ++i) {
|
|
DF_M4_AT(p, row, col) += DF_M4_AT(a, row, i) * DF_M4_AT(b, i, col);
|
|
}
|
|
}
|
|
}
|
|
return p;
|
|
}
|
|
|
|
DF_API df_m4 df_scale(float x, float y, float z)
|
|
{
|
|
/* clang-format off */
|
|
df_m4 s = {{
|
|
x, 0.f, 0.f, 0.f,
|
|
0.f, y, 0.f, 0.f,
|
|
0.f, 0.f, z, 0.f,
|
|
0.f, 0.f, 0.f, 1.f,
|
|
}};
|
|
/* clang-format on */
|
|
return s;
|
|
}
|
|
|
|
DF_API df_m4 df_translate(float x, float y, float z)
|
|
{
|
|
/* clang-format off */
|
|
df_m4 t = {{
|
|
1.f, 0.f, 0.f, x,
|
|
0.f, 1.f, 0.f, y,
|
|
0.f, 0.f, 1.f, z,
|
|
0.f, 0.f, 0.f, 1.f,
|
|
}};
|
|
/* clang-format on */
|
|
return t;
|
|
}
|
|
|
|
/** SSE3 version of a horizontal sum:
|
|
* computes the sum of 4 32bit floats inside v
|
|
*/
|
|
static float hsum(__m128 v)
|
|
{
|
|
/* v = [v0, v1, v2, v3] */
|
|
|
|
/* shuf = [v1, v1, v3, v3] */
|
|
__m128 shuf = _mm_movehdup_ps(v);
|
|
|
|
/* sum = [v0 + v1, 2*v1, v2 + v3, 2*v3] */
|
|
__m128 sum = _mm_add_ps(v, shuf);
|
|
|
|
/* shuf = [v2 + v3, 2*v3, v3, v3] */
|
|
shuf = _mm_movehl_ps(shuf, sum);
|
|
|
|
/* sum = [v0 + v1 + v2 + v3, ...] */
|
|
sum = _mm_add_ss(sum, shuf);
|
|
|
|
return _mm_cvtss_f32(sum);
|
|
}
|
|
|
|
DF_API df_v3 df_transform_v3(df_m4 T, df_v3 v)
|
|
{
|
|
df_v3 transf;
|
|
_Alignas(16) float tmp_v[4] = {v.x, v.y, v.z, 1.f};
|
|
__m128 homov = _mm_load_ps(tmp_v);
|
|
|
|
__m128 row0 = _mm_load_ps(&T.e[0]);
|
|
__m128 row1 = _mm_load_ps(&T.e[4]);
|
|
__m128 row2 = _mm_load_ps(&T.e[8]);
|
|
|
|
__m128 prod = _mm_mul_ps(row0, homov);
|
|
transf.x = hsum(prod);
|
|
prod = _mm_mul_ps(row1, homov);
|
|
transf.y = hsum(prod);
|
|
prod = _mm_mul_ps(row2, homov);
|
|
transf.z = hsum(prod);
|
|
|
|
return transf;
|
|
}
|
|
|
|
/* Fast 4x4 matrix inverse via SIMD adapted from:
|
|
* https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
|
|
*/
|
|
|
|
/* Only works if M is a transform matrix without scale:
|
|
*
|
|
* | R T |
|
|
* | 0 1 |
|
|
*/
|
|
DF_API df_m4 df_inverse_transform_no_scale(df_m4 M)
|
|
{
|
|
df_m4 I = {0.f};
|
|
|
|
/* transpose 3x3 */
|
|
__m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
|
|
__m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
|
|
I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */
|
|
I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
|
|
I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */
|
|
|
|
__m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f);
|
|
DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t));
|
|
|
|
DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t));
|
|
DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t));
|
|
I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f);
|
|
|
|
return I;
|
|
}
|
|
|
|
DF_API df_m4 df_inverse_transform(df_m4 M)
|
|
{
|
|
df_m4 I = {0.f};
|
|
|
|
/* transpose 3x3 */
|
|
__m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
|
|
__m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
|
|
I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */
|
|
I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
|
|
I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */
|
|
|
|
__m128 t = _mm_setr_ps(DF_M4_AT(M, 0, 3), DF_M4_AT(M, 1, 3), DF_M4_AT(M, 2, 3), 0.f);
|
|
DF_M4_AT(I, 0, 3) = -1.f * hsum(_mm_mul_ps(I.vec[0], t));
|
|
|
|
DF_M4_AT(I, 1, 3) = -1.f * hsum(_mm_mul_ps(I.vec[1], t));
|
|
DF_M4_AT(I, 2, 3) = -1.f * hsum(_mm_mul_ps(I.vec[2], t));
|
|
I.vec[3] = _mm_setr_ps(0.f, 0.f, 0.f, 1.f);
|
|
|
|
return I;
|
|
}
|
|
|
|
|
|
|
|
// for row major matrix
|
|
// we use __m128 to represent 2x2 matrix as A = | A0 A1 |
|
|
// | A2 A3 |
|
|
// 2x2 row major Matrix multiply A*B
|
|
static __forceinline __m128 mat2_mul(__m128 vec1, __m128 vec2)
|
|
{
|
|
return _mm_add_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 0, 3, 0, 3)),
|
|
_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1)));
|
|
}
|
|
// 2x2 row major Matrix adjugate multiply (A#)*B
|
|
static __forceinline __m128 mat2_adj_mul(__m128 vec1, __m128 vec2)
|
|
{
|
|
return _mm_sub_ps(_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 3, 3, 0, 0), vec2),
|
|
_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 1, 2, 2), DF_VEC_SWIZZLE(vec2, 2, 3, 0, 1)));
|
|
}
|
|
// 2x2 row major Matrix multiply adjugate A*(B#)
|
|
static __forceinline __m128 mat2_mul_adj(__m128 vec1, __m128 vec2)
|
|
{
|
|
return _mm_sub_ps(_mm_mul_ps(vec1, DF_VEC_SWIZZLE(vec2, 3, 0, 3, 0)),
|
|
_mm_mul_ps(DF_VEC_SWIZZLE(vec1, 1, 0, 3, 2), DF_VEC_SWIZZLE(vec2, 2, 1, 2, 1)));
|
|
}
|
|
|
|
DF_API df_m4 df_inverse( df_m4 M ) {
|
|
// use block matrix method
|
|
// A is a matrix, then i(A) or iA means inverse of A, A# (or A_ in code) means adjugate of A, |A| (or detA in code)
|
|
// is determinant, tr(A) is trace
|
|
|
|
// sub matrices
|
|
__m128 A = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]);
|
|
__m128 B = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]);
|
|
__m128 C = DF_VEC_SHUFFLE_0101(M.vec[2], M.vec[3]);
|
|
__m128 D = DF_VEC_SHUFFLE_2323(M.vec[2], M.vec[3]);
|
|
|
|
#if 0
|
|
__m128 detA = _mm_set1_ps(M.m[0][0] * M.m[1][1] - M.m[0][1] * M.m[1][0]);
|
|
__m128 detB = _mm_set1_ps(M.m[0][2] * M.m[1][3] - M.m[0][3] * M.m[1][2]);
|
|
__m128 detC = _mm_set1_ps(M.m[2][0] * M.m[3][1] - M.m[2][1] * M.m[3][0]);
|
|
__m128 detD = _mm_set1_ps(M.m[2][2] * M.m[3][3] - M.m[2][3] * M.m[3][2]);
|
|
#else
|
|
// determinant as (|A| |B| |C| |D|)
|
|
__m128 detSub = _mm_sub_ps(
|
|
_mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 0, 2, 0, 2), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 1, 3, 1, 3)),
|
|
_mm_mul_ps(DF_VEC_SHUFFLE(M.vec[0], M.vec[2], 1, 3, 1, 3), DF_VEC_SHUFFLE(M.vec[1], M.vec[3], 0, 2, 0, 2)));
|
|
__m128 detA = DF_VEC_SWIZZLE1(detSub, 0);
|
|
__m128 detB = DF_VEC_SWIZZLE1(detSub, 1);
|
|
__m128 detC = DF_VEC_SWIZZLE1(detSub, 2);
|
|
__m128 detD = DF_VEC_SWIZZLE1(detSub, 3);
|
|
#endif
|
|
|
|
// let iM = 1/|M| * | X Y |
|
|
// | Z W |
|
|
|
|
// D#C
|
|
__m128 D_C = mat2_adj_mul(D, C);
|
|
// A#B
|
|
__m128 A_B = mat2_adj_mul(A, B);
|
|
// X# = |D|A - B(D#C)
|
|
__m128 X_ = _mm_sub_ps(_mm_mul_ps(detD, A), mat2_mul(B, D_C));
|
|
// W# = |A|D - C(A#B)
|
|
__m128 W_ = _mm_sub_ps(_mm_mul_ps(detA, D), mat2_mul(C, A_B));
|
|
|
|
// |M| = |A|*|D| + ... (continue later)
|
|
__m128 detM = _mm_mul_ps(detA, detD);
|
|
|
|
// Y# = |B|C - D(A#B)#
|
|
__m128 Y_ = _mm_sub_ps(_mm_mul_ps(detB, C), mat2_mul_adj(D, A_B));
|
|
// Z# = |C|B - A(D#C)#
|
|
__m128 Z_ = _mm_sub_ps(_mm_mul_ps(detC, B), mat2_mul_adj(A, D_C));
|
|
|
|
// |M| = |A|*|D| + |B|*|C| ... (continue later)
|
|
detM = _mm_add_ps(detM, _mm_mul_ps(detB, detC));
|
|
|
|
// tr((A#B)(D#C))
|
|
__m128 tr = _mm_mul_ps(A_B, DF_VEC_SWIZZLE(D_C, 0, 2, 1, 3));
|
|
tr = _mm_hadd_ps(tr, tr);
|
|
tr = _mm_hadd_ps(tr, tr);
|
|
// |M| = |A|*|D| + |B|*|C| - tr((A#B)(D#C)
|
|
detM = _mm_sub_ps(detM, tr);
|
|
|
|
const __m128 adjSignMask = _mm_setr_ps(1.f, -1.f, -1.f, 1.f);
|
|
// (1/|M|, -1/|M|, -1/|M|, 1/|M|)
|
|
__m128 rDetM = _mm_div_ps(adjSignMask, detM);
|
|
|
|
X_ = _mm_mul_ps(X_, rDetM);
|
|
Y_ = _mm_mul_ps(Y_, rDetM);
|
|
Z_ = _mm_mul_ps(Z_, rDetM);
|
|
W_ = _mm_mul_ps(W_, rDetM);
|
|
|
|
df_m4 r;
|
|
|
|
// apply adjugate and store, here we combine adjugate shuffle and store shuffle
|
|
r.vec[0] = DF_VEC_SHUFFLE(X_, Y_, 3, 1, 3, 1);
|
|
r.vec[1] = DF_VEC_SHUFFLE(X_, Y_, 2, 0, 2, 0);
|
|
r.vec[2] = DF_VEC_SHUFFLE(Z_, W_, 3, 1, 3, 1);
|
|
r.vec[3] = DF_VEC_SHUFFLE(Z_, W_, 2, 0, 2, 0);
|
|
|
|
return r;
|
|
}
|