defocus-modules/lib/math.c

#include <defocus/base.h>
#include <defocus/intrinsic_helper.h>

#include <math.h>
#include <immintrin.h>
#include <pmmintrin.h>

/** Component-wise sum of two 2D vectors: (a.x + b.x, a.y + b.y). */
df_v2 df_add_v2(df_v2 a, df_v2 b)
{
    return (df_v2){a.x + b.x, a.y + b.y};
}

/** Component-wise difference of two 2D vectors: (a.x - b.x, a.y - b.y). */
df_v2 df_sub_v2(df_v2 a, df_v2 b)
{
    return (df_v2){a.x - b.x, a.y - b.y};
}

/** Dot product of two 2D vectors: a.x * b.x + a.y * b.y. */
float df_dot_v2(df_v2 a, df_v2 b)
{
    /* Kept as one expression so the evaluation order stays as written. */
    const float dot = a.x * b.x + a.y * b.y;
    return dot;
}

/** Scales every component of the 2D vector v by the scalar t. */
df_v2 df_mul_v2(float t, df_v2 v)
{
    return (df_v2){t * v.x, t * v.y};
}

/**
 * Divides both components of v by its Euclidean length sqrt(v . v).
 *
 * A vector whose computed length is exactly zero has no direction. It is
 * returned unchanged instead of being divided by zero, which would fill
 * the result with NaN components.
 */
df_v2 df_normalize_v2(df_v2 v)
{
    float len_square = df_dot_v2(v, v);
    float len = sqrtf(len_square);
    if (len == 0.f) {
        /* 0 / 0 would be NaN; hand the zero vector back as-is. */
        return v;
    }
    df_v2 n = {v.x / len, v.y / len};
    return n;
}

/** Component-wise sum of two 3D vectors: (a.x + b.x, a.y + b.y, a.z + b.z). */
df_v3 df_add_v3(df_v3 a, df_v3 b)
{
    return (df_v3){a.x + b.x, a.y + b.y, a.z + b.z};
}

/** Component-wise difference of two 3D vectors: (a.x - b.x, a.y - b.y, a.z - b.z). */
df_v3 df_sub_v3(df_v3 a, df_v3 b)
{
    return (df_v3){a.x - b.x, a.y - b.y, a.z - b.z};
}

/** Dot product of two 3D vectors: a.x * b.x + a.y * b.y + a.z * b.z. */
float df_dot_v3(df_v3 a, df_v3 b)
{
    /* Kept as one expression so the evaluation order stays as written. */
    const float dot = a.x * b.x + a.y * b.y + a.z * b.z;
    return dot;
}

/** Scales every component of the 3D vector v by the scalar t. */
df_v3 df_mul_v3(float t, df_v3 v)
{
    return (df_v3){t * v.x, t * v.y, t * v.z};
}

/**
 * Divides all three components of v by its Euclidean length sqrt(v . v).
 *
 * A vector whose computed length is exactly zero has no direction. It is
 * returned unchanged instead of being divided by zero, which would fill
 * the result with NaN components.
 */
df_v3 df_normalize_v3(df_v3 v)
{
    float len_square = df_dot_v3(v, v);
    float len = sqrtf(len_square);
    if (len == 0.f) {
        /* 0 / 0 would be NaN; hand the zero vector back as-is. */
        return v;
    }
    df_v3 n = {v.x / len, v.y / len, v.z / len};
    return n;
}

/**
 * Intersects the line  line.base + t * line.direction  with the plane
 * through plane.base that has normal plane.normal.
 *
 * Result type:
 *  - df_line_plane_intersection_vector:    one intersection point, stored in .vec
 *  - df_line_plane_intersection_contained: direction is perpendicular to the
 *                                          normal and line.base lies in the plane
 *  - df_line_plane_intersection_none:      direction is perpendicular to the
 *                                          normal and line.base is off the plane
 *
 * Both tests compare against exactly 0; no tolerance is applied.
 */
df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)
{
    /* direction . normal: zero means the line runs parallel to the plane */
    const float denom = df_dot_v3(line.direction, plane.normal);
    /* (plane.base - line.base) . normal: zero means line.base lies in the plane */
    const float numer = df_dot_v3(df_sub_v3(plane.base, line.base), plane.normal);

    if (denom != 0) {
        /* Solve (line.base + t * direction - plane.base) . normal = 0 for t. */
        const float t = numer / denom;
        return (df_line_plane_intersection){
            .type = df_line_plane_intersection_vector,
            .vec = df_add_v3(line.base, df_mul_v3(t, line.direction)),
        };
    }

    if (numer == 0) {
        return (df_line_plane_intersection){
            .type = df_line_plane_intersection_contained,
        };
    }

    return (df_line_plane_intersection){
        .type = df_line_plane_intersection_none,
    };
}

/**
 * 4x4 matrix product a * b.
 *
 * Entry (row, col) of the result is the sum over k of a(row, k) * b(k, col),
 * accumulated for k = 0..3 starting from 0. Plain scalar loops; no SIMD.
 */
df_m4 df_mul_m4(df_m4 a, df_m4 b)
{
    df_m4 product;
    for (int row_idx = 0; row_idx < 4; ++row_idx) {
        for (int col_idx = 0; col_idx < 4; ++col_idx) {
            float acc = 0.f;
            for (int k = 0; k < 4; ++k) {
                acc += DF_M4_AT(a, row_idx, k) * DF_M4_AT(b, k, col_idx);
            }
            /* every one of the 16 entries is written exactly once */
            DF_M4_AT(product, row_idx, col_idx) = acc;
        }
    }
    return product;
}

df_m4 df_scale(float x, float y, float z)
{
    /* clang-format off */
    df_m4 s = {{
          x, 0.f, 0.f, 0.f,
        0.f,   y, 0.f, 0.f,
        0.f, 0.f,   z, 0.f,
        0.f, 0.f, 0.f, 1.f,
    }};
    /* clang-format on */
    return s;
}

df_m4 df_translate(float x, float y, float z)
{
    /* clang-format off */
    df_m4 t = {{
        1.f, 0.f, 0.f, x,
        0.f, 1.f, 0.f, y,
        0.f, 0.f, 1.f, z,
        0.f, 0.f, 0.f, 1.f,
    }};
    /* clang-format on */
    return t;
}

/** Horizontal sum (SSE3): returns v0 + v1 + v2 + v3 for v = [v0, v1, v2, v3].
 *
 * The additions are grouped as (v0 + v1) + (v2 + v3).
 */
static float hsum(__m128 v)
{
    /* duplicate the odd lanes: [v1, v1, v3, v3] */
    const __m128 odd = _mm_movehdup_ps(v);

    /* pairwise sums: [v0 + v1, 2*v1, v2 + v3, 2*v3] */
    const __m128 pairs = _mm_add_ps(v, odd);

    /* bring the upper pair sum down to lane 0: [v2 + v3, 2*v3, v3, v3] */
    const __m128 upper = _mm_movehl_ps(odd, pairs);

    /* only lane 0 matters from here on: (v0 + v1) + (v2 + v3) */
    return _mm_cvtss_f32(_mm_add_ss(pairs, upper));
}

/**
 * Applies T to the point v, extended to (v.x, v.y, v.z, 1).
 *
 * Each output component is the dot product of one group of four floats from
 * T.e with the extended point: e[0..3] gives x, e[4..7] gives y, e[8..11]
 * gives z. e[12..15] is not read, so no perspective divide takes place.
 */
df_v3 df_transform_v3(df_m4 T, df_v3 v)
{
    const __m128 point = _mm_setr_ps(v.x, v.y, v.z, 1.f);

    df_v3 out;
    /* _mm_load_ps needs 16-byte aligned addresses: &T.e[0], &T.e[4], &T.e[8] */
    out.x = hsum(_mm_mul_ps(_mm_load_ps(&T.e[0]), point));
    out.y = hsum(_mm_mul_ps(_mm_load_ps(&T.e[4]), point));
    out.z = hsum(_mm_mul_ps(_mm_load_ps(&T.e[8]), point));
    return out;
}

/* Fast 4x4 matrix inverse via SIMD adapted from:
 * https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
 */

/* Inverse of a transform matrix without scale:
 *
 * | R T |
 * | 0 1 |
 *
 * The upper-left 3x3 block is transposed, and the last vector of the result
 * is (0, 0, 0, 1) minus the transposed vectors weighted by lanes 0..2 of
 * M.vec[3].
 *
 * NOTE(review): this routine takes the translation from M.vec[3] and treats
 * lane 3 of M.vec[0..2] as zero, whereas df_translate() in this file appears
 * to store x/y/z as the 4th value of the first three groups of four.
 * Confirm which layout callers actually pass in.
 */
df_m4 df_inverse_transform_no_scale(df_m4 M)
{
    df_m4 I = {0.f};

    /* transpose 3x3, relying on m03 = m13 = m23 = 0 */
    const __m128 lo = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
    const __m128 hi = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
    I.vec[0] = DF_VEC_SHUFFLE(lo, M.vec[2], 0, 2, 0, 3); /* 00, 10, 20, 23(=0) */
    I.vec[1] = DF_VEC_SHUFFLE(lo, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
    I.vec[2] = DF_VEC_SHUFFLE(hi, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */

    /* last vector: (0, 0, 0, 1) - sum over k of I.vec[k] * M.vec[3][k] */
    __m128 weighted = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));
    weighted = _mm_add_ps(weighted, _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));
    weighted = _mm_add_ps(weighted, _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));
    I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), weighted);

    return I;
}

/* Inverse of a transform matrix that may contain scale.
 *
 * Same scheme as df_inverse_transform_no_scale, with one extra step: after
 * the 3x3 transpose, every lane of the transposed vectors is divided by that
 * lane's squared size (the sum of its squares over I.vec[0..2]).
 *
 * Lanes whose squared size is below SMALL_NUMBER are left unscaled. Lane 3
 * is expected to be zero (see the lane comments below), so without this
 * guard it is multiplied by 1/0 = inf, and 0 * inf puts a NaN into lane 3 of
 * every vector of the result.
 *
 * NOTE(review): this routine takes the translation from M.vec[3] and treats
 * lane 3 of M.vec[0..2] as zero, whereas df_translate() in this file appears
 * to store x/y/z as the 4th value of the first three groups of four.
 * Confirm which layout callers actually pass in.
 */
df_m4 df_inverse_transform(df_m4 M)
{
#define SMALL_NUMBER (1.e-8f)

    df_m4 I = {0.f};

    /* transpose 3x3, we know that m03 = m13 = m23 = 0 */
    __m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */
    __m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */
    I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 10, 20, 23(=0) */
    I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */
    I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */

    /* per-lane squared size: I.vec[0]^2 + I.vec[1]^2 + I.vec[2]^2 */
    __m128 size_sqr = _mm_mul_ps(I.vec[0], I.vec[0]);
    size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[1], I.vec[1]));
    size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[2], I.vec[2]));

    /* Per lane: size_sqr < SMALL_NUMBER ? 1 : 1 / size_sqr.
     * Done with an SSE1 mask select, so SSE4.1 blendv is not required. */
    const __m128 one = _mm_set1_ps(1.f);
    const __m128 too_small = _mm_cmplt_ps(size_sqr, _mm_set1_ps(SMALL_NUMBER));
    const __m128 r_size_sqr = _mm_or_ps(_mm_and_ps(too_small, one),
                                        _mm_andnot_ps(too_small, _mm_div_ps(one, size_sqr)));

    I.vec[0] = _mm_mul_ps(I.vec[0], r_size_sqr);
    I.vec[1] = _mm_mul_ps(I.vec[1], r_size_sqr);
    I.vec[2] = _mm_mul_ps(I.vec[2], r_size_sqr);

    /* last vector: (0, 0, 0, 1) - sum over k of I.vec[k] * M.vec[3][k] */
    I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));
    I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));
    I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));
    I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);

    return I;
#undef SMALL_NUMBER
}
Initial commit First (sort-of) working version of a pinhole camera model. Improvement idea: - Stretch result image to the whole size 2023-04-06 18:54:30 +02:00			`#include <defocus/base.h>`
dump state 2023-05-08 13:28:53 +02:00			`#include <defocus/intrinsic_helper.h>`
Initial commit First (sort-of) working version of a pinhole camera model. Improvement idea: - Stretch result image to the whole size 2023-04-06 18:54:30 +02:00
			`#include <math.h>`
dump state 2023-05-08 13:28:53 +02:00			`#include <immintrin.h>`
			`#include <pmmintrin.h>`

			`df_v2 df_add_v2(df_v2 a, df_v2 b)`
			`{`
			`df_v2 v = {a.x + b.x, a.y + b.y};`
			`return v;`
			`}`

			`df_v2 df_sub_v2(df_v2 a, df_v2 b)`
			`{`
			`df_v2 v = {a.x - b.x, a.y - b.y};`
			`return v;`
			`}`

			`float df_dot_v2(df_v2 a, df_v2 b) { return a.x * b.x + a.y * b.y; }`

			`df_v2 df_mul_v2(float t, df_v2 v)`
			`{`
			`df_v2 r = {t * v.x, t * v.y};`
			`return r;`
			`}`

			`df_v2 df_normalize_v2(df_v2 v)`
			`{`
			`float len_square = df_dot_v2(v, v);`
			`float len = sqrtf(len_square);`
			`df_v2 n = {v.x / len, v.y / len};`
			`return n;`
			`}`
Initial commit First (sort-of) working version of a pinhole camera model. Improvement idea: - Stretch result image to the whole size 2023-04-06 18:54:30 +02:00
			`df_v3 df_add_v3(df_v3 a, df_v3 b)`
			`{`
			`df_v3 v = {a.x + b.x, a.y + b.y, a.z + b.z};`
			`return v;`
			`}`

			`df_v3 df_sub_v3(df_v3 a, df_v3 b)`
			`{`
			`df_v3 v = {a.x - b.x, a.y - b.y, a.z - b.z};`
			`return v;`
			`}`

			`float df_dot_v3(df_v3 a, df_v3 b) { return a.x * b.x + a.y * b.y + a.z * b.z; }`

			`df_v3 df_mul_v3(float t, df_v3 v)`
			`{`
			`df_v3 r = {t * v.x, t * v.y, t * v.z};`
			`return r;`
			`}`

			`df_v3 df_normalize_v3(df_v3 v)`
			`{`
			`float len_square = df_dot_v3(v, v);`
			`float len = sqrtf(len_square);`
			`df_v3 n = {v.x / len, v.y / len, v.z / len};`
			`return n;`
			`}`

			`df_line_plane_intersection df_calc_line_plane_intersection(df_line line, df_plane plane)`
			`{`
			`/* check case */`
			`float dot = df_dot_v3(line.direction, plane.normal);`
			`df_v3 base_diff = df_sub_v3(plane.base, line.base);`
			`float dot2 = df_dot_v3(base_diff, plane.normal);`
			`if (dot != 0) {`
			`float t = dot2 / dot;`
			`df_v3 point = df_add_v3(line.base, df_mul_v3(t, line.direction));`
			`df_line_plane_intersection intersection = {`
			`.type = df_line_plane_intersection_vector,`
			`.vec = point,`
			`};`
			`return intersection;`
			`} else {`
			`if (dot2 == 0) {`
			`df_line_plane_intersection intersection = {`
			`.type = df_line_plane_intersection_contained,`
			`};`
			`return intersection;`
			`} else {`
			`df_line_plane_intersection intersection = {`
			`.type = df_line_plane_intersection_none,`
			`};`
			`return intersection;`
			`}`
			`}`
dump state 2023-05-08 13:28:53 +02:00			`}`

			`df_m4 df_mul_m4(df_m4 a, df_m4 b)`
			`{`
			`/* Super simple, we could probably do it a lot better via SIMD. */`
			`df_m4 p;`
			`for (int row = 0; row < 4; ++row) {`
			`for (int col = 0; col < 4; ++col) {`
			`DF_M4_AT(p, row, col) = 0.f;`
			`for (int i = 0; i < 4; ++i) {`
			`DF_M4_AT(p, row, col) += DF_M4_AT(a, row, i) * DF_M4_AT(b, i, col);`
			`}`
			`}`
			`}`
			`return p;`
			`}`

			`df_m4 df_scale(float x, float y, float z)`
			`{`
			`/* clang-format off */`
			`df_m4 s = {{`
			`x, 0.f, 0.f, 0.f,`
			`0.f, y, 0.f, 0.f,`
			`0.f, 0.f, z, 0.f,`
			`0.f, 0.f, 0.f, 1.f,`
			`}};`
			`/* clang-format on */`
			`return s;`
			`}`

			`df_m4 df_translate(float x, float y, float z)`
			`{`
			`/* clang-format off */`
			`df_m4 t = {{`
			`1.f, 0.f, 0.f, x,`
			`0.f, 1.f, 0.f, y,`
			`0.f, 0.f, 1.f, z,`
			`0.f, 0.f, 0.f, 1.f,`
			`}};`
			`/* clang-format on */`
			`return t;`
			`}`

			`/** SSE3 version of a horizontal sum:`
			`* computes the sum of 4 32bit floats inside v`
			`*/`
			`static float hsum(__m128 v)`
			`{`
			`/* v = [v0, v1, v2, v3] */`

			`/* shuf = [v1, v1, v3, v3] */`
			`__m128 shuf = _mm_movehdup_ps(v);`

			`/* sum = [v0 + v1, 2v1, v2 + v3, 2v3] */`
			`__m128 sum = _mm_add_ps(v, shuf);`

			`/* shuf = [v2 + v3, 2v3, v3, v3] /`
			`shuf = _mm_movehl_ps(shuf, sum);`

			`/* sum = [v0 + v1 + v2 + v3, ...] */`
			`sum = _mm_add_ss(sum, shuf);`

			`return _mm_cvtss_f32(sum);`
			`}`

			`df_v3 df_transform_v3(df_m4 T, df_v3 v)`
			`{`
			`df_v3 transf;`
			`_Alignas(16) float tmp_v[4] = {v.x, v.y, v.z, 1.f};`
			`__m128 homov = _mm_load_ps(tmp_v);`

			`__m128 row0 = _mm_load_ps(&T.e[0]);`
			`__m128 row1 = _mm_load_ps(&T.e[4]);`
			`__m128 row2 = _mm_load_ps(&T.e[8]);`

			`__m128 prod = _mm_mul_ps(row0, homov);`
			`transf.x = hsum(prod);`
			`prod = _mm_mul_ps(row1, homov);`
			`transf.y = hsum(prod);`
			`prod = _mm_mul_ps(row2, homov);`
			`transf.z = hsum(prod);`

			`return transf;`
			`}`

			`/* Fast 4x4 matrix inverse via SIMD adapted from:`
			`* https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html`
			`*/`

			`/* Only works if M is a transform matrix without scale:`
			`*`
			`* \| R T \|`
			`* \| 0 1 \|`
			`*/`
			`df_m4 df_inverse_transform_no_scale(df_m4 M)`
			`{`
			`df_m4 I = {0.f};`

			`/* transpose 3x3, we know that m03 = m13 = m23 = 0 */`
			`__m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */`
			`__m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */`
			`I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */`
			`I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */`
			`I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */`

			`/* last */`
			`I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));`
			`I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));`
			`I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));`
			`I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);`

			`return I;`
			`}`

			`df_m4 df_inverse_transform(df_m4 M)`
			`{`
			`#define SMALL_NUMBER (1.e-8f)`

			`df_m4 I = {0.f};`

			`/* transpose 3x3, we know that m03 = m13 = m23 = 0 */`
			`__m128 t0 = DF_VEC_SHUFFLE_0101(M.vec[0], M.vec[1]); /* 00, 01, 10, 11 */`
			`__m128 t1 = DF_VEC_SHUFFLE_2323(M.vec[0], M.vec[1]); /* 02, 03, 12, 13 */`
			`I.vec[0] = DF_VEC_SHUFFLE(t0, M.vec[2], 0, 2, 0, 3); /* 00, 01, 20, 23(=0) */`
			`I.vec[1] = DF_VEC_SHUFFLE(t0, M.vec[2], 1, 3, 1, 3); /* 01, 11, 21, 23(=0) */`
			`I.vec[2] = DF_VEC_SHUFFLE(t1, M.vec[2], 0, 2, 2, 3); /* 02, 12, 22, 23(=0) */`

			`/* divide by the squared scale */`
			`__m128 size_sqr = _mm_mul_ps(I.vec[0], I.vec[0]);`
			`size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[1], I.vec[1]));`
			`size_sqr = _mm_add_ps(size_sqr, _mm_mul_ps(I.vec[2], I.vec[2]));`

			`__m128 r_size_sqr = _mm_div_ps(_mm_set1_ps(1.f), size_sqr);`

			`I.vec[0] = _mm_mul_ps(I.vec[0], r_size_sqr);`
			`I.vec[1] = _mm_mul_ps(I.vec[1], r_size_sqr);`
			`I.vec[2] = _mm_mul_ps(I.vec[2], r_size_sqr);`

			`/* last */`
			`I.vec[3] = _mm_mul_ps(I.vec[0], DF_VEC_SWIZZLE1(M.vec[3], 0));`
			`I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[1], DF_VEC_SWIZZLE1(M.vec[3], 1)));`
			`I.vec[3] = _mm_add_ps(I.vec[3], _mm_mul_ps(I.vec[2], DF_VEC_SWIZZLE1(M.vec[3], 2)));`
			`I.vec[3] = _mm_sub_ps(_mm_setr_ps(0.f, 0.f, 0.f, 1.f), I.vec[3]);`

			`return I;`
			`#undef SMALL_NUMBER`
			`}`