#include #include DF_API void df_evaluate_ray_packet(const df_ray_packet *rays) { const __m128 *base_x = (const __m128 *)rays->base_x; const __m128 *base_y = (const __m128 *)rays->base_y; const __m128 *base_z = (const __m128 *)rays->base_z; const __m128 *dir_x = (const __m128 *)rays->dir_x; const __m128 *dir_y = (const __m128 *)rays->dir_y; const __m128 *dir_z = (const __m128 *)rays->dir_z; /* Simple test: Let rays intersect with plane at z = 350.0 */ float PLANE_Z = 350.0f; __m128 plane_z = _mm_set1_ps(PLANE_Z); size_t ray_count = rays->ray_count; /* TODO(kevin): divide to multiple threads */ for (size_t i = 0; i < ray_count; i += 4) { __m128 rays_base_x = base_x[i]; __m128 rays_base_y = base_y[i]; __m128 rays_base_z = base_z[i]; __m128 rays_dir_x = dir_x[i]; __m128 rays_dir_y = dir_y[i]; __m128 rays_dir_z = dir_z[i]; /* Solve for t: base.z + t * dir.z = plane_z * t = (plane_z - base.z) / dir.z */ __m128 delta = _mm_sub_ps(plane_z, rays_base_z); __m128 t = _mm_div_ps(delta, rays_dir_z); /* Sample p = base.z + t * dir */ __m128 sample_p_x = _mm_mul_ps(t, rays_dir_x); __m128 sample_p_y = _mm_mul_ps(t, rays_dir_y); __m128 sample_p_z = _mm_mul_ps(t, rays_dir_z); sample_p_x = _mm_add_ps(sample_p_x, rays_base_x); sample_p_y = _mm_add_ps(sample_p_y, rays_base_y); sample_p_z = _mm_add_ps(sample_p_z, rays_base_z); } /* Handle remaining (< 4) rays */ if ((ray_count % 4) != 0) { } }