17 #ifndef __UTIL_MATH_FLOAT4_H__
18 #define __UTIL_MATH_FLOAT4_H__
20 #ifndef __UTIL_MATH_H__
21 # error "Do not include this file directly, include util_types.h instead."
30 #ifndef __KERNEL_OPENCL__
74 template<
size_t index_0,
size_t index_1,
size_t index_2,
size_t index_3>
76 template<
size_t index_0,
size_t index_1,
size_t index_2,
size_t index_3>
82 template<>
__forceinline const float4 shuffle<2, 3, 2, 3>(
const float4 &
a,
const float4 &b);
84 # ifdef __KERNEL_SSE3__
90 #ifndef __KERNEL_GPU__
103 #ifdef __KERNEL_SSE__
104 return float4(_mm_setzero_ps());
115 #ifndef __KERNEL_OPENCL__
118 # ifdef __KERNEL_SSE__
119 __m128
mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
120 return float4(_mm_xor_ps(
a.m128,
mask));
128 # ifdef __KERNEL_SSE__
129 return float4(_mm_mul_ps(
a.m128, b.m128));
137 # if defined(__KERNEL_SSE__)
151 return a * (1.0f / f);
156 # ifdef __KERNEL_SSE__
157 return float4(_mm_div_ps(
a.m128, b.m128));
170 # ifdef __KERNEL_SSE__
171 return float4(_mm_add_ps(
a.m128, b.m128));
184 # ifdef __KERNEL_SSE__
185 return float4(_mm_sub_ps(
a.m128, b.m128));
218 # ifdef __KERNEL_SSE__
219 return int4(_mm_castps_si128(_mm_cmplt_ps(
a.m128, b.m128)));
221 return make_int4(
a.x < b.x,
a.y < b.y,
a.z < b.z,
a.w < b.w);
227 # ifdef __KERNEL_SSE__
228 return int4(_mm_castps_si128(_mm_cmpge_ps(
a.m128, b.m128)));
230 return make_int4(
a.x >= b.x,
a.y >= b.y,
a.z >= b.z,
a.w >= b.w);
236 # ifdef __KERNEL_SSE__
237 return int4(_mm_castps_si128(_mm_cmple_ps(
a.m128, b.m128)));
239 return make_int4(
a.x <= b.x,
a.y <= b.y,
a.z <= b.z,
a.w <= b.w);
245 # ifdef __KERNEL_SSE__
246 return (_mm_movemask_ps(_mm_cmpeq_ps(
a.m128, b.m128)) & 15) == 15;
248 return (
a.x == b.x &&
a.y == b.y &&
a.z == b.z &&
a.w == b.w);
259 # if defined(__KERNEL_SSE41__) && defined(__KERNEL_SSE__)
260 # if defined(__KERNEL_NEON__)
261 __m128
t = vmulq_f32(
a, b);
262 return vaddvq_f32(
t);
264 return _mm_cvtss_f32(_mm_dp_ps(
a, b, 0xFF));
267 return (
a.x * b.x +
a.y * b.y) + (
a.z * b.z +
a.w * b.w);
278 # ifdef __KERNEL_SSE__
280 return float4(_mm_div_ps(_mm_set_ps1(1.0f),
a.m128));
288 # ifdef __KERNEL_SSE__
289 return float4(_mm_sqrt_ps(
a.m128));
302 # ifdef __KERNEL_SSE__
303 return (shuffle<1, 2, 0, 0>(
a) * shuffle<2, 0, 1, 0>(b)) -
304 (shuffle<2, 0, 1, 0>(
a) * shuffle<1, 2, 0, 0>(b));
306 return make_float4(
a.y * b.z -
a.z * b.y,
a.z * b.x -
a.x * b.z,
a.x * b.y -
a.y * b.x, 0.0f);
312 # ifdef __KERNEL_SSE__
315 return (
a.x == 0.0f &&
a.y == 0.0f &&
a.z == 0.0f &&
a.w == 0.0f);
321 # if defined(__KERNEL_SSE__)
322 # if defined(__KERNEL_NEON__)
323 return float4(vdupq_n_f32(vaddvq_f32(
a)));
324 # elif defined(__KERNEL_SSE3__)
325 float4 h(_mm_hadd_ps(
a.m128,
a.m128));
326 return float4(_mm_hadd_ps(h.m128, h.m128));
328 float4 h(shuffle<1, 0, 3, 2>(
a) +
a);
329 return shuffle<2, 3, 0, 1>(h) + h;
332 float sum = (
a.x +
a.y) + (
a.z +
a.w);
355 return (
t != 0.0f) ?
a /
t :
a;
360 # ifdef __KERNEL_SSE__
361 return float4(_mm_min_ps(
a.m128, b.m128));
369 # ifdef __KERNEL_SSE__
370 return float4(_mm_max_ps(
a.m128, b.m128));
383 # if defined(__KERNEL_SSE__)
384 # if defined(__KERNEL_NEON__)
385 return float4(vabsq_f32(
a));
387 return float4(_mm_and_ps(
a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
396 # ifdef __KERNEL_SSE__
397 return float4(_mm_floor_ps(
a));
405 return a +
t * (b -
a);
410 #ifdef __KERNEL_SSE__
411 template<
size_t index_0,
size_t index_1,
size_t index_2,
size_t index_3>
414 # if defined(__KERNEL_NEON__)
415 return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(b.m128));
417 return float4(_mm_castsi128_ps(
418 _mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(index_3, index_2, index_1, index_0))));
422 template<
size_t index_0,
size_t index_1,
size_t index_2,
size_t index_3>
425 # if defined(__KERNEL_NEON__)
426 return float4(shuffle_neon<__m128, index_0, index_1, index_2, index_3>(
a.m128, b.m128));
428 return float4(_mm_shuffle_ps(
a.m128, b.m128, _MM_SHUFFLE(index_3, index_2, index_1, index_0)));
434 return float4(_mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(b))));
439 return float4(_mm_movelh_ps(
a.m128, b.m128));
442 template<>
__forceinline const float4 shuffle<2, 3, 2, 3>(
const float4 &
a,
const float4 &b)
444 return float4(_mm_movehl_ps(b.m128,
a.m128));
#  ifdef __KERNEL_SSE3__
/* Duplicate the even lanes of `b` (b.x, b.x, b.z, b.z). */
template<> __forceinline const float4 shuffle<0, 0, 2, 2>(const float4 &b)
{
  return float4(_mm_moveldup_ps(b));
}

/* Duplicate the odd lanes of `b` (b.y, b.y, b.w, b.w). */
template<> __forceinline const float4 shuffle<1, 1, 3, 3>(const float4 &b)
{
  return float4(_mm_movehdup_ps(b));
}
#  endif
460 #ifndef __KERNEL_GPU__
463 # ifdef __KERNEL_SSE__
464 return float4(_mm_blendv_ps(b.m128,
a.m128, _mm_castsi128_ps(
mask.m128)));
479 # if defined(__KERNEL_SSE__)
480 # if defined(__KERNEL_NEON__)
481 return float4(vdupq_n_f32(vminvq_f32(
a)));
483 float4 h =
min(shuffle<1, 0, 3, 2>(
a),
a);
484 return min(shuffle<2, 3, 0, 1>(h), h);
493 # if defined(__KERNEL_SSE__)
494 # if defined(__KERNEL_NEON__)
495 return float4(vdupq_n_f32(vmaxvq_f32(
a)));
497 float4 h =
max(shuffle<1, 0, 3, 2>(
a),
a);
498 return max(shuffle<2, 3, 0, 1>(h), h);
507 # ifdef __KERNEL_SSE__
508 return float4(_mm_loadu_ps(
v));
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET 
_GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble w _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat w _GL_VOID_RET _GL_VOID GLint GLint GLint w _GL_VOID_RET _GL_VOID GLshort GLshort GLshort w _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble y2 _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat y2 _GL_VOID_RET _GL_VOID GLint GLint GLint y2 _GL_VOID_RET _GL_VOID GLshort GLshort GLshort y2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLuint *buffer _GL_VOID_RET _GL_VOID GLdouble t _GL_VOID_RET _GL_VOID GLfloat t _GL_VOID_RET _GL_VOID GLint t _GL_VOID_RET _GL_VOID GLshort t _GL_VOID_RET _GL_VOID GLdouble t
ATTR_WARN_UNUSED_RESULT const BMVert * v
static T sum(const btAlignedObjectArray< T > &items)
static void shuffle(float2 points[], int size, int rng_seed)
#define ccl_device_inline
#define CCL_NAMESPACE_END
#define make_int4(x, y, z, w)
#define make_float4(x, y, z, w)
__forceinline const avxi shuffle< 0, 0, 2, 2 >(const avxi &b)
__forceinline const avxi shuffle< 0, 1, 0, 1 >(const avxi &b)
__forceinline const avxi shuffle< 1, 1, 3, 3 >(const avxi &b)
ccl_device_inline bool isfinite_safe(float f)
ccl_device_inline float len_squared(const float4 &a)
ccl_device_inline float4 reduce_min(const float4 &a)
ccl_device_inline float4 reduce_max(const float4 &a)
ccl_device_inline float4 operator+=(float4 &a, const float4 &b)
ccl_device_inline bool operator==(const float4 &a, const float4 &b)
ccl_device_inline float4 safe_normalize(const float4 &a)
ccl_device_inline float4 normalize(const float4 &a)
ccl_device_inline float4 one_float4()
ccl_device_inline int4 operator<(const float4 &a, const float4 &b)
ccl_device_inline float4 rcp(const float4 &a)
ccl_device_inline bool isfinite4_safe(float4 v)
ccl_device_inline float4 operator+(const float4 &a, const float f)
CCL_NAMESPACE_BEGIN ccl_device_inline float4 operator-(const float4 &a)
ccl_device_inline int4 operator>=(const float4 &a, const float4 &b)
ccl_device_inline int4 operator<=(const float4 &a, const float4 &b)
ccl_device_inline float4 zero_float4()
ccl_device_inline float distance(const float4 &a, const float4 &b)
ccl_device_inline float4 floor(const float4 &a)
ccl_device_inline float len(const float4 &a)
ccl_device_inline float4 operator*(const float4 &a, const float4 &b)
ccl_device_inline float4 sqrt(const float4 &a)
ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
ccl_device_inline float4 sqr(const float4 &a)
ccl_device_inline float4 fabs(const float4 &a)
ccl_device_inline float average(const float4 &a)
ccl_device_inline float4 reduce_add(const float4 &a)
ccl_device_inline float4 mix(const float4 &a, const float4 &b, float t)
ccl_device_inline float4 min(const float4 &a, const float4 &b)
ccl_device_inline float dot(const float4 &a, const float4 &b)
ccl_device_inline bool is_zero(const float4 &a)
ccl_device_inline float4 select(const int4 &mask, const float4 &a, const float4 &b)
ccl_device_inline float4 safe_divide_float4_float(const float4 a, const float b)
ccl_device_inline float4 operator*=(float4 &a, const float4 &b)
ccl_device_inline float4 operator-=(float4 &a, const float4 &b)
ccl_device_inline float4 ensure_finite4(float4 v)
ccl_device_inline float4 cross(const float4 &a, const float4 &b)
ccl_device_inline float4 clamp(const float4 &a, const float4 &mn, const float4 &mx)
ccl_device_inline float4 load_float4(const float *v)
ccl_device_inline float4 max(const float4 &a, const float4 &b)
ccl_device_inline float4 operator/=(float4 &a, float f)
ccl_device_inline float4 operator/(const float4 &a, float f)