18 #ifndef __UTIL_SIMD_TYPES_H__
19 #define __UTIL_SIMD_TYPES_H__
32 #if defined(FREE_WINDOWS64)
34 #elif defined(_MSC_VER)
36 #elif (defined(__x86_64__) || defined(__i386__))
37 # include <x86intrin.h>
38 #elif defined(__KERNEL_NEON__)
39 # define SSE2NEON_PRECISE_MINMAX 1
40 # include <sse2neon.h>
44 #if defined(__x86_64__) || defined(_M_X64)
/* Expands to two statements: one turns on flush-to-zero mode
 * (_MM_FLUSH_ZERO_ON), the other turns on denormals-are-zero mode
 * (_MM_DENORMALS_ZERO_ON).
 * NOTE(review): the expansion is two ';'-terminated statements that are not
 * wrapped in do { } while (0), so using it as the sole body of an unbraced
 * if/else would only guard the first statement. */
45 # define SIMD_SET_FLUSH_TO_ZERO \
46 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
47 _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
49 # define SIMD_SET_FLUSH_TO_ZERO
55 #ifdef __KERNEL_SSE2__
57 extern const __m128 _mm_lookupmask_ps[16];
59 static struct TrueTy {
66 static struct FalseTy {
73 static struct ZeroTy {
95 static struct NegInfTy {
98 return -std::numeric_limits<float>::infinity();
106 static struct PosInfTy {
109 return std::numeric_limits<float>::infinity();
117 static struct StepTy {
123 #if defined(__KERNEL_NEON__)
/* Single-operand NEON shuffle: the four template indices i0..i3 select which
 * 32-bit lane of `a` is used for each output lane. */
124 template<
class type,
int i0,
int i1,
int i2,
int i3>
type shuffle_neon(
const type &
a)
/* All four indices equal: a plain broadcast of lane i0 is sufficient. */
126 if (i0 ==
i1 && i0 == i2 && i0 == i3) {
127 return vdupq_laneq_s32(
a, i0);
/* General case: byte-index table for a table lookup. The first entry is the
 * first byte of lane i0 (each 32-bit lane spans 4 bytes).
 * NOTE(review): presumably the remaining entries follow the same
 * (lane * 4) + byte pattern for i0..i3 -- verify against the full initializer. */
129 static const uint8_t tbl[16] = {(i0 * 4) + 0,
/* Reinterpret `a` as 16 bytes and permute them through the table. */
146 return vqtbl1q_s8(int8x16_t(
a), *(int8x16_t *)tbl);
149 template<
class type,
int i0,
int i1,
int i2,
int i3>
153 static const uint8_t tbl[16] = {(i0 * 4) + 0,
170 return vqtbl1q_s8(int8x16_t(b), *(int8x16_t *)tbl);
174 static const uint8_t tbl[16] = {(i0 * 4) + 0,
191 return vqtbl2q_s8((int8x16x2_t){
a, b}, *(int8x16_t *)tbl);
200 #if defined(__BMI__) && defined(__GNUC__)
202 # define _tzcnt_u32 __tzcnt_u32
205 # define _tzcnt_u64 __tzcnt_u64
209 #if defined(__LZCNT__)
210 # define _lzcnt_u32 __lzcnt32
211 # define _lzcnt_u64 __lzcnt64
214 #if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)
218 # if defined(__KERNEL_AVX2__)
219 return _tzcnt_u32(
v);
222 _BitScanForward(&
r,
v);
230 _BitScanReverse(&
r,
v);
237 _bittestandcomplement(&
r, i);
243 # if defined(__KERNEL_AVX2__)
244 return _tzcnt_u32(
v);
250 # if defined(__KERNEL_64_BIT__)
254 # if defined(__KERNEL_AVX2__)
255 return _tzcnt_u64(
v);
258 _BitScanForward64(&
r,
v);
266 _BitScanReverse64(&
r,
v);
273 _bittestandcomplement64((__int64 *)&
r, i);
279 # if defined(__KERNEL_AVX2__)
280 # if defined(__KERNEL_64_BIT__)
281 return _tzcnt_u64(
v);
283 return _tzcnt_u32(
v);
292 #elif (defined(__x86_64__) || defined(__i386__)) && defined(__KERNEL_SSE2__)
298 asm(
"bsf %1,%0" :
"=r"(
r) :
"r"(
v));
305 asm(
"bsr %1,%0" :
"=r"(
r) :
"r"(
v));
312 asm(
"btc %1,%0" :
"=r"(
r) :
"r"(i),
"0"(
v) :
"flags");
316 # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
317 !(defined(__ILP32__) && defined(__x86_64__))
321 asm(
"bsf %1,%0" :
"=r"(
r) :
"r"(
v));
329 asm(
"bsr %1,%0" :
"=r"(
r) :
"r"(
v));
336 asm(
"btc %1,%0" :
"=r"(
r) :
"r"(i),
"0"(
v) :
"flags");
342 # if defined(__KERNEL_AVX2__)
343 return _tzcnt_u32(
v);
349 # if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
350 !(defined(__ILP32__) && defined(__x86_64__))
353 # if defined(__KERNEL_AVX2__)
354 # if defined(__KERNEL_64_BIT__)
355 return _tzcnt_u64(
v);
357 return _tzcnt_u32(
v);
369 for (
int i = 0; i < 32; i++) {
378 for (
int i = 0; i < 32; i++) {
379 if (
x & (1U << (31 - i)))
393 for (
int i = 0; i < 64; i++) {
/* Portable fallback: walk the 64 bit positions from the most significant bit
 * (i == 0 tests bit 63) towards the least significant one. */
402 for (
int i = 0; i < 64; i++) {
/* The probe constant must be 64 bits wide: `unsigned long` is only 32 bits on
 * LLP64 and ILP32 targets, where shifting it by up to 63 is undefined. */
403 if (
x & (1ULL << (63 - i)))
/* Advance `bit` until the bit at that position in `value` is set. Shifting
 * `value` down instead of shifting the int literal 1 up keeps the test in the
 * type of `value`, avoiding signed overflow of `1 << 31` and wrong results for
 * operands wider than int. */
419 while (((value >> bit) & 1) == 0) {
/* Advance `bit` until the bit at that position in `value` is set. Shifting
 * `value` down instead of shifting the int literal 1 up keeps the test in the
 * type of `value`; with a 64-bit operand, `1 << bit` would be an int shift that
 * is undefined for bit >= 32 and sign-extends at bit 31. */
429 while (((value >> bit) & 1) == 0) {
441 #ifdef __KERNEL_SSE2__
446 # if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))
/* Rounding-mode selectors accepted by the emulated _mm_round_ps below.
 * NOTE(review): these appear to be fallback definitions for builds without
 * SSE4.1 headers, using the same numeric values as the SSE4.1 constants --
 * confirm against the enclosing #if. */
450 # define _MM_FROUND_TO_NEAREST_INT 0x00
451 # define _MM_FROUND_TO_NEG_INF 0x01
452 # define _MM_FROUND_TO_POS_INF 0x02
453 # define _MM_FROUND_TO_ZERO 0x03
454 # define _MM_FROUND_CUR_DIRECTION 0x04
456 # undef _mm_blendv_ps
457 # define _mm_blendv_ps _mm_blendv_ps_emu
/* Per-lane select driven by the sign bit of `mask`: lanes whose mask sign bit
 * is set take `input`, all other lanes keep `value`. */
/* 0x80000000 in every lane, as integer and as float bit pattern. */
460 __m128i isignmask = _mm_set1_epi32(0x80000000);
461 __m128 signmask = _mm_castsi128_ps(isignmask);
/* Keep only the sign bit of each mask lane. */
462 __m128i iandsign = _mm_castps_si128(_mm_and_ps(
mask, signmask));
/* Widen "sign bit set" to an all-ones lane, "clear" to an all-zeros lane. */
463 __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
464 __m128 cmpmask = _mm_castsi128_ps(icmpmask);
/* (cmpmask & input) | (~cmpmask & value) */
465 return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
469 # define _mm_blend_ps _mm_blend_ps_emu
/* Immediate-mask blend: `mask` is used as an index into _mm_lookupmask_ps to
 * obtain a per-lane mask, which is then handed to _mm_blendv_ps together with
 * `value` and `input`.
 * NOTE(review): `mask` indexes the table directly; callers must keep it within
 * the table's bounds -- confirm whether a range check exists on the elided
 * lines. */
470 __forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input,
const int mask)
473 return _mm_blendv_ps(value, input, _mm_lookupmask_ps[
mask]);
476 # undef _mm_blendv_epi8
477 # define _mm_blendv_epi8 _mm_blendv_epi8_emu
/* Bitwise select: every bit set in `mask` takes the corresponding bit of
 * `input`, every clear bit keeps the bit of `value`.
 * NOTE(review): this is a pure bit-level select, whereas the SSE4.1
 * instruction selects whole bytes by each mask byte's top bit; the two agree
 * only when every mask byte is 0x00 or 0xFF (e.g. comparison results). */
478 __forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i
mask)
/* (mask & input) | (~mask & value) */
480 return _mm_or_si128(_mm_and_si128(
mask, input), _mm_andnot_si128(
mask, value));
483 # undef _mm_min_epi32
484 # define _mm_min_epi32 _mm_min_epi32_emu
/* Per-lane signed 32-bit minimum of `value` and `input`, built from a
 * less-than compare and a blend. */
485 __forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
/* Lanes where value < input select `value`; the remaining lanes keep `input`. */
487 return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
490 # undef _mm_max_epi32
491 # define _mm_max_epi32 _mm_max_epi32_emu
/* Per-lane signed 32-bit maximum of `value` and `input`, built from a
 * less-than compare and a blend. */
492 __forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
/* Lanes where value < input select `input`; the remaining lanes keep `value`. */
494 return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
497 # ifndef __KERNEL_NEON__
498 # undef _mm_extract_epi32
499 # define _mm_extract_epi32 _mm_extract_epi32_emu
/* Returns one 32-bit lane of `input` as an int.
 * NOTE(review): the dispatch on `index` is on elided lines; presumably the
 * four returns below correspond to index 0, 1, 2 and 3 in that order --
 * verify, including what happens for any other index. */
500 __forceinline int _mm_extract_epi32_emu(__m128i input,
const int index)
/* Lane 0 is the low element and can be read directly. */
504 return _mm_cvtsi128_si32(input);
/* Lanes 1..3: broadcast the wanted lane to all positions, then read the low
 * element. */
506 return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
508 return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
510 return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
518 # undef _mm_insert_epi32
519 # define _mm_insert_epi32 _mm_insert_epi32_emu
/* Overwrites 32-bit lane `index` of the by-value copy `value` with `input`.
 * NOTE(review): the return statement is on an elided line; presumably the
 * modified `value` is returned -- confirm. */
520 __forceinline __m128i _mm_insert_epi32_emu(__m128i value,
int input,
const int index)
/* Only lanes 0..3 exist; checked in builds where assert is active. */
522 assert(index >= 0 && index < 4);
/* Writes the lane through an int pointer aliasing the vector's storage. */
523 ((
int *)&value)[index] = input;
527 # undef _mm_insert_ps
528 # define _mm_insert_ps _mm_insert_ps_emu
/* Copies one float lane of `input` into one lane of `value`, then clears the
 * lanes selected by the low four bits of `index`.
 * Layout of `index` as used below:
 *   index >> 6         source lane in `input`
 *   (index >> 4) & 0x3 destination lane in `value`
 *   index & 0xf        index into _mm_lookupmask_ps for the lanes to clear */
529 __forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input,
const int index)
/* Keeps the source lane (index >> 6) at 3 or below for non-negative index.
 * NOTE(review): negative values are not rejected by this check. */
531 assert(index < 0x100);
/* Lane copy through float pointers aliasing the vectors' storage. */
532 ((
float *)&value)[(index >> 4) & 0x3] = ((
float *)&input)[index >> 6];
/* and-not with the looked-up mask clears the lanes the mask marks. */
533 return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
537 # define _mm_round_ps _mm_round_ps_emu
/* Rounds each float lane of `value` according to `flags` by converting to
 * 32-bit integers and back to float.
 * NOTE(review): because every path goes through a 32-bit integer conversion,
 * lanes outside the int32 range cannot round-trip. The handling of any other
 * `flags` value is on elided lines. */
538 __forceinline __m128 _mm_round_ps_emu(__m128 value,
const int flags)
/* Float -> int conversion that rounds according to the current rounding mode. */
541 case _MM_FROUND_TO_NEAREST_INT:
542 return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
/* Approximates floor by biasing with -0.5 before the rounding conversion.
 * NOTE(review): with round-to-nearest-even active, an exact odd integer such
 * as 1.0 becomes 0.5 and converts to 0 rather than 1 -- confirm callers
 * tolerate this. */
543 case _MM_FROUND_TO_NEG_INF:
544 return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
/* Approximates ceil by biasing with +0.5; the same caveat applies in mirror
 * image (1.0 becomes 1.5 and converts to 2 under round-to-nearest-even). */
545 case _MM_FROUND_TO_POS_INF:
546 return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
/* Truncating float -> int conversion. */
547 case _MM_FROUND_TO_ZERO:
548 return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
557 # if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
558 # undef _mm256_cvtss_f32
/* Extracts the lowest float of a 256-bit vector: reinterpret `a` as its low
 * 128-bit half, then read that half's first element. */
559 # define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
565 #if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
566 defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
typedef float(TangentPoint)[2]
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET 
_GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble w _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat w _GL_VOID_RET _GL_VOID GLint GLint GLint w _GL_VOID_RET _GL_VOID GLshort GLshort GLshort w _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble y2 _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat y2 _GL_VOID_RET _GL_VOID GLint GLint GLint y2 _GL_VOID_RET _GL_VOID GLshort GLshort GLshort y2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLuint *buffer _GL_VOID_RET _GL_VOID GLdouble t _GL_VOID_RET _GL_VOID GLfloat t _GL_VOID_RET _GL_VOID GLint t _GL_VOID_RET _GL_VOID GLshort t _GL_VOID_RET _GL_VOID GLdouble GLdouble r _GL_VOID_RET _GL_VOID GLfloat GLfloat r _GL_VOID_RET _GL_VOID GLint GLint r _GL_VOID_RET _GL_VOID 
GLshort GLshort r _GL_VOID_RET _GL_VOID GLdouble GLdouble r
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint i1
ATTR_WARN_UNUSED_RESULT const BMVert * v
#define CCL_NAMESPACE_END
unsigned __int64 uint64_t
ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
CCL_NAMESPACE_BEGIN __forceinline uint32_t __bsf(const uint32_t x)
__forceinline uint32_t __bsr(const uint32_t x)
__forceinline uint32_t __btc(const uint32_t x, const uint32_t bit)
__forceinline uint32_t bitscan(uint32_t value)