25 #ifdef BT_USE_DOUBLE_PRECISION
26 #define btVector3Data btVector3DoubleData
27 #define btVector3DataName "btVector3DoubleData"
29 #define btVector3Data btVector3FloatData
30 #define btVector3DataName "btVector3FloatData"
31 #endif //BT_USE_DOUBLE_PRECISION
33 #if defined BT_USE_SSE
38 #pragma warning(disable: 4556) // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
// Combine four lane selectors into one shuffle immediate: x is used unshifted,
// y is shifted left by 2, z by 4 and w by 6, then all four are OR-ed together.
// NOTE(review): assumes each selector is in 0..3; a larger value spills into
// the neighbouring selector's bits -- confirm at call sites.
42 #define BT_SHUFFLE(x,y,z,w) ((w)<<6 | (z)<<4 | (y)<<2 | (x))
// Shuffle within a single register: _a is passed as both source operands of
// _mm_shuffle_ps together with the immediate _mask.
44 #define bt_pshufd_ps( _a, _mask ) _mm_shuffle_ps((_a), (_a), (_mask) )
// Shuffle _a with selector _i in the x, y and z positions and selector 3 in the w position.
45 #define bt_splat3_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i, 3) )
// Shuffle _a with selector _i in all four positions.
46 #define bt_splat_ps( _a, _i ) bt_pshufd_ps((_a), BT_SHUFFLE(_i,_i,_i,_i) )
// Integer bit masks built with _mm_set_epi32; per the intrinsic's documentation
// its arguments are listed from the highest 32-bit element down to the lowest.
// 0x7FFFFFFF in the three low elements, 0 in the highest element.
48 #define btv3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// 0x7FFFFFFF in all four elements.
49 #define btvAbsMask (_mm_set_epi32( 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// All bits set in the three low elements, 0 in the highest element.
50 #define btvFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
// The masks above passed through btCastiTo128f, which is defined outside this
// view (presumably a bit-preserving cast to a float register -- TODO confirm).
51 #define btv3AbsfMask btCastiTo128f(btv3AbsiMask)
52 #define btvFFF0fMask btCastiTo128f(btvFFF0Mask)
// Alias: btvxyzMaskf expands to btvFFF0fMask.
53 #define btvxyzMaskf btvFFF0fMask
54 #define btvAbsfMask btCastiTo128f(btvAbsMask)
// Float constants built with _mm_set_ps; per the intrinsic's documentation its
// arguments are listed from the highest lane down to the lowest.
// -0.0f in every lane.
57 #define btvMzeroMask (_mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f))
// 1.0f in the three low lanes, 0.0f in the highest lane.
58 #define v1110 (_mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f))
// 0.5f in every lane.
59 #define vHalf (_mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f))
// 1.5f in every lane.
60 #define v1_5 (_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f))
// float32x4_t constant initialised with -0.0f in each of its four elements.
// ATTRIBUTE_ALIGNED16 is defined outside this view (presumably it declares the
// named variable with 16-byte alignment -- TODO confirm).
71 const float32x4_t
ATTRIBUTE_ALIGNED16(btvMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
73 static_cast<int32_t>(0xFFFFFFFF),
static_cast<int32_t>(0xFFFFFFFF), 0x0};
// int32x4_t constant with 0x7FFFFFFF in all four elements.
74 const int32x4_t
ATTRIBUTE_ALIGNED16(btvAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
// int32x4_t constant with 0x7FFFFFFF in the first three elements and 0 in the last.
75 const int32x4_t
ATTRIBUTE_ALIGNED16(btv3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};
89 #if defined (__SPU__) && defined (__CELLOS_LV2__)
94 return *((
const vec_float4*)&m_floats[0]);
97 #else //__CELLOS_LV2__ __SPU__
98 #if defined (BT_USE_SSE) || defined(BT_USE_NEON) // _WIN32 || ARM
100 btSimdFloat4 mVec128;
114 #endif //__CELLOS_LV2__ __SPU__
139 #if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE) )|| defined (BT_USE_NEON)
149 mVec128 = rhs.mVec128;
160 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
166 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
167 mVec128 = _mm_add_ps(mVec128, v.mVec128);
168 #elif defined(BT_USE_NEON)
169 mVec128 = vaddq_f32(mVec128, v.mVec128);
183 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
184 mVec128 = _mm_sub_ps(mVec128, v.mVec128);
185 #elif defined(BT_USE_NEON)
186 mVec128 = vsubq_f32(mVec128, v.mVec128);
199 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
200 __m128 vs = _mm_load_ss(&s);
201 vs = bt_pshufd_ps(vs, 0x80);
202 mVec128 = _mm_mul_ps(mVec128, vs);
203 #elif defined(BT_USE_NEON)
204 mVec128 = vmulq_n_f32(mVec128, s);
219 #if 0 //defined(BT_USE_SSE_IN_API)
221 __m128 vs = _mm_load_ss(&s);
222 vs = _mm_div_ss(v1110, vs);
223 vs = bt_pshufd_ps(vs, 0x00);
225 mVec128 = _mm_mul_ps(mVec128, vs);
237 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
238 __m128 vd = _mm_mul_ps(mVec128, v.mVec128);
239 __m128 z = _mm_movehl_ps(vd, vd);
240 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
241 vd = _mm_add_ss(vd, y);
242 vd = _mm_add_ss(vd, z);
243 return _mm_cvtss_f32(vd);
244 #elif defined(BT_USE_NEON)
245 float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
246 float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
247 x = vadd_f32(x, vget_high_f32(vd));
248 return vget_lane_f32(x, 0);
250 return m_floats[0] * v.
m_floats[0] +
314 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
316 __m128 vd = _mm_mul_ps(mVec128, mVec128);
317 __m128 z = _mm_movehl_ps(vd, vd);
318 __m128 y = _mm_shuffle_ps(vd, vd, 0x55);
319 vd = _mm_add_ss(vd, y);
320 vd = _mm_add_ss(vd, z);
323 vd = _mm_sqrt_ss(vd);
324 vd = _mm_div_ss(v1110, vd);
325 vd = bt_splat_ps(vd, 0x80);
326 mVec128 = _mm_mul_ps(mVec128, vd);
330 y = _mm_rsqrt_ss(vd);
334 vd = _mm_mul_ss(vd, vHalf);
336 vd = _mm_mul_ss(vd, y);
337 vd = _mm_mul_ss(vd, y);
338 z = _mm_sub_ss(z, vd);
340 y = _mm_mul_ss(y, z);
342 y = bt_splat_ps(y, 0x80);
343 mVec128 = _mm_mul_ps(mVec128, y);
375 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
376 return btVector3(_mm_and_ps(mVec128, btv3AbsfMask));
377 #elif defined(BT_USE_NEON)
391 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
394 T = bt_pshufd_ps(mVec128, BT_SHUFFLE(1, 2, 0, 3));
395 V = bt_pshufd_ps(v.mVec128, BT_SHUFFLE(1, 2, 0, 3));
397 V = _mm_mul_ps(V, mVec128);
398 T = _mm_mul_ps(T, v.mVec128);
399 V = _mm_sub_ps(V, T);
401 V = bt_pshufd_ps(V, BT_SHUFFLE(1, 2, 0, 3));
403 #elif defined(BT_USE_NEON)
406 float32x2_t Tlow = vget_low_f32(mVec128);
407 float32x2_t Vlow = vget_low_f32(v.mVec128);
408 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
409 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
411 V = vmulq_f32(V, mVec128);
412 T = vmulq_f32(T, v.mVec128);
414 Vlow = vget_low_f32(V);
416 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
417 V = (float32x4_t)vandq_s32((int32x4_t)V, btvFFF0Mask);
430 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
432 __m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, BT_SHUFFLE(1, 2, 0, 3));
433 __m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, BT_SHUFFLE(1, 2, 0, 3));
435 V = _mm_mul_ps(V, v1.mVec128);
436 T = _mm_mul_ps(T, v2.mVec128);
437 V = _mm_sub_ps(V, T);
439 V = _mm_shuffle_ps(V, V, BT_SHUFFLE(1, 2, 0, 3));
442 V = _mm_mul_ps(V, mVec128);
443 __m128 z = _mm_movehl_ps(V, V);
444 __m128 y = _mm_shuffle_ps(V, V, 0x55);
445 V = _mm_add_ss(V, y);
446 V = _mm_add_ss(V, z);
447 return _mm_cvtss_f32(V);
449 #elif defined(BT_USE_NEON)
453 float32x2_t Tlow = vget_low_f32(v1.mVec128);
454 float32x2_t Vlow = vget_low_f32(v2.mVec128);
455 T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
456 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
458 V = vmulq_f32(V, v1.mVec128);
459 T = vmulq_f32(T, v2.mVec128);
461 Vlow = vget_low_f32(V);
463 V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
466 V = vmulq_f32(mVec128, V);
467 float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
468 x = vadd_f32(x, vget_high_f32(V));
469 return vget_lane_f32(x, 0);
482 return m_floats[0] < m_floats[1] ? (m_floats[0] <m_floats[2] ? 0 : 2) : (m_floats[1] <m_floats[2] ? 1 : 2);
489 return m_floats[0] < m_floats[1] ? (m_floats[1] <m_floats[2] ? 2 : 1) : (m_floats[0] <m_floats[2] ? 2 : 0);
494 return absolute().minAxis();
499 return absolute().maxAxis();
505 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
506 __m128 vrt = _mm_load_ss(&rt);
508 __m128 vs = _mm_load_ss(&s);
509 vs = bt_pshufd_ps(vs, 0x80);
510 __m128 r0 = _mm_mul_ps(v0.mVec128, vs);
511 vrt = bt_pshufd_ps(vrt, 0x80);
512 __m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
513 __m128 tmp3 = _mm_add_ps(r0,r1);
515 #elif defined(BT_USE_NEON)
516 float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
517 vl = vmulq_n_f32(vl, rt);
518 mVec128 = vaddq_f32(vl, v0.mVec128);
534 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
535 __m128 vt = _mm_load_ss(&t);
536 vt = bt_pshufd_ps(vt, 0x80);
537 __m128 vl = _mm_sub_ps(v.mVec128, mVec128);
538 vl = _mm_mul_ps(vl, vt);
539 vl = _mm_add_ps(vl, mVec128);
542 #elif defined(BT_USE_NEON)
543 float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
544 vl = vmulq_n_f32(vl, t);
545 vl = vaddq_f32(vl, mVec128);
551 m_floats[1] + (v.
m_floats[1] - m_floats[1]) * t,
552 m_floats[2] + (v.
m_floats[2] - m_floats[2]) * t);
560 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
561 mVec128 = _mm_mul_ps(mVec128, v.mVec128);
562 #elif defined(BT_USE_NEON)
563 mVec128 = vmulq_f32(mVec128, v.mVec128);
603 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
604 return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
606 return ((m_floats[3]==other.
m_floats[3]) &&
615 return !(*
this == other);
623 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
624 mVec128 = _mm_max_ps(mVec128, other.mVec128);
625 #elif defined(BT_USE_NEON)
626 mVec128 = vmaxq_f32(mVec128, other.mVec128);
640 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
641 mVec128 = _mm_min_ps(mVec128, other.mVec128);
642 #elif defined(BT_USE_NEON)
643 mVec128 = vminq_f32(mVec128, other.mVec128);
662 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
664 __m128 V = _mm_and_ps(mVec128, btvFFF0fMask);
665 __m128 V0 = _mm_xor_ps(btvMzeroMask, V);
666 __m128 V2 = _mm_movelh_ps(V0, V);
668 __m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
670 V0 = _mm_shuffle_ps(V0, V, 0xDB);
671 V2 = _mm_shuffle_ps(V2, V, 0xF9);
685 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
686 mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
687 #elif defined(BT_USE_NEON)
688 int32x4_t vi = vdupq_n_s32(0);
689 mVec128 = vreinterpretq_f32_s32(vi);
733 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
735 __m128 a0 = _mm_mul_ps( v0.mVec128, this->mVec128 );
736 __m128 a1 = _mm_mul_ps( v1.mVec128, this->mVec128 );
737 __m128 a2 = _mm_mul_ps( v2.mVec128, this->mVec128 );
738 __m128 b0 = _mm_unpacklo_ps( a0, a1 );
739 __m128 b1 = _mm_unpackhi_ps( a0, a1 );
740 __m128 b2 = _mm_unpacklo_ps( a2, _mm_setzero_ps() );
741 __m128 r = _mm_movelh_ps( b0, b2 );
742 r = _mm_add_ps( r, _mm_movehl_ps( b2, b0 ));
743 a2 = _mm_and_ps( a2, btvxyzMaskf);
744 r = _mm_add_ps( r, btCastdTo128f (_mm_move_sd( btCastfTo128d(a2), btCastfTo128d(b1) )));
747 #elif defined(BT_USE_NEON)
748 static const uint32x4_t xyzMask = (
const uint32x4_t){
static_cast<uint32_t>(-1),
static_cast<uint32_t>(-1),
static_cast<uint32_t>(-1), 0 };
749 float32x4_t a0 = vmulq_f32( v0.mVec128, this->mVec128);
750 float32x4_t a1 = vmulq_f32( v1.mVec128, this->mVec128);
751 float32x4_t a2 = vmulq_f32( v2.mVec128, this->mVec128);
752 float32x2x2_t zLo = vtrn_f32( vget_high_f32(a0), vget_high_f32(a1));
753 a2 = (float32x4_t) vandq_u32((uint32x4_t) a2, xyzMask );
754 float32x2_t b0 = vadd_f32( vpadd_f32( vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0] );
755 float32x2_t b1 = vpadd_f32( vpadd_f32( vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
756 return btVector3( vcombine_f32(b0, b1) );
767 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
768 return btVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
769 #elif defined(BT_USE_NEON)
770 return btVector3(vaddq_f32(v1.mVec128, v2.mVec128));
783 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
784 return btVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
785 #elif defined(BT_USE_NEON)
786 return btVector3(vmulq_f32(v1.mVec128, v2.mVec128));
799 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
802 __m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
803 return btVector3(_mm_and_ps(r, btvFFF0fMask));
804 #elif defined(BT_USE_NEON)
805 float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
806 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
819 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
820 __m128 r = _mm_xor_ps(v.mVec128, btvMzeroMask);
821 return btVector3(_mm_and_ps(r, btvFFF0fMask));
822 #elif defined(BT_USE_NEON)
823 return btVector3((btSimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)btvMzeroMask));
833 #if defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
834 __m128 vs = _mm_load_ss(&s);
835 vs = bt_pshufd_ps(vs, 0x80);
836 return btVector3(_mm_mul_ps(v.mVec128, vs));
837 #elif defined(BT_USE_NEON)
838 float32x4_t r = vmulq_n_f32(v.mVec128, s);
839 return btVector3((float32x4_t)vandq_s32((int32x4_t)r, btvFFF0Mask));
857 #if 0 //defined(BT_USE_SSE_IN_API)
859 __m128 vs = _mm_load_ss(&s);
860 vs = _mm_div_ss(v1110, vs);
861 vs = bt_pshufd_ps(vs, 0x00);
863 return btVector3(_mm_mul_ps(v.mVec128, vs));
873 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE))
874 __m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
875 vec = _mm_and_ps(vec, btvFFF0fMask);
877 #elif defined(BT_USE_NEON)
878 float32x4_t x, y, v, m;
884 m = vrecpsq_f32(y, v);
886 m = vrecpsq_f32(y, v);
949 return v1.
lerp(v2, t);
956 return (v - *
this).length2();
961 return (v - *
this).length();
975 #if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
977 __m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
979 __m128 C = wAxis.
cross( mVec128 ).mVec128;
980 O = _mm_and_ps(O, btvFFF0fMask);
983 __m128 vsin = _mm_load_ss(&ssin);
984 __m128 vcos = _mm_load_ss(&scos);
986 __m128 Y = bt_pshufd_ps(O, 0xC9);
987 __m128 Z = bt_pshufd_ps(O, 0xD2);
988 O = _mm_add_ps(O, Y);
989 vsin = bt_pshufd_ps(vsin, 0x80);
990 O = _mm_add_ps(O, Z);
991 vcos = bt_pshufd_ps(vcos, 0x80);
994 O = O * wAxis.mVec128;
995 __m128 X = mVec128 - O;
1007 _y = wAxis.
cross( *
this );
1009 return ( o + _x *
btCos( _angle ) + _y *
btSin( _angle ) );
1015 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1016 #if defined _WIN32 || defined (BT_USE_SSE)
1017 const long scalar_cutoff = 10;
1018 long _maxdot_large(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1019 #elif defined BT_USE_NEON
1020 const long scalar_cutoff = 4;
1021 extern long (*_maxdot_large)(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1023 if( array_count < scalar_cutoff )
1029 for( i = 0; i < array_count; i++ )
1043 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1044 return _maxdot_large( (
float*) array, (
float*) &
m_floats[0], array_count, &dotOut );
1050 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1051 #if defined BT_USE_SSE
1052 const long scalar_cutoff = 10;
1053 long _mindot_large(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1054 #elif defined BT_USE_NEON
1055 const long scalar_cutoff = 4;
1056 extern long (*_mindot_large)(
const float *array,
const float *vec,
unsigned long array_count,
float *dotOut );
1058 #error unhandled arch!
1061 if( array_count < scalar_cutoff )
1068 for( i = 0; i < array_count; i++ )
1083 #if (defined BT_USE_SSE && defined BT_USE_SIMD_VECTOR3 && defined BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1084 return _mindot_large( (
float*) array, (
float*) &
m_floats[0], array_count, &dotOut );
1085 #endif//BT_USE_SIMD_VECTOR3
1102 #if (defined (BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined (BT_USE_NEON)
1110 mVec128 = rhs.mVec128;
1116 mVec128 = v.mVec128;
1119 #endif // #if defined (BT_USE_SSE_IN_API) || defined (BT_USE_NEON)
1123 #if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
1124 return btVector4(_mm_and_ps(mVec128, btvAbsfMask));
1125 #elif defined(BT_USE_NEON)
1240 #ifdef BT_USE_DOUBLE_PRECISION
1241 unsigned char* dest = (
unsigned char*) &destVal;
1242 unsigned char* src = (
unsigned char*) &sourceVal;
1252 unsigned char* dest = (
unsigned char*) &destVal;
1253 unsigned char* src = (
unsigned char*) &sourceVal;
1258 #endif //BT_USE_DOUBLE_PRECISION
1263 for (
int i=0;i<4;i++)
1275 for (
int i=0;i<4;i++)
1279 vector = swappedVec;
1287 btScalar a = n[1]*n[1] + n[2]*n[2];
1299 btScalar a = n[0]*n[0] + n[1]*n[1];
1326 for (
int i=0;i<4;i++)
1332 for (
int i=0;i<4;i++)
1340 for (
int i=0;i<4;i++)
1346 for (
int i=0;i<4;i++)
1354 for (
int i=0;i<4;i++)
1360 for (
int i=0;i<4;i++)
1364 #endif //BT_VECTOR3_H