#ifndef BT_SIMD__QUATERNION_H_
#define BT_SIMD__QUATERNION_H_
#ifdef BT_USE_DOUBLE_PRECISION
#define btQuaternionData btQuaternionDoubleData
#define btQuaternionDataName "btQuaternionDoubleData"
#else
#define btQuaternionData btQuaternionFloatData
#define btQuaternionDataName "btQuaternionFloatData"
#endif //BT_USE_DOUBLE_PRECISION
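// btQuaternionData resolves to the float or double serialization struct that
// matches the build's precision, so serialization code can refer to one name
// (and one struct-name string) without precision #ifdefs of its own.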
#define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))
#if defined(BT_USE_SSE)

#define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))
#define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))
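// XORing a vector with one of these masks flips exactly the sign bits that
// are set in the mask: vQInv negates x, y and z (the conjugate), vPPPM
// negates only the w lane. Note that _mm_set_ps takes its arguments in
// (w, z, y, x) order.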
#elif defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	mVec128 = rhs.mVec128;
#ifndef BT_EULER_DEFAULT_ZYX
	setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
		cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
		sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
		cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
	setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
		cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
		cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
		cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
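// Both Euler overloads store their components in (x, y, z, w) order via
// setValue(). Hedged reading of the BT_EULER_DEFAULT_ZYX guard above: a
// btQuaternion(yaw, pitch, roll) constructor presumably dispatches to the
// first convention by default and to the ZYX variant when that macro is
// defined.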
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	mVec128 = _mm_add_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
	mVec128 = vaddq_f32(mVec128, q.mVec128);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	mVec128 = _mm_sub_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
	mVec128 = vsubq_f32(mVec128, q.mVec128);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vs = _mm_load_ss(&s);
	vs = bt_pshufd_ps(vs, 0);
	mVec128 = _mm_mul_ps(mVec128, vs);
#elif defined(BT_USE_NEON)
	mVec128 = vmulq_n_f32(mVec128, s);
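	// Both branches broadcast s across all four lanes and multiply: SSE
	// loads s into lane 0 and splats it with a shuffle, while NEON's
	// vmulq_n_f32 performs the broadcast multiply in a single instruction.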
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vQ2 = q.get128();

	__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));
	__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));

	__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1,2,0,1));
	__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));

	B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));

	mVec128 = bt_splat_ps(mVec128, 3);
	mVec128 = mVec128 * vQ2;

	mVec128 = mVec128 - B1;
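	// vPPPM flips only the sign bit of the w lane, so the add below
	// effectively adds the x/y/z parts of A1 and subtracts its w part,
	// matching the sign pattern of the Hamilton product.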
	A1 = _mm_xor_ps(A1, vPPPM);
	mVec128 = mVec128 + A1;
#elif defined(BT_USE_NEON)

	float32x4_t vQ1 = mVec128;
	float32x4_t vQ2 = q.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
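	// Strategy: vtrn/vext build the lane permutations of both operands,
	// vcombine assembles them into the shuffled factors A1..A3 / B1..B3,
	// and the partial products are then summed with a sign flip on w,
	// mirroring the SSE path above.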
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );

	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);

	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
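	// Horizontal add of the four lane products: movehl folds the high pair
	// onto the low pair, the 0x55 shuffle broadcasts element 1, and add_ss
	// leaves x*x' + y*y' + z*z' + w*w' in lane 0.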
	__m128 vd;

	vd = _mm_mul_ps(mVec128, q.mVec128);

	__m128 t = _mm_movehl_ps(vd, vd);
	vd = _mm_add_ps(vd, t);
	t = _mm_shuffle_ps(vd, vd, 0x55);
	vd = _mm_add_ss(vd, t);

	return _mm_cvtss_f32(vd);
#elif defined(BT_USE_NEON)
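	// NEON reduction: vpadd_f32 adds adjacent lane pairs, so two pairwise
	// adds collapse the four products into lane 0.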
	float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
	float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
	x = vpadd_f32(x, x);
	return vget_lane_f32(x, 0);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
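	// Same horizontal-add pattern as dot(): vd ends up holding |q|^2 in
	// lane 0; sqrt and div then form 1/|q| there, which is splatted to
	// all lanes and multiplied in to normalize.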
	__m128 vd;

	vd = _mm_mul_ps(mVec128, mVec128);

	__m128 t = _mm_movehl_ps(vd, vd);
	vd = _mm_add_ps(vd, t);
	t = _mm_shuffle_ps(vd, vd, 0x55);
	vd = _mm_add_ss(vd, t);

	vd = _mm_sqrt_ss(vd);
	vd = _mm_div_ss(vOnes, vd);
	vd = bt_pshufd_ps(vd, 0);
	mVec128 = _mm_mul_ps(mVec128, vd);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vs = _mm_load_ss(&s);
	vs = bt_pshufd_ps(vs, 0x00);
#elif defined(BT_USE_NEON)
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	return btQuaternion(_mm_xor_ps(mVec128, vQInv));
#elif defined(BT_USE_NEON)
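	// XOR/veorq with vQInv flips the sign bits of x, y and z only, i.e.
	// the conjugate, which coincides with the inverse for a unit-length
	// quaternion.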
	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#elif defined(BT_USE_NEON)
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#elif defined(BT_USE_NEON)
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	return btQuaternion(_mm_xor_ps(mVec128, btvMzeroMask));
#elif defined(BT_USE_NEON)
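	// btvMzeroMask has the sign bit set in every lane, so this XOR negates
	// all four components at once.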
	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask));
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vQ1 = q1.get128();
	__m128 vQ2 = q2.get128();
	__m128 A0, A1, B1, A2, B2;
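	// Same shuffle scheme as operator*=: three shuffled partial products
	// plus a w-splat product are accumulated, with vPPPM flipping the sign
	// of the w lane of the A1 + A2 sum.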
	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));

	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));

	A0 = bt_splat_ps(vQ1, 3);

	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)

	float32x4_t vQ1 = q1.get128();
	float32x4_t vQ2 = q2.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );

	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);

	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);
	return btQuaternion(
		q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
		q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
		q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
		q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z());
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vQ1 = q.get128();
	__m128 vQ2 = w.get128();
	__m128 A1, B1, A2, B2, A3, B3;
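	// Quaternion * vector: the vector acts as a pure quaternion (w = 0),
	// so the w-splat product of the general case drops out and only the
	// three shuffled partial products remain.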
	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));

	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));

	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)

	float32x4_t vQ1 = q.get128();
	float32x4_t vQ2 = w.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
	vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);

	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );

	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);
	B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);

	A1 = vaddq_f32(A1, A2);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);

	A1 = vsubq_f32(A1, A3);
	return btQuaternion(
		q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
		q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
		q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z());
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vQ1 = w.get128();
	__m128 vQ2 = q.get128();
	__m128 A1, B1, A2, B2, A3, B3;
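	// Vector * quaternion: the mirrored case with the vector on the left;
	// the w-splat term is again absent because the vector's w is
	// implicitly zero.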
	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));

	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));

	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));

	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)

	float32x4_t vQ1 = w.get128();
	float32x4_t vQ2 = q.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );

	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);

	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);

	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));

	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);

	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);

	A1 = vaddq_f32(A1, A2);

	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);

	A1 = vsubq_f32(A1, A3);
	return btQuaternion(
		+w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
	return q1.slerp(q2, t);
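// Hedged usage sketch: slerp(q1, q2, t) interpolates between the two
// orientations at constant angular velocity, with t in [0, 1] selecting a
// blend from q1 (t = 0) to q2 (t = 1).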
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
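	// Masking with btvFFF0fMask zeroes the w lane, so only the rotated
	// vector part is returned.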
	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
#elif defined(BT_USE_NEON)
	return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
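// The six component-copy loops below appear to belong to the
// serialize/deSerialize helpers for the float and double data structs; each
// copies the four quaternion components one element at a time.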
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
	for (int i = 0; i < 4; i++)
#endif //BT_SIMD__QUATERNION_H_