diff --git a/simgear/math/SGVec2.hxx b/simgear/math/SGVec2.hxx index e4fe25af..9f735332 100644 --- a/simgear/math/SGVec2.hxx +++ b/simgear/math/SGVec2.hxx @@ -223,7 +223,7 @@ template inline T dot(SGVec2 v1, const SGVec2& v2) -{ v1.simd2() *= v2.simd2(); return (v1(0)+v1(1)+v1(2)); } +{ v1.simd2() *= v2.simd2(); return (v1(0)+v1(1)); } /// The euclidean norm of the vector, that is what most people call length template diff --git a/simgear/math/SGVec3.hxx b/simgear/math/SGVec3.hxx index 2d25a216..84524ea1 100644 --- a/simgear/math/SGVec3.hxx +++ b/simgear/math/SGVec3.hxx @@ -20,7 +20,7 @@ #include -#include +#include "SGVec2.hxx" #include #include "simd.hxx" diff --git a/simgear/math/simd.hxx b/simgear/math/simd.hxx index 021f65e1..8498b397 100644 --- a/simgear/math/simd.hxx +++ b/simgear/math/simd.hxx @@ -26,7 +26,7 @@ class simd4_t { private: union { - T v4[4]; + T _v4[4]; T vec[N]; }; @@ -214,6 +214,36 @@ public: } }; +template +inline simd4_t operator+(simd4_t v1, const simd4_t& v2) { + v1 += v2; + return v1; +} + +template +inline simd4_t operator-(simd4_t v1, const simd4_t& v2) { + v1 -= v2; + return v1; +} + +template +inline simd4_t operator*(simd4_t v1, const simd4_t& v2) { + v1 *= v2; + return v1; +} + +template +inline simd4_t operator*(T f, simd4_t v) { + v *= f; + return v; +} + +template +inline simd4_t operator*(simd4_t v, T f) { + v *= f; + return v; +} + namespace simd4 { template @@ -254,11 +284,9 @@ public: simd4_t(const simd4_t& v) { simd4 = v.simd4; } -#if 0 - simd4_t(const simd4_t& v) { - simd4 = _mm_movelh_ps(_mm_cvtpd_ps(v.v4()[0]), _mm_cvtpd_ps(v.v4()[1])); + simd4_t(const __m128& v) { + simd4 = v; } -#endif inline __m128 (&v4(void)) { return simd4; @@ -350,7 +378,9 @@ inline simd4_tabs(simd4_t v) { # endif -# ifdef __SSE2 +# ifdef __SSE2__ +# include + template class simd4_t { @@ -375,6 +405,10 @@ public: simd4[0] = v.simd4[0]; simd4[1] = v.simd4[1]; } + simd4_t(const __m128d v[2]) { + simd4[0] = v[0]; + simd4[1] = v[1]; + } inline __m128d (&v4(void))[2] { return simd4; @@ -401,7 +435,7 @@ public: } inline simd4_t& operator=(double d) { - simd4[0] = simd[1] = _mm_set1_pd(d); + simd4[0] = simd4[1] = _mm_set1_pd(d); return *this; } inline simd4_t& operator=(const __vec4d_t v) { @@ -414,6 +448,11 @@ public: simd4[1] = v.simd4[1]; return *this; } + inline simd4_t& operator=(const __m128d v[2]) { + simd4[0] = v[0]; + simd4[1] = v[1]; + return *this; + } inline simd4_t& operator+=(double d) { simd4_t r(d); @@ -469,8 +508,8 @@ namespace simd4 template inline simd4_tabs(simd4_t v) { static const __m128d sign_mask = _mm_set1_pd(-0.); // -0. = 1 << 63 - v.v4[0] = mm_andnot_ps(sign_mask, v4[0]); - v.v4[1] = _mm_andnot_ps(sign_mask, v4[1]); + v.v4()[0] = _mm_andnot_pd(sign_mask, v.v4()[0]); + v.v4()[1] = _mm_andnot_pd(sign_mask, v.v4()[1]); return v; } }; @@ -502,6 +541,17 @@ public: simd4_t(const simd4_t& v) { simd4 = v.simd4; } + simd4_t(const __m128i& v) { + simd4 = v; + } + + inline __m128i (&v4(void)) { + return simd4; + } + + inline const __m128i (&v4(void) const) { + return simd4; + } inline const int (&ptr(void) const)[N] { return vec; diff --git a/simgear/math/simd4x4.hxx b/simgear/math/simd4x4.hxx index 3a3cdf6a..0d28e138 100644 --- a/simgear/math/simd4x4.hxx +++ b/simgear/math/simd4x4.hxx @@ -20,6 +20,7 @@ #include +#include "simd.hxx" template class simd4x4_t @@ -33,11 +34,11 @@ private: public: simd4x4_t(void) {} - explicit simd4x4_t(const T m[N][N]) { - std::memcpy(array, m, sizeof(T[N][N])); + simd4x4_t(const T m[N*N]) { + std::memcpy(array, m, sizeof(T[N*N])); } simd4x4_t(const simd4x4_t& m) { - std::memcpy(array, m, sizeof(T[N][N])); + std::memcpy(array, m, sizeof(T[N*N])); } ~simd4x4_t(void) {} @@ -57,12 +58,12 @@ public: return array; } - inline simd4x4_t& operator=(const T m[N][N]) { - std::memcpy(array, m, sizeof(T[N][N])); + inline simd4x4_t& operator=(const T m[N*N]) { + std::memcpy(array, m, sizeof(T[N*N])); return *this; } inline simd4x4_t& operator=(const simd4x4_t& m) { - std::memcpy(array, m, sizeof(T[N][N])); + std::memcpy(array, m, sizeof(T[N*N])); return *this; } @@ -105,15 +106,33 @@ public: return *this; } - template - inline simd4x4_t& operator*=(S s) { + inline simd4x4_t& operator*=(T s) { for (int i=0; i& operator*=(const simd4x4_t& m) { -//TODO + simd4x4_t& operator*=(const simd4x4_t& m1) { + simd4x4_t m2 = *this; + simd4_t row; + for (int j=0; j& r) { template void unit(simd4x4_t& r) { zeros(r); - for (int i=0; i<4; i++) { + for (int i=0; i -simd4_t -_pt4Matrix4_sse(const simd4x4_t& m, const simd4_t& vi) -{ - simd4_t r; - simd4_t x(vi[0]); - r = x*m[0]; - for (int i=1; i +inline simd4x4_t operator-(simd4x4_t m) { + simd4x4_t r; + simd4x4::zeros(r); + r -= m; return r; } -#endif + + +template +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + simd4_t mv; + simd4_t row(m); + mv = vi.ptr()[0] * row; + for (int j=1; j row(m[j*N]); + mv += vi.ptr()[j] * row; + } + return mv; } +template +inline simd4x4_t operator*(const simd4x4_t& m1, const simd4x4_t& m2) +{ + simd4x4_t m = m1; + m *= m2; + return m; +} + + # ifdef __SSE__ # include @@ -172,9 +206,15 @@ private: public: simd4x4_t(void) {} + simd4x4_t(const float m[4*4]) { + for (int i=0; i<4; i++) { + simd4x4[i] = simd4_t((const float*)&m[4*i]).v4(); + } + } + explicit simd4x4_t(const __mtx4f_t m) { for (int i=0; i<4; i++) { - simd4x4[i] = _mm_loadu_ps(m[i]); + simd4x4[i] = simd4_t(m[i]).v4(); } } simd4x4_t(const simd4x4_t& m) { @@ -210,7 +250,7 @@ public: inline simd4x4_t& operator=(const __mtx4f_t m) { for (int i=0; i<4; i++) { - simd4x4[i] = _mm_loadu_ps(m[i]); + simd4x4[i] = simd4_t(m[i]).v4(); } return *this; } @@ -236,14 +276,317 @@ public: } inline simd4x4_t& operator*=(float f) { - __m128 f4 = _mm_set1_ps(f); + simd4_t f4(f); for (int i=0; i<4; i++) { - simd4x4[i] *= f4; + simd4x4[i] *= f4.v4(); } return *this; } }; +template<> +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + simd4_t mv(m); + mv *= vi.ptr()[0]; + for (int i=1; i<4; i++) { + simd4_t row(m.m4x4()[i]); + row *= vi.ptr()[i]; + mv.v4() += row.v4(); + } + return mv; +} + +template<> +inline simd4x4_t operator*(const simd4x4_t& m1, const simd4x4_t& m2) +{ + simd4_t row, col; + simd4x4_t m; + + for (int i=0; i<4; i++) { + simd4_t col(m2.ptr()[i][0]); + row.v4() = m1.m4x4()[0] * col.v4(); + for (int j=1; j<4; j++) { + simd4_t col(m2.ptr()[i][j]); + row.v4() += m1.m4x4()[j] * col.v4(); + } + m.m4x4()[i] = row.v4(); + } + + return m; +} +# endif + + +# ifdef __SSE2__ +# include + +template<> +class simd4x4_t +{ +private: + typedef double __mtx4d_t[4][4]; + + union { + __m128d simd4x4[4][2]; + __mtx4d_t mtx; + double array[4*4]; + }; + +public: + simd4x4_t(void) {} + explicit simd4x4_t(const double m[4*4]) { + for (int i=0; i<4; i++) { + simd4x4[i][0] = simd4_t((const double*)&m[4*i]).v4()[0]; + simd4x4[i][1] = simd4_t((const double*)&m[4*i+2]).v4()[1]; + } + } + + explicit simd4x4_t(const __mtx4d_t m) { + for (int i=0; i<4; i++) { + simd4x4[i][0] = simd4_t(m[i]).v4()[0]; + simd4x4[i][1] = simd4_t(m[i]).v4()[1]; + } + } + simd4x4_t(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i][0] = m.m4x4()[i][0]; + simd4x4[i][1] = m.m4x4()[i][1]; + } + } + ~simd4x4_t(void) {} + + inline __m128d (&m4x4(void))[4][2] { + return simd4x4; + } + + inline const __m128d (&m4x4(void) const)[4][2] { + return simd4x4; + } + + inline const double (&ptr(void) const)[4][4] { + return mtx; + } + + inline double (&ptr(void))[4][4] { + return mtx; + } + + inline operator const double*(void) const { + return array; + } + + inline operator double*(void) { + return array; + } + + inline simd4x4_t& operator=(const __mtx4d_t m) { + for (int i=0; i<4; i++) { + simd4x4[i][0] = simd4_t(m[i]).v4()[0]; + simd4x4[i][1] = simd4_t(m[i]).v4()[1]; + } + return *this; + } + inline simd4x4_t& operator=(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i][0] = m.m4x4()[i][0]; + simd4x4[i][1] = m.m4x4()[i][1]; + } + return *this; + } + + inline simd4x4_t& operator+=(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i][0] += m.m4x4()[i][0]; + simd4x4[i][1] += m.m4x4()[i][1]; + } + return *this; + } + + inline simd4x4_t& operator-=(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i][0] -= m.m4x4()[i][0]; + simd4x4[i][1] -= m.m4x4()[i][1]; + } + return *this; + } + + inline simd4x4_t& operator*=(double f) { + simd4_t f4(f); + for (int i=0; i<4; i++) { + simd4x4[i][0] *= f4.v4()[0]; + simd4x4[i][1] *= f4.v4()[0]; + } + return *this; + } +}; + + +#if 0 +template<> +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + simd4_t mv(m.m4x4()[0]); + mv.v4()[0] *= vi.ptr()[0]; + mv.v4()[1] *= vi.ptr()[2]; + for (int i=1; i<4; i+=2) { + simd4_t row = m.m4x4()[i]; + row.v4()[0] *= vi.ptr()[i]; + row.v4()[1] *= vi.ptr()[i+2]; + mv += row; + } + return mv; +} +#endif + +template<> +inline simd4x4_t operator*(const simd4x4_t& m1, const simd4x4_t& m2) +{ + simd4_t row, col; + simd4x4_t m; + + for (int i=0; i<4; i++ ) { + simd4_t col = m1.m4x4()[0]; + row = col * m2.ptr()[i][0]; + for (int j=1; j<4; j++) { + col = m1.m4x4()[j]; + row += col * m2.ptr()[i][j]; + } + m.m4x4()[i][0] = row.v4()[0]; + m.m4x4()[i][1] = row.v4()[1]; + } + + return m; +} +# endif + + +# ifdef __SSE2__ +# include + +template<> +class simd4x4_t +{ +private: + typedef int __mtx4i_t[4][4]; + + union { + __m128i simd4x4[4]; + __mtx4i_t mtx; + int array[4*4]; + }; + +public: + simd4x4_t(void) {} + simd4x4_t(const int m[4*4]) { + for (int i=0; i<4; i++) { + simd4x4[i] = simd4_t((const int*)&m[4*i]).v4(); + } + } + + explicit simd4x4_t(const __mtx4i_t m) { + for (int i=0; i<4; i++) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + } + simd4x4_t(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i] = m.m4x4()[i]; + } + } + ~simd4x4_t(void) {} + + inline __m128i (&m4x4(void))[4] { + return simd4x4; + } + + inline const __m128i (&m4x4(void) const)[4] { + return simd4x4; + } + + inline const int (&ptr(void) const)[4][4] { + return mtx; + } + + inline int (&ptr(void))[4][4] { + return mtx; + } + + inline operator const int*(void) const { + return array; + } + + inline operator int*(void) { + return array; + } + + inline simd4x4_t& operator=(const __mtx4i_t m) { + for (int i=0; i<4; i++) { + simd4x4[i] = simd4_t(m[i]).v4(); + } + return *this; + } + inline simd4x4_t& operator=(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i] = m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator+=(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i] += m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator-=(const simd4x4_t& m) { + for (int i=0; i<4; i++) { + simd4x4[i] -= m.m4x4()[i]; + } + return *this; + } + + inline simd4x4_t& operator*=(int f) { + simd4_t f4(f); + for (int i=0; i<4; i++) { + simd4x4[i] *= f4.v4(); + } + return *this; + } +}; + +template<> +inline simd4_t operator*(const simd4x4_t& m, const simd4_t& vi) +{ + simd4_t mv(m); + mv *= vi.ptr()[0]; + for (int i=1; i<4; i++) { + simd4_t row(m.m4x4()[i]); + row *= vi.ptr()[i]; + mv.v4() += row.v4(); + } + return mv; +} + +template<> +inline simd4x4_t operator*(const simd4x4_t& m1, const simd4x4_t& m2) +{ + simd4_t row, col; + simd4x4_t m; + + for (int i=0; i<4; i++) { + simd4_t col(m2.ptr()[i][0]); + row.v4() = m1.m4x4()[0] * col.v4(); + for (int j=1; j<4; j++) { + simd4_t col(m2.ptr()[i][j]); + row.v4() += m1.m4x4()[j] * col.v4(); + } + m.m4x4()[i] = row.v4(); + } + + return m; +} # endif #endif /* __SIMD4X4_H__ */