Minor speedups in small tweaks

This commit is contained in:
Erik Hofman 2016-12-15 09:57:10 +01:00
parent 0586cb62c3
commit f9450d136d
5 changed files with 146 additions and 235 deletions

View File

@ -46,7 +46,7 @@ public:
/// Constructor. Initialize by the content of a plain array,
/// make sure it has at least 16 elements
explicit SGMatrix(const T* data)
{ simd4x4_t<T,4> x(data); _data = x; }
{ _data = simd4x4_t<T,4>(data); }
/// Constructor, build up a SGMatrix from given elements
SGMatrix(T m00, T m01, T m02, T m03,

View File

@ -50,7 +50,7 @@ public:
/// Constructor. Initialize by the content of a plain array,
/// make sure it has at least 2 elements
explicit SGVec2(const T* d)
{ simd4_t<T,2> r(d); _data = r; }
{ _data = simd4_t<T,2>(d); }
template<typename S>
explicit SGVec2(const SGVec2<S>& d)
{ data()[0] = d[0]; data()[1] = d[1]; }

View File

@ -20,7 +20,7 @@
#include <iosfwd>
#include "SGVec2.hxx"
#include <simgear/math/SGVec2.hxx>
#include <simgear/math/SGGeodesy.hxx>
#include "simd.hxx"
@ -60,7 +60,7 @@ public:
/// Constructor. Initialize by the content of a plain array,
/// make sure it has at least 3 elements
explicit SGVec3(const T* d)
{ simd4_t<T,3> r(d); _data = r; }
{ _data = simd4_t<T,3>(d); }
template<typename S>
explicit SGVec3(const SGVec3<S>& d)
{ data()[0] = d[0]; data()[1] = d[1]; data()[2] = d[2]; }

View File

@ -31,7 +31,7 @@ namespace simd4
template<typename T, int N>
inline simd4_t<T,N> min(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
v1[i] = SGMisc<T>::min(v1[i], v2[i]);
}
return v1;
@ -39,7 +39,7 @@ inline simd4_t<T,N> min(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
template<typename T, int N>
inline simd4_t<T,N> max(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
v1[i] = SGMisc<T>::max(v1[i], v2[i]);
}
return v1;
@ -47,7 +47,7 @@ inline simd4_t<T,N> max(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
template<typename T, int N>
inline simd4_t<T,N> abs(simd4_t<T,N> v) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
v[i] = std::abs(v[i]);
}
return v;
@ -57,7 +57,7 @@ template<typename T, int N>
inline T magnitude2(simd4_t<T,N> v) {
T mag2 = 0;
v = v*v;
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
mag2 += v[i];
}
return mag2;
@ -83,7 +83,7 @@ template<typename T, int N>
inline T dot(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
T dp = 0;
v1 *= v2;
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
dp += v1[i];
}
return dp;
@ -104,25 +104,22 @@ private:
public:
simd4_t(void) {}
simd4_t(T s) {
for (int i=0; i<N; i++) vec[i] = s;
for (int i=N; i<4; i++) _v4[i] = 0;
}
simd4_t(T x, T y) {
_v4[0] = x; _v4[1] = y; _v4[2] = 0; _v4[3] = 0;
}
simd4_t(T x, T y, T z) {
_v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = 0;
for (int i=0; i<N; ++i) vec[i] = s;
for (int i=N; i<4; ++i) _v4[i] = 0;
}
simd4_t(T x, T y) : simd4_t(x,y,0,0) {}
simd4_t(T x, T y, T z) : simd4_t(x,y,z,0) {}
// Four-component constructor: lanes 0..3 of _v4 receive x, y, z and w
// in that order.
simd4_t(T x, T y, T z, T w) {
    const T lanes[4] = { x, y, z, w };
    for (int lane = 0; lane < 4; ++lane) {
        _v4[lane] = lanes[lane];
    }
}
// Construct from a plain array: the first N elements of v are copied into
// vec, and every remaining lane of _v4 (index N up to 3) is set to zero.
// The caller must supply at least N readable elements.
explicit simd4_t(const T v[N]) {
    std::memcpy(vec, v, sizeof(T[N]));
    int lane = N;
    while (lane < 4) {
        _v4[lane] = 0;
        ++lane;
    }
}
template<int M>
simd4_t(const simd4_t<T,M>& v) {
std::memcpy(vec, v.ptr(), sizeof(T[M]));
for (int i=N; i<4; i++) _v4[i] = 0;
for (int i=M; i<4; ++i) _v4[i] = 0;
}
~simd4_t(void) {}
@ -151,132 +148,74 @@ public:
}
inline simd4_t<T,N>& operator=(T s) {
for (int i=0; i<N; i++) vec[i] = s;
for (int i=N; i<4; i++) _v4[i] = 0;
for (int i=0; i<N; ++i) vec[i] = s;
for (int i=N; i<4; ++i) _v4[i] = 0;
return *this;
}
inline simd4_t<T,N>& operator=(const T v[N]) {
if (N<4) std::memset(_v4+N, 0, sizeof(T[4-N]));
std::memcpy(vec, v, sizeof(T[N]));
for (int i=N; i<4; ++i) _v4[i] = 0;
return *this;
}
template<int M>
inline simd4_t<T,N>& operator=(const simd4_t<T,M>& v) {
if (M<4) std::memset(_v4+M, 0, sizeof(T[4-M]));
std::memcpy(vec, v.ptr(), sizeof(T[M]));
for (int i=M; i<4; ++i) _v4[i] = 0;
return *this;
}
// Vector + scalar: returns a copy of this vector after applying
// operator+=(T) with s. The object itself is not modified.
inline simd4_t<T,N> operator+(T s) {
    simd4_t<T,N> sum(*this);
    sum += s;
    return sum;
}
// Vector + plain array: builds a vector from v through the explicit array
// constructor, adds this vector to it and returns the result.
inline simd4_t<T,N> operator+(const T v[N]) {
    simd4_t<T,N> sum(v);
    sum += *this;
    return sum;
}
// Vector + vector: the argument is taken by value so it can serve as the
// accumulator; this vector is added to it and the accumulator is returned.
inline simd4_t<T,N> operator+(simd4_t<T,N> v) {
    v += *this;
    return v;
}
// Unary minus: returns (0 - *this), leaving this vector untouched.
inline simd4_t<T,N> operator-(void) {
    // Pass a typed zero rather than the literal 0: the literal is also a
    // null pointer constant, so for floating-point T it matches the scalar
    // constructor and the explicit `const T v[N]` constructor equally well
    // and the call is ambiguous.
    const T zero = 0;
    simd4_t<T,N> r(zero);
    r -= *this;
    return r;
}
// Vector - scalar: returns a copy of this vector after applying
// operator-=(T) with s. The object itself is not modified.
inline simd4_t<T,N> operator-(T s) {
    simd4_t<T,N> diff(*this);
    diff -= s;
    return diff;
}
// Vector - vector: copies this vector, subtracts v from the copy with
// operator-= and returns the copy.
inline simd4_t<T,N> operator-(const simd4_t<T,N>& v) {
    simd4_t<T,N> diff(*this);
    diff -= v;
    return diff;
}
// Vector * scalar: builds a vector from s through the scalar constructor,
// multiplies it by this vector with operator*= and returns the product.
inline simd4_t<T,N> operator*(T s) {
    simd4_t<T,N> scaled(s);
    scaled *= *this;
    return scaled;
}
// Vector * plain array: builds a vector from v through the explicit array
// constructor, multiplies it by this vector and returns the product.
inline simd4_t<T,N> operator*(const T v[N]) {
    simd4_t<T,N> product(v);
    product *= *this;
    return product;
}
// Vector * vector: the by-value argument is multiplied in place by this
// vector (operator*=) and then returned.
inline simd4_t<T,N> operator*(simd4_t<T,N> v) {
    v *= *this;
    return v;
}
// Vector / scalar, implemented as a multiplication by the reciprocal:
// a vector is built from 1/s and multiplied by this vector.
// NOTE: 1/s is evaluated in T, so for an integral T it is an integer
// division.
inline simd4_t<T,N> operator/(T s) {
    simd4_t<T,N> r(1/s);
    // Multiply by the object, not by the `this` pointer: the original
    // `r *= this;` has no matching operator*= overload.
    r *= *this;
    return r;
}
// Vector / plain array: copies this vector, divides the copy by v with
// operator/= and returns the copy.
inline simd4_t<T,N> operator/(const T v[N]) {
    simd4_t<T,N> quotient(*this);
    quotient /= v;
    return quotient;
}
// Vector / vector: copies this vector, divides the copy by v with
// operator/= and returns the quotient.
inline simd4_t<T,N> operator/(const simd4_t<T,N>& v) {
    simd4_t<T,N> r(*this);
    r /= v;
    // Return the computed quotient; the original returned the divisor v.
    return r;
}
inline simd4_t<T,N>& operator+=(T s) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
vec[i] += s;
}
return *this;
}
inline simd4_t<T,N>& operator+=(const T v[N]) {
simd4_t<T,N> r(v);
*this += r.v4();
for (int i=0; i<N; ++i) {
vec[i] += v[i];
}
return *this;
}
template<int M>
inline simd4_t<T,N>& operator+=(const simd4_t<T,M>& v) {
for (int i=0; i<M; i++) {
inline simd4_t<T,N>& operator+=(const simd4_t<T,N>& v) {
for (int i=0; i<N; ++i) {
vec[i] += v[i];
}
return *this;
}
inline simd4_t<T,N>& operator-=(T s) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
vec[i] -= s;
}
return *this;
}
inline simd4_t<T,N>& operator-=(const T v[N]) {
simd4_t<T,N> r(v);
*this -= r.v4();
for (int i=0; i<N; ++i) {
vec[i] -= v[i];
}
return *this;
}
inline simd4_t<T,N>& operator-=(const simd4_t<T,N>& v) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
vec[i] -= v[i];
}
return *this;
}
inline simd4_t<T,N>& operator*=(T s) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
vec[i] *= s;
}
return *this;
}
inline simd4_t<T,N>& operator*=(const T v[N]) {
simd4_t<T,N> r(v);
*this *= r.v4();
for (int i=0; i<N; ++i) {
vec[i] *= v[i];
}
return *this;
}
inline simd4_t<T,N>& operator*=(const simd4_t<T,N>& v) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
vec[i] *= v[i];
}
return *this;
@ -286,12 +225,13 @@ public:
return operator*=(1/s);
}
inline simd4_t<T,N>& operator/=(const T v[N]) {
simd4_t<T,N> r(v);
*this /= r.v4();
for (int i=0; i<N; ++i) {
vec[i] /= v[i];
}
return *this;
}
inline simd4_t<T,N>& operator/=(const simd4_t<T,N>& v) {
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
vec[i] /= v[i];
}
return *this;
@ -316,6 +256,12 @@ inline simd4_t<T,N> operator*(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
return v1;
}
// Free-standing vector / vector: v1 is taken by value, divided in place by
// v2 through operator/= and returned as the quotient.
template<typename T, int N>
inline simd4_t<T,N> operator/(simd4_t<T,N> v1, const simd4_t<T,N>& v2)
{
    v1 /= v2;
    return v1;
}
template<typename T, int N>
inline simd4_t<T,N> operator*(T f, simd4_t<T,N> v) {
v *= f;
@ -362,7 +308,7 @@ public:
simd4_t(void) {}
simd4_t(float f) {
simd4 = _mm_set1_ps(f);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
for (int i=N; i<4; ++i) _v4[i] = 0.0f;
}
simd4_t(float x, float y) : simd4_t(x,y,0,0) {}
simd4_t(float x, float y, float z) : simd4_t(x,y,z,0) {}
@ -371,29 +317,27 @@ public:
}
explicit simd4_t(const __vec4f_t v) {
simd4 = _mm_loadu_ps(v);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
for (int i=N; i<4; ++i) _v4[i] = 0.0f;
}
template<int M>
simd4_t(const simd4_t<float,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0.0f;
for (int i=M; i<4; ++i) _v4[i] = 0.0f;
}
simd4_t(const __m128& v) {
simd4 = v;
}
inline __m128 (&v4(void)) {
inline const __m128 (&v4(void) const) {
return simd4;
}
inline const __m128 (&v4(void) const) {
inline __m128 (&v4(void)) {
return simd4;
}
inline const float (&ptr(void) const)[N] {
return vec;
}
inline float (&ptr(void))[N] {
return vec;
}
@ -401,31 +345,29 @@ public:
inline operator const float*(void) const {
return vec;
}
inline operator float*(void) {
return vec;
}
inline simd4_t<float,N>& operator=(float f) {
simd4 = _mm_set1_ps(f);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
for (int i=N; i<4; ++i) _v4[i] = 0.0f;
return *this;
}
inline simd4_t<float,N>& operator=(const __vec4f_t v) {
simd4 = _mm_loadu_ps(v);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
for (int i=N; i<4; ++i) _v4[i] = 0.0f;
return *this;
}
template<int M>
inline simd4_t<float,N>& operator=(const simd4_t<float,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0.0f;
for (int i=M; i<4; ++i) _v4[i] = 0.0f;
return *this;
}
inline simd4_t<float,N>& operator+=(float f) {
*this += simd4_t<float,N>(f);
return *this;
return operator+=(simd4_t<float,N>(f));
}
inline simd4_t<float,N>& operator+=(const simd4_t<float,N>& v) {
simd4 = _mm_add_ps(simd4, v.v4());
@ -433,8 +375,7 @@ public:
}
inline simd4_t<float,N>& operator-=(float f) {
*this -= simd4_t<float,N>(f);
return *this;
return operator-=(simd4_t<float,N>(f));
}
inline simd4_t<float,N>& operator-=(const simd4_t<float,N>& v) {
simd4 = _mm_sub_ps(simd4, v.v4());
@ -442,8 +383,7 @@ public:
}
inline simd4_t<float,N>& operator*=(float f) {
*this *= simd4_t<float,N>(f);
return *this;
return operator*=(simd4_t<float,N>(f));
}
inline simd4_t<float,N>& operator*=(const simd4_t<float,N>& v) {
simd4 = _mm_mul_ps(simd4, v.v4());
@ -451,8 +391,7 @@ public:
}
inline simd4_t<float,N>& operator/=(float f) {
*this /= simd4_t<float,N>(f);
return *this;
return operator/=(simd4_t<float,N>(f));
}
inline simd4_t<float,N>& operator/=(const simd4_t<float,N>& v) {
simd4 = _mm_div_ps(simd4, v.v4());
@ -481,13 +420,13 @@ namespace simd4
}
# endif
template<int N>
inline float magnitude2(simd4_t<float,N> v) {
template<>
inline float magnitude2(simd4_t<float,4> v) {
return hsum_ps_sse(v.v4()*v.v4());
}
template<int N>
inline float dot(simd4_t<float,N> v1, const simd4_t<float,N>& v2) {
template<>
inline float dot(simd4_t<float,4> v1, const simd4_t<float,4>& v2) {
return hsum_ps_sse(v1.v4()*v2.v4());
}
@ -534,41 +473,40 @@ public:
simd4_t(void) {}
simd4_t(double d) {
simd4[0] = simd4[1] = _mm_set1_pd(d);
for (int i=N; i<4; i++) _v4[i] = 0.0;
for (int i=N; i<4; ++i) _v4[i] = 0.0;
}
simd4_t(double x, double y) : simd4_t(x,y,0,0) {}
simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {}
simd4_t(double x, double y, double z, double w) {
simd4[0] = _mm_set_pd(y,x); simd4[1] = _mm_set_pd(w,z);
simd4[0] = _mm_set_pd(y,x);
simd4[1] = _mm_set_pd(w,z);
}
explicit simd4_t(const __vec4d_t v) {
simd4[0] = _mm_loadu_pd(v);
simd4[1] = _mm_loadu_pd(v+2);
for (int i=N; i<4; i++) _v4[i] = 0.0;
for (int i=N; i<4; ++i) _v4[i] = 0.0;
}
template<int M>
simd4_t(const simd4_t<double,M>& v) {
simd4[0] = v.v4()[0];
simd4[1] = v.v4()[1];
for (int i=M; i<4; i++) _v4[i] = 0.0;
for (int i=M; i<4; ++i) _v4[i] = 0.0;
}
simd4_t(const __m128d v[2]) {
simd4[0] = v[0];
simd4[1] = v[1];
}
inline __m128d (&v4(void))[2] {
inline const __m128d (&v4(void) const)[2] {
return simd4;
}
inline const __m128d (&v4(void) const)[2] {
inline __m128d (&v4(void))[2] {
return simd4;
}
inline const double (&ptr(void) const)[N] {
return vec;
}
inline double (&ptr(void))[N] {
return vec;
}
@ -576,27 +514,26 @@ public:
inline operator const double*(void) const {
return vec;
}
inline operator double*(void) {
return vec;
}
inline simd4_t<double,N>& operator=(double d) {
simd4[0] = simd4[1] = _mm_set1_pd(d);
for (int i=N; i<4; i++) _v4[i] = 0.0;
for (int i=N; i<4; ++i) _v4[i] = 0.0;
return *this;
}
inline simd4_t<double,N>& operator=(const __vec4d_t v) {
simd4[0] = _mm_loadu_pd(v);
simd4[1] = _mm_loadu_pd(v+2);
for (int i=N; i<4; i++) _v4[i] = 0.0;
for (int i=N; i<4; ++i) _v4[i] = 0.0;
return *this;
}
template<int M>
inline simd4_t<double,N>& operator=(const simd4_t<double,M>& v) {
simd4[0] = v.v4()[0];
simd4[1] = v.v4()[1];
for (int i=M; i<4; i++) _v4[i] = 0.0;
for (int i=M; i<4; ++i) _v4[i] = 0.0;
return *this;
}
inline simd4_t<double,N>& operator=(const __m128d v[2]) {
@ -606,8 +543,7 @@ public:
}
inline simd4_t<double,N>& operator+=(double d) {
*this += simd4_t<double,N>(d);
return *this;
return operator+=(simd4_t<double,N>(d));
}
inline simd4_t<double,N>& operator+=(const simd4_t<double,N>& v) {
simd4[0] = _mm_add_pd(simd4[0], v.v4()[0]);
@ -616,8 +552,7 @@ public:
}
inline simd4_t<double,N>& operator-=(double d) {
*this -= simd4_t<double,N>(d);
return *this;
return operator-=(simd4_t<double,N>(d));
}
inline simd4_t<double,N>& operator-=(const simd4_t<double,N>& v) {
simd4[0] = _mm_sub_pd(simd4[0], v.v4()[0]);
@ -626,8 +561,7 @@ public:
}
inline simd4_t<double,N>& operator*=(double d) {
*this *= simd4_t<double,N>(d);
return *this;
return operator*=(simd4_t<double,N>(d));
}
inline simd4_t<double,N>& operator*=(const simd4_t<double,N>& v) {
simd4[0] = _mm_mul_pd(simd4[0], v.v4()[0]);
@ -636,8 +570,7 @@ public:
}
inline simd4_t<double,N>& operator/=(double d) {
*this /= simd4_t<double,N>(d);
return *this;
return operator/=(simd4_t<double,N>(d));
}
inline simd4_t<double,N>& operator/=(const simd4_t<double,N>& v) {
simd4[0] = _mm_div_pd(simd4[0], v.v4()[0]);
@ -649,21 +582,26 @@ public:
namespace simd4
{
// http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
inline double hsum_pd_sse(__m128d vd) {
__m128 undef = _mm_undefined_ps();
__m128 shuftmp= _mm_movehl_ps(undef, _mm_castpd_ps(vd));
__m128d shuf = _mm_castps_pd(shuftmp);
return _mm_cvtsd_f64(_mm_add_sd(vd, shuf));
inline double hsum_pd_sse(const __m128d vd[2]) {
__m128 undef = _mm_undefined_ps();
__m128 shuftmp1 = _mm_movehl_ps(undef, _mm_castpd_ps(vd[0]));
__m128 shuftmp2 = _mm_movehl_ps(undef, _mm_castpd_ps(vd[1]));
__m128d shuf1 = _mm_castps_pd(shuftmp1);
__m128d shuf2 = _mm_castps_pd(shuftmp2);
return _mm_cvtsd_f64(_mm_add_sd(vd[0], shuf1)) +
_mm_cvtsd_f64(_mm_add_sd(vd[1], shuf2));
}
template<int N>
inline double magnitude2(simd4_t<double,N> v) {
return hsum_pd_sse(v.v4()[0]*v.v4()[0]) + hsum_pd_sse(v.v4()[1]*v.v4()[1]);
template<>
inline double magnitude2(simd4_t<double,4> v) {
v *= v;
return hsum_pd_sse(v.v4());
}
template<int N>
inline double dot(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
return hsum_pd_sse(v1.v4()[0]*v2.v4()[0])+hsum_pd_sse(v1.v4()[1]*v2.v4()[1]);
template<>
inline double dot(simd4_t<double,4> v1, const simd4_t<double,4>& v2) {
v1 *= v2;
return hsum_pd_sse(v1.v4());
}
template<int N>
@ -715,7 +653,7 @@ public:
simd4_t(void) {}
simd4_t(int i) {
simd4 = _mm_set1_epi32(i);
for (int i=N; i<4; i++) _v4[i] = 0;
for (int i=N; i<4; ++i) _v4[i] = 0;
}
simd4_t(int x, int y) : simd4_t(x,y,0,0) {}
simd4_t(int x, int y, int z) : simd4_t(x,y,z,0) {}
@ -724,12 +662,12 @@ public:
}
explicit simd4_t(const __vec4i_t v) {
simd4 = _mm_loadu_si128((__m128i*)v);
for (int i=N; i<4; i++) _v4[i] = 0;
for (int i=N; i<4; ++i) _v4[i] = 0;
}
template<int M>
simd4_t(const simd4_t<int,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0;
for (int i=M; i<4; ++i) _v4[i] = 0;
}
simd4_t(const __m128i& v) {
simd4 = v;
@ -761,24 +699,23 @@ public:
inline simd4_t<int,N>& operator=(int i) {
simd4 = _mm_set1_epi32(i);
for (int i=N; i<4; i++) _v4[i] = 0;
for (int i=N; i<4; ++i) _v4[i] = 0;
return *this;
}
inline simd4_t<int,N>& operator=(const __vec4i_t v) {
simd4 = _mm_loadu_si128((__m128i*)v);
for (int i=N; i<4; i++) _v4[i] = 0;
for (int i=N; i<4; ++i) _v4[i] = 0;
return *this;
}
template<int M>
inline simd4_t<int,N>& operator=(const simd4_t<int,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0;
for (int i=M; i<4; ++i) _v4[i] = 0;
return *this;
}
inline simd4_t<int,N>& operator+=(int i) {
*this += simd4_t<int,N>(i);
return *this;
return operator+=(simd4_t<int,N>(i));
}
inline simd4_t<int,N>& operator+=(const simd4_t<int,N>& v) {
simd4 = _mm_add_epi32(simd4, v.v4());
@ -786,8 +723,7 @@ public:
}
inline simd4_t<int,N>& operator-=(int i) {
*this -= simd4_t<int,N>(i);
return *this;
return operator-=(simd4_t<int,N>(i));
}
inline simd4_t<int,N>& operator-=(const simd4_t<int,N>& v) {
simd4 = _mm_sub_epi32(simd4, v.v4());
@ -795,8 +731,7 @@ public:
}
inline simd4_t<int,N>& operator*=(int i) {
*this *= simd4_t<int,N>(i);
return *this;
return operator*=(simd4_t<int,N>(i));
}
// https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
inline simd4_t<int,N>& operator*=(const simd4_t<int,N>& v) {
@ -834,6 +769,5 @@ inline simd4_t<int,N> max(simd4_t<int,N> v1, const simd4_t<int,N>& v2) {
# endif
#endif /* __SIMD_H__ */

View File

@ -35,7 +35,7 @@ inline void zeros(simd4x4_t<T,N>& r) {
template<typename T, int N>
inline void unit(simd4x4_t<T,N>& r) {
zeros(r);
for (int i=0; i<N; i++) {
for (int i=0; i<N; ++i) {
r.ptr()[i][i] = T(1);
}
}
@ -48,9 +48,9 @@ inline simd4x4_t<T,4> rotation_matrix(T angle, const simd4_t<T,3>& axis)
simd4x4_t<T,4> m;
simd4x4::unit(m);
for (int i=0; i<3; i++) {
for (int i=0; i<3; ++i) {
simd4_t<T,4> r = axis.ptr()[i]*at;
for (int j=0; j<4; j++) {
for (int j=0; j<4; ++j) {
m.m4x4()[0][j] = r.v4()[j];
}
}
@ -80,8 +80,8 @@ inline void rotate(simd4x4_t<T,N>& mtx, T angle, const simd4_t<T,3>& axis) {
template<typename T, int N>
inline simd4x4_t<T,N> transpose(simd4x4_t<T,N> mtx) {
simd4x4_t<T,N> m;
for (int i=0; i<N; i++) {
for(int j=0; j<N; j++) {
for (int i=0; i<N; ++i) {
for(int j=0; j<N; ++j) {
m.ptr()[j][i] = mtx.ptr()[i][j];
}
}
@ -150,45 +150,22 @@ public:
return *this;
}
// Matrix + matrix: the by-value argument accumulates this matrix through
// operator+= and is returned as the sum.
inline simd4x4_t<T,N> operator+(simd4x4_t<T,N> m) {
    m += *this;
    return m;
}
// Matrix - matrix: returns (*this - m).
inline simd4x4_t<T,N> operator-(simd4x4_t<T,N> m) {
    // Subtract m from a copy of this matrix. The original `m -= *this`
    // produced (m - *this), so `a - b` returned `b - a`.
    simd4x4_t<T,N> r(*this);
    r -= m;
    return r;
}
// Matrix * scalar: copies this matrix, scales the copy by s with
// operator*=(T) and returns it.
inline simd4x4_t<T,N> operator*(T s) {
    simd4x4_t<T,N> scaled(*this);
    scaled *= s;
    return scaled;
}
// Matrix * matrix: applies operator*= to the by-value argument with this
// matrix as its operand and returns the argument.
// NOTE(review): the argument, not this matrix, is the object operator*=
// is invoked on; matrix products are not commutative in general, so
// confirm this yields the intended operand order.
inline simd4x4_t<T,N> operator*(simd4x4_t<T,N> m) {
    m *= *this;
    return m;
}
// Matrix / scalar, implemented as a multiplication of a copy of this
// matrix by the reciprocal 1/T(s), which is evaluated in T.
inline simd4x4_t<T,N> operator/(T s) {
    simd4x4_t<T,N> scaled(*this);
    scaled *= (1/T(s));
    return scaled;
}
inline simd4x4_t<T,N>& operator+=(const simd4x4_t<T,N>& m) {
for (int i=0; i<N*N; i++) {
for (int i=0; i<N*N; ++i) {
array[i] += m[i];
}
return *this;
}
inline simd4x4_t<T,N>& operator-=(const simd4x4_t<T,N>& m) {
for (int i=0; i<N*N; i++) {
for (int i=0; i<N*N; ++i) {
array[i] -= m[i];
}
return *this;
}
inline simd4x4_t<T,N>& operator*=(T s) {
for (int i=0; i<N*N; i++) {
for (int i=0; i<N*N; ++i) {
array[i] *= s;
}
return *this;
@ -196,7 +173,7 @@ public:
simd4x4_t<T,N>& operator*=(const simd4x4_t<T,N>& m1) {
simd4x4_t<T,N> m2 = *this;
simd4_t<T,N> row;
for (int j=0; j<N; j++) {
for (int j=0; j<N; ++j) {
for (int r=0; r<N; r++) {
row[r] = m2.ptr()[r][0];
}
@ -204,7 +181,7 @@ public:
for (int r=0; r<N; r++) {
mtx[r][j] = row[r];
}
for (int i=1; i<N; i++) {
for (int i=1; i<N; ++i) {
for (int r=0; r<N; r++) {
row[r] = m2.ptr()[r][i];
}
@ -237,7 +214,7 @@ inline simd4_t<T,N> operator*(const simd4x4_t<T,N>& m, const simd4_t<T,N>& vi)
simd4_t<T,N> mv;
simd4_t<T,N> row(m);
mv = vi.ptr()[0] * row;
for (int j=1; j<N; j++) {
for (int j=1; j<N; ++j) {
simd4_t<T,N> row(m[j*N]);
mv += vi.ptr()[j] * row;
}
@ -271,18 +248,18 @@ private:
public:
simd4x4_t(void) {}
simd4x4_t(const float m[4*4]) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = simd4_t<float,4>((const float*)&m[4*i]).v4();
}
}
explicit simd4x4_t(const __mtx4f_t m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = simd4_t<float,4>(m[i]).v4();
}
}
simd4x4_t(const simd4x4_t<float,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = m.m4x4()[i];
}
}
@ -313,27 +290,27 @@ public:
}
inline simd4x4_t<float,4>& operator=(const __mtx4f_t m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = simd4_t<float,4>(m[i]).v4();
}
return *this;
}
inline simd4x4_t<float,4>& operator=(const simd4x4_t<float,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<float,4>& operator+=(const simd4x4_t<float,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] += m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<float,4>& operator-=(const simd4x4_t<float,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] -= m.m4x4()[i];
}
return *this;
@ -341,7 +318,7 @@ public:
inline simd4x4_t<float,4>& operator*=(float f) {
simd4_t<float,4> f4(f);
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] *= f4.v4();
}
return *this;
@ -351,10 +328,10 @@ public:
simd4x4_t<float,4> m1 = *this;
simd4_t<float,4> row, col;
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4_t<float,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
for (int j=1; j<4; ++j) {
simd4_t<float,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
@ -369,7 +346,7 @@ inline simd4_t<float,4> operator*(const simd4x4_t<float,4>& m, const simd4_t<flo
{
simd4_t<float,4> mv(m);
mv *= vi.ptr()[0];
for (int i=1; i<4; i++) {
for (int i=1; i<4; ++i) {
simd4_t<float,4> row(m.m4x4()[i]);
row *= vi.ptr()[i];
mv.v4() += row.v4();
@ -442,7 +419,7 @@ public:
simd4x4_t(void) {}
explicit simd4x4_t(const double m[4*4]) {
const double *p = m;
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4_t<double,4> vec4(p);
simd4x4[i][0] = vec4.v4()[0]; p += 4;
simd4x4[i][1] = vec4.v4()[1];
@ -450,13 +427,13 @@ public:
}
explicit simd4x4_t(const __mtx4d_t m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] = simd4_t<double,4>(m[i]).v4()[0];
simd4x4[i][1] = simd4_t<double,4>(m[i]).v4()[1];
}
}
simd4x4_t(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] = m.m4x4()[i][0];
simd4x4[i][1] = m.m4x4()[i][1];
}
@ -489,7 +466,7 @@ public:
inline simd4x4_t<double,4>& operator=(const double m[4*4]) {
const double *p = m;
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4_t<double,4> vec4(p);
simd4x4[i][0] = vec4.v4()[0]; p += 4;
simd4x4[i][1] = vec4.v4()[1];
@ -498,14 +475,14 @@ public:
}
inline simd4x4_t<double,4>& operator=(const __mtx4d_t m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] = simd4_t<double,4>(m[i]).v4()[0];
simd4x4[i][1] = simd4_t<double,4>(m[i]).v4()[1];
}
return *this;
}
inline simd4x4_t<double,4>& operator=(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] = m.m4x4()[i][0];
simd4x4[i][1] = m.m4x4()[i][1];
}
@ -513,7 +490,7 @@ public:
}
inline simd4x4_t<double,4>& operator+=(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] += m.m4x4()[i][0];
simd4x4[i][1] += m.m4x4()[i][1];
}
@ -521,7 +498,7 @@ public:
}
inline simd4x4_t<double,4>& operator-=(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] -= m.m4x4()[i][0];
simd4x4[i][1] -= m.m4x4()[i][1];
}
@ -530,7 +507,7 @@ public:
inline simd4x4_t<double,4>& operator*=(double f) {
simd4_t<double,4> f4(f);
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i][0] *= f4.v4()[0];
simd4x4[i][1] *= f4.v4()[0];
}
@ -541,10 +518,10 @@ public:
simd4x4_t<double,4> m1 = *this;
simd4_t<double,4> row, col;
for (int i=0; i<4; i++ ) {
for (int i=0; i<4; ++i ) {
simd4_t<double,4> col = m1.m4x4()[0];
row = col * m2.ptr()[i][0];
for (int j=1; j<4; j++) {
for (int j=1; j<4; ++j) {
col = m1.m4x4()[j];
row += col * m2.ptr()[i][j];
}
@ -656,18 +633,18 @@ private:
public:
simd4x4_t(void) {}
simd4x4_t(const int m[4*4]) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = simd4_t<int,4>((const int*)&m[4*i]).v4();
}
}
explicit simd4x4_t(const __mtx4i_t m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = simd4_t<int,4>(m[i]).v4();
}
}
simd4x4_t(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = m.m4x4()[i];
}
}
@ -698,27 +675,27 @@ public:
}
inline simd4x4_t<int,4>& operator=(const __mtx4i_t m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = simd4_t<int,4>(m[i]).v4();
}
return *this;
}
inline simd4x4_t<int,4>& operator=(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] = m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<int,4>& operator+=(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] += m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<int,4>& operator-=(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] -= m.m4x4()[i];
}
return *this;
@ -726,7 +703,7 @@ public:
inline simd4x4_t<int,4>& operator*=(int f) {
simd4_t<int,4> f4(f);
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4x4[i] *= f4.v4();
}
return *this;
@ -736,10 +713,10 @@ public:
simd4x4_t<int,4> m1 = *this;
simd4_t<int,4> row, col;
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4_t<int,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
for (int j=1; j<4; ++j) {
simd4_t<int,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
@ -754,7 +731,7 @@ inline simd4_t<int,4> operator*(const simd4x4_t<int,4>& m, const simd4_t<int,4>&
{
simd4_t<int,4> mv(m);
mv *= vi.ptr()[0];
for (int i=1; i<4; i++) {
for (int i=1; i<4; ++i) {
simd4_t<int,4> row(m.m4x4()[i]);
row *= vi.ptr()[i];
mv.v4() += row.v4();
@ -768,10 +745,10 @@ inline simd4x4_t<int,4> operator*(const simd4x4_t<int,4>& m1, const simd4x4_t<in
simd4_t<int,4> row, col;
simd4x4_t<int,4> m;
for (int i=0; i<4; i++) {
for (int i=0; i<4; ++i) {
simd4_t<int,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
for (int j=1; j<4; ++j) {
simd4_t<int,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}