Add some more accelerated functions (dot, magnitude, matrix transpose), allow simd_t4 with non mathcing elements to be assigned and a bunch of fixes. Make sure the code compiles when SSE is not available.

This commit is contained in:
Erik Hofman 2016-12-07 10:00:02 +01:00
parent 801d8c4af5
commit b74d1a8351
5 changed files with 394 additions and 146 deletions

View File

@ -222,22 +222,22 @@ SGVec2<T> addClipOverflow(SGVec2<T> const& lhs, SGVec2<T> const& rhs)
template<typename T>
inline
T
dot(SGVec2<T> v1, const SGVec2<T>& v2)
{ v1.simd2() *= v2.simd2(); return (v1(0)+v1(1)); }
dot(const SGVec2<T>& v1, const SGVec2<T>& v2)
{ return simd4::dot(v1.simd2(), v2.simd2()); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>
inline
T
norm(const SGVec2<T>& v)
{ return sqrt(dot(v, v)); }
{ return simd4::magnitude(v.simd2()); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>
inline
T
length(const SGVec2<T>& v)
{ return sqrt(dot(v, v)); }
{ return simd4::magnitude(v.simd2()); }
/// The 1-norm of the vector, this one is the fastest length function we
/// can implement on modern cpu's

View File

@ -316,22 +316,22 @@ SGVec3<T> addClipOverflow(SGVec3<T> const& lhs, SGVec3<T> const& rhs)
template<typename T>
inline
T
dot(SGVec3<T> v1, const SGVec3<T>& v2)
{ v1.simd3() *= v2.simd3(); return (v1(0)+v1(1)+v1(2)); }
dot(const SGVec3<T>& v1, const SGVec3<T>& v2)
{ return simd4::dot(v1.simd3(), v2.simd3()); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>
inline
T
norm(const SGVec3<T>& v)
{ return sqrt(dot(v, v)); }
{ return simd4::magnitude(v.simd3()); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>
inline
T
length(const SGVec3<T>& v)
{ return sqrt(dot(v, v)); }
{ return simd4::magnitude(v.simd3()); }
/// The 1-norm of the vector, this one is the fastest length function we
/// can implement on modern cpu's

View File

@ -269,22 +269,22 @@ SGVec4<T> addClipOverflow(SGVec4<T> const& lhs, SGVec4<T> const& rhs)
template<typename T>
inline
T
dot(SGVec4<T> v1, const SGVec4<T>& v2)
{ v1.simd4() *= v2.simd4(); return (v1(0)+v1(1)+v1(2)+v1(3)); }
dot(const SGVec4<T>& v1, const SGVec4<T>& v2)
{ return simd4::dot(v1.simd4(), v2.simd4()); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>
inline
T
norm(const SGVec4<T>& v)
{ return sqrt(dot(v, v)); }
{ return simd4::magnitude(v.simd4()); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>
inline
T
length(const SGVec4<T>& v)
{ return sqrt(dot(v, v)); }
{ return simd4::magnitude(v.simd4()); }
/// The 1-norm of the vector, this one is the fastest length function we
/// can implement on modern cpu's

View File

@ -21,6 +21,59 @@
#include <cstring>
#include <cmath>
template<typename T, int N> class simd4_t;
namespace simd4
{
template<typename T, int N>
inline simd4_t<T,N> abs(simd4_t<T,N> v) {
for (int i=0; i<N; i++) {
v[i] = std::abs(v[i]);
}
return v;
}
template<typename T, int N>
inline T magnitude2(simd4_t<T,N> v) {
T mag2 = 0;
v = v*v;
for (int i=0; i<N; i++) {
mag2 += v[i];
}
return mag2;
}
template<typename T, int N>
inline T magnitude(const simd4_t<T,N>& v) {
return std::sqrt(magnitude2(v));
}
template<typename T, int N>
inline T normalize(simd4_t<T,N>& v) {
T mag = simd4::magnitude(v);
if (mag) {
v /= mag;
} else {
v = T(0);
}
return mag;
}
template<typename T, int N>
inline T dot(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
T dp = 0;
v1 *= v2;
for (int i=0; i<N; i++) {
dp += v1[i];
}
return dp;
}
} /* namespace simd4 */
template<typename T, int N>
class simd4_t
{
@ -32,18 +85,28 @@ private:
public:
simd4_t(void) {}
template<typename S>
simd4_t(S s) {
vec[0] = vec[1] = vec[2] = vec[3] = s;
explicit simd4_t(T s) {
for (int i=0; i<N; i++) vec[i] = s;
for (int i=N; i<4; i++) _v4[i] = 0;
}
explicit simd4_t(const T v[N]) {
std::memcpy(vec, v, sizeof(T[N]));
}
simd4_t(const simd4_t<T,N>& v) {
std::memcpy(vec, v.vec, sizeof(T[N]));
template<int M>
simd4_t(const simd4_t<T,M>& v) {
if (M<4) std::memset(_v4+M, 0, sizeof(T[4-M]));
std::memcpy(vec, v.ptr(), sizeof(T[M]));
}
~simd4_t(void) {}
inline T (&v4(void))[4] {
return vec;
}
inline const T (&v4(void) const)[4] {
return vec;
}
inline const T (&ptr(void) const)[N] {
return vec;
}
@ -60,22 +123,24 @@ public:
return vec;
}
template<typename S>
inline simd4_t<T,N>& operator=(S s) {
vec[0] = vec[1] = vec[2] = vec[3] = s;
inline simd4_t<T,N>& operator=(T s) {
for (int i=0; i<N; i++) vec[i] = s;
for (int i=N; i<4; i++) _v4[i] = 0;
return *this;
}
inline simd4_t<T,N>& operator=(const T v[N]) {
if (N<4) std::memset(_v4+N, 0, sizeof(T[4-N]));
std::memcpy(vec, v, sizeof(T[N]));
return *this;
}
inline simd4_t<T,N>& operator=(const simd4_t<T,N>& v) {
std::memcpy(vec, v.vec, sizeof(T[N]));
template<int M>
inline simd4_t<T,N>& operator=(const simd4_t<T,M>& v) {
if (M<4) std::memset(_v4+M, 0, sizeof(T[4-M]));
std::memcpy(vec, v.ptr(), sizeof(T[M]));
return *this;
}
template<typename S>
inline simd4_t<T,N> operator+(S s) {
inline simd4_t<T,N> operator+(T s) {
simd4_t<T,N> r(*this);
r += s;
return r;
@ -96,8 +161,7 @@ public:
r -= *this;
return r;
}
template<typename S>
inline simd4_t<T,N> operator-(S s) {
inline simd4_t<T,N> operator-(T s) {
simd4_t<T,N> r(*this);
r -= s;
return r;
@ -107,9 +171,7 @@ public:
r -= v;
return r;
}
template<typename S>
inline simd4_t<T,N> operator*(S s) {
inline simd4_t<T,N> operator*(T s) {
simd4_t<T,N> r(s);
r *= *this;
return r;
@ -123,9 +185,8 @@ public:
v *= *this; return v;
}
template<typename S>
inline simd4_t<T,N> operator/(S s) {
simd4_t<T,N> r(1/T(s));
inline simd4_t<T,N> operator/(T s) {
simd4_t<T,N> r(1/s);
r *= this;
return r;
}
@ -139,8 +200,7 @@ public:
r /= v; return v;
}
template<typename S>
inline simd4_t<T,N>& operator+=(S s) {
inline simd4_t<T,N>& operator+=(T s) {
for (int i=0; i<N; i++) {
vec[i] += s;
}
@ -148,7 +208,7 @@ public:
}
inline simd4_t<T,N>& operator+=(const T v[N]) {
simd4_t<T,N> r(v);
*this += r.simd4;
*this += r.v4();
return *this;
}
inline simd4_t<T,N>& operator+=(const simd4_t<T,N>& v) {
@ -158,8 +218,7 @@ public:
return *this;
}
template<typename S>
inline simd4_t<T,N>& operator-=(S s) {
inline simd4_t<T,N>& operator-=(T s) {
for (int i=0; i<N; i++) {
vec[i] -= s;
}
@ -168,7 +227,7 @@ public:
inline simd4_t<T,N>& operator-=(const T v[N]) {
simd4_t<T,N> r(v);
*this -= r.simd4;
*this -= r.v4();
return *this;
}
inline simd4_t<T,N>& operator-=(const simd4_t<T,N>& v) {
@ -177,9 +236,7 @@ public:
}
return *this;
}
template<typename S>
inline simd4_t<T,N>& operator*=(S s) {
inline simd4_t<T,N>& operator*=(T s) {
for (int i=0; i<N; i++) {
vec[i] *= s;
}
@ -187,7 +244,7 @@ public:
}
inline simd4_t<T,N>& operator*=(const T v[N]) {
simd4_t<T,N> r(v);
*this *= r.simd4;
*this *= r.v4();
return *this;
}
inline simd4_t<T,N>& operator*=(const simd4_t<T,N>& v) {
@ -197,13 +254,12 @@ public:
return *this;
}
template<typename S>
inline simd4_t<T,N>& operator/=(S s) {
return operator*=(1/T(s));
inline simd4_t<T,N>& operator/=(T s) {
return operator*=(1/s);
}
inline simd4_t<T,N>& operator/=(const T v[N]) {
simd4_t<T,N> r(v);
*this /= r.simd4;
*this /= r.v4();
return *this;
}
inline simd4_t<T,N>& operator/=(const simd4_t<T,N>& v) {
@ -244,16 +300,6 @@ inline simd4_t<T,N> operator*(simd4_t<T,N> v, T f) {
return v;
}
namespace simd4
{
template<typename T, int N>
inline simd4_t<T,N> abs(simd4_t<T,N> v) {
for (int i=0; i<N; i++) {
v[i] = std::abs(v[i]);
}
return v;
}
};
# ifdef __MMX__
# include <mmintrin.h>
@ -271,18 +317,23 @@ private:
union {
__m128 simd4;
__vec4f_t vec;
float _v4[4];
};
public:
simd4_t(void) {}
simd4_t(float f) {
simd4 = _mm_set1_ps(f);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
}
explicit simd4_t(const __vec4f_t v) {
simd4 = _mm_loadu_ps(v);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
}
simd4_t(const simd4_t<float,N>& v) {
simd4 = v.simd4;
template<int M>
simd4_t(const simd4_t<float,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0.0f;
}
simd4_t(const __m128& v) {
simd4 = v;
@ -314,54 +365,58 @@ public:
inline simd4_t<float,N>& operator=(float f) {
simd4 = _mm_set1_ps(f);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
return *this;
}
inline simd4_t<float,N>& operator=(const __vec4f_t v) {
simd4 = _mm_loadu_ps(v);
for (int i=N; i<4; i++) _v4[i] = 0.0f;
return *this;
}
inline simd4_t<float,N>& operator=(const simd4_t<float,N>& v) {
simd4 = v.simd4;
template<int M>
inline simd4_t<float,N>& operator=(const simd4_t<float,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0.0f;
return *this;
}
inline simd4_t<float,N>& operator+=(float f) {
simd4_t<float,N> r(f);
simd4 += r.simd4;
simd4 += r.v4();
return *this;
}
inline simd4_t<float,N>& operator+=(const simd4_t<float,N>& v) {
simd4 += v.simd4;
simd4 += v.v4();
return *this;
}
inline simd4_t<float,N>& operator-=(float f) {
simd4_t<float,N> r(f);
simd4 -= r.simd4;
simd4 -= r.v4();
return *this;
}
inline simd4_t<float,N>& operator-=(const simd4_t<float,N>& v) {
simd4 -= v.simd4;
simd4 -= v.v4();
return *this;
}
inline simd4_t<float,N>& operator*=(float f) {
simd4_t<float,N> r(f);
simd4 *= r.simd4;
simd4 *= r.v4();
return *this;
}
inline simd4_t<float,N>& operator*=(const simd4_t<float,N>& v) {
simd4 *= v.simd4;
simd4 *= v.v4();
return *this;
}
inline simd4_t<float,N>& operator/=(float f) {
simd4_t<float,N> r(1.0f/f);
simd4 *= r.simd4;
simd4 *= r.v4();
return *this;
}
inline simd4_t<float,N>& operator/=(const simd4_t<float,N>& v) {
simd4 /= v.simd4;
simd4 /= v.v4();
return *this;
}
};
@ -390,20 +445,25 @@ private:
union {
__m128d simd4[2];
__vec4d_t vec;
double _v4[4];
};
public:
simd4_t(void) {}
simd4_t(double d) {
simd4[0] = simd4[1] = _mm_set1_pd(d);
for (int i=N; i<4; i++) _v4[i] = 0.0;
}
explicit simd4_t(const __vec4d_t v) {
simd4[0] = _mm_loadu_pd(v);
simd4[1] = _mm_loadu_pd(v+2);
for (int i=N; i<4; i++) _v4[i] = 0.0;
}
simd4_t(const simd4_t<double,N>& v) {
simd4[0] = v.simd4[0];
simd4[1] = v.simd4[1];
template<int M>
simd4_t(const simd4_t<double,M>& v) {
simd4[0] = v.v4()[0];
simd4[1] = v.v4()[1];
for (int i=M; i<4; i++) _v4[i] = 0.0;
}
simd4_t(const __m128d v[2]) {
simd4[0] = v[0];
@ -436,16 +496,20 @@ public:
inline simd4_t<double,N>& operator=(double d) {
simd4[0] = simd4[1] = _mm_set1_pd(d);
for (int i=N; i<4; i++) _v4[i] = 0.0;
return *this;
}
inline simd4_t<double,N>& operator=(const __vec4d_t v) {
simd4[0] = _mm_loadu_pd(v);
simd4[1] = _mm_loadu_pd(v+2);
for (int i=N; i<4; i++) _v4[i] = 0.0;
return *this;
}
inline simd4_t<double,N>& operator=(const simd4_t<double,N>& v) {
simd4[0] = v.simd4[0];
simd4[1] = v.simd4[1];
template<int M>
inline simd4_t<double,N>& operator=(const simd4_t<double,M>& v) {
simd4[0] = v.v4()[0];
simd4[1] = v.v4()[1];
for (int i=M; i<4; i++) _v4[i] = 0.0;
return *this;
}
inline simd4_t<double,N>& operator=(const __m128d v[2]) {
@ -456,49 +520,49 @@ public:
inline simd4_t<double,N>& operator+=(double d) {
simd4_t<double,N> r(d);
simd4[0] += r.simd4[0];
simd4[1] += r.simd4[1];
simd4[0] += r.v4()[0];
simd4[1] += r.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator+=(const simd4_t<double,N>& v) {
simd4[0] += v.simd4[0];
simd4[1] += v.simd4[1];
simd4[0] += v.v4()[0];
simd4[1] += v.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator-=(double d) {
simd4_t<double,N> r(d);
simd4[0] -= r.simd4[0];
simd4[1] -= r.simd4[1];
simd4[0] -= r.v4()[0];
simd4[1] -= r.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator-=(const simd4_t<double,N>& v) {
simd4[0] -= v.simd4[0];
simd4[1] -= v.simd4[1];
simd4[0] -= v.v4()[0];
simd4[1] -= v.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator*=(double d) {
simd4_t<double,N> r(d);
simd4[0] *= r.simd4[0];
simd4[1] *= r.simd4[1];
simd4[0] *= r.v4()[0];
simd4[1] *= r.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator*=(const simd4_t<double,N>& v) {
simd4[0] *= v.simd4[0];
simd4[1] *= v.simd4[1];
simd4[0] *= v.v4()[0];
simd4[1] *= v.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator/=(double d) {
simd4_t<double,N> r(1.0/d);
simd4[0] *= r.simd4[0];
simd4[1] *= r.simd4[1];
simd4[0] *= r.v4()[0];
simd4[1] *= r.v4()[1];
return *this;
}
inline simd4_t<double,N>& operator/=(const simd4_t<double,N>& v) {
simd4[0] /= v.simd4[0];
simd4[1] /= v.simd4[1];
simd4[0] /= v.v4()[0];
simd4[1] /= v.v4()[1];
return *this;
}
};
@ -528,18 +592,23 @@ private:
union {
__m128i simd4;
__vec4i_t vec;
int _v4[4];
};
public:
simd4_t(void) {}
simd4_t(int i) {
simd4 = _mm_set1_epi32(i);
for (int i=N; i<4; i++) _v4[i] = 0;
}
explicit simd4_t(const __vec4i_t v) {
simd4 = _mm_loadu_si128((__m128i*)v);
for (int i=N; i<4; i++) _v4[i] = 0;
}
simd4_t(const simd4_t<int,N>& v) {
simd4 = v.simd4;
template<int M>
simd4_t(const simd4_t<int,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0;
}
simd4_t(const __m128i& v) {
simd4 = v;
@ -571,54 +640,58 @@ public:
inline simd4_t<int,N>& operator=(int i) {
simd4 = _mm_set1_epi32(i);
for (int i=N; i<4; i++) _v4[i] = 0;
return *this;
}
inline simd4_t<int,N>& operator=(const __vec4i_t v) {
simd4 = _mm_loadu_si128((__m128i*)v);
for (int i=N; i<4; i++) _v4[i] = 0;
return *this;
}
inline simd4_t<int,N>& operator=(const simd4_t<int,N>& v) {
simd4 = v.simd4;
template<int M>
inline simd4_t<int,N>& operator=(const simd4_t<int,M>& v) {
simd4 = v.v4();
for (int i=M; i<4; i++) _v4[i] = 0;
return *this;
}
inline simd4_t<int,N>& operator+=(int i) {
simd4_t<int,N> r(i);
simd4 += r.simd4;
simd4 += r.v4();
return *this;
}
inline simd4_t<int,N>& operator+=(const simd4_t<int,N>& v) {
simd4 += v.simd4;
simd4 += v.v4();
return *this;
}
inline simd4_t<int,N>& operator-=(int i) {
simd4_t<int,N> r(i);
simd4 -= r.simd4;
simd4 -= r.v4();
return *this;
}
inline simd4_t<int,N>& operator-=(const simd4_t<int,N>& v) {
simd4 -= v.simd4;
simd4 -= v.v4();
return *this;
}
inline simd4_t<int,N>& operator*=(int i) {
simd4_t<int,N> r(i);
simd4 *= r.simd4;
simd4 *= r.v4();
return *this;
}
inline simd4_t<int,N>& operator*=(const simd4_t<int,N>& v) {
simd4 *= v.simd4;
simd4 *= v.v4();
return *this;
}
inline simd4_t<int,N>& operator/=(int i) {
simd4_t<int,N> r(i);
simd4 /= r.simd4;
simd4 /= r.v4();
return *this;
}
inline simd4_t<int,N>& operator/=(const simd4_t<int,N>& v) {
simd4 /= v.simd4;
simd4 /= v.v4();
return *this;
}
};

View File

@ -22,12 +22,80 @@
#include "simd.hxx"
template<typename T, int N> class simd4x4_t;
namespace simd4x4
{
template<typename T, int N>
inline void zeros(simd4x4_t<T,N>& r) {
std::memset(r, 0, sizeof(T[N][N]));
}
template<typename T, int N>
inline void unit(simd4x4_t<T,N>& r) {
zeros(r);
for (int i=0; i<N; i++) {
r.ptr()[i][i] = T(1);
}
}
template<typename T>
inline simd4x4_t<T,4> rotation_matrix(T angle, const simd4_t<T,3>& axis)
{
T s = std::sin(angle), c = std::cos(angle), t = T(1)-c;
simd4_t<T,3> at = axis*t, as = axis*s;
simd4x4_t<T,4> m;
simd4x4::unit(m);
for (int i=0; i<3; i++) {
simd4_t<T,4> r = axis.ptr()[i]*at;
for (int j=0; j<4; j++) {
m.m4x4()[0][j] = r.v4()[j];
}
}
m.ptr()[0][0] += c;
m.ptr()[0][1] += as.ptr()[2];
m.ptr()[0][2] -= as.ptr()[1];
m.ptr()[1][0] -= as.ptr()[2];
m.ptr()[1][1] += c;
m.ptr()[1][2] += as.ptr()[0];
m.ptr()[2][0] += as.ptr()[1];
m.ptr()[2][1] -= as.ptr()[0];
m.ptr()[2][2] += c;
return m;
}
template<typename T, int N>
inline void rotate(simd4x4_t<T,N>& mtx, T angle, const simd4_t<T,3>& axis) {
if (std::abs(angle) > std::numeric_limits<T>::epsilon()) {
mtx *= rotation_matrix(angle, axis);
}
}
template<typename T, int N>
inline simd4x4_t<T,N> transpose(simd4x4_t<T,N> mtx) {
simd4x4_t<T,N> m;
for (int i=0; i<N; i++) {
for(int j=0; j<N; j++) {
m.ptr()[j][i] = mtx.ptr()[i][j];
}
}
return m;
}
} /* namespace simd4x4 */
template<typename T, int N>
class simd4x4_t
{
private:
union {
T m4x4[4][4];
T _m4x4[4][4];
T mtx[N][N];
T array[N*N];
};
@ -37,11 +105,22 @@ public:
simd4x4_t(const T m[N*N]) {
std::memcpy(array, m, sizeof(T[N*N]));
}
simd4x4_t(const T m[N][N]) {
std::memcpy(array, m, sizeof(T[N*N]));
}
simd4x4_t(const simd4x4_t<T,N>& m) {
std::memcpy(array, m, sizeof(T[N*N]));
}
~simd4x4_t(void) {}
inline T (&m4x4(void))[4][4] {
return _m4x4;
}
inline const T (&m4x4(void) const)[4][4] {
return _m4x4;
}
inline const T (&ptr(void) const)[N][N] {
return mtx;
}
@ -62,6 +141,10 @@ public:
std::memcpy(array, m, sizeof(T[N*N]));
return *this;
}
inline simd4x4_t<T,N>& operator=(const T m[N][N]) {
std::memcpy(array, m, sizeof(T[N*N]));
return *this;
}
inline simd4x4_t<T,N>& operator=(const simd4x4_t<T,N>& m) {
std::memcpy(array, m, sizeof(T[N*N]));
return *this;
@ -142,22 +225,6 @@ public:
}
};
namespace simd4x4
{
template<typename T, int N>
void zeros(simd4x4_t<T,N>& r) {
std::memset(r, 0, sizeof(T[N][N]));
}
template<typename T, int N>
void unit(simd4x4_t<T,N>& r) {
zeros(r);
for (int i=0; i<N; i++) {
r.ptr()[i][i] = 1;
}
}
}
template<typename T, int N>
inline simd4x4_t<T,N> operator-(simd4x4_t<T,N> m) {
simd4x4_t<T,N> r;
@ -282,6 +349,22 @@ public:
}
return *this;
}
simd4x4_t<float,4>& operator*=(const simd4x4_t<float,4>& m2) {
simd4x4_t<float,4> m1 = *this;
simd4_t<float,4> row, col;
for (int i=0; i<4; i++) {
simd4_t<float,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
simd4_t<float,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
simd4x4[i] = row.v4();
}
return *this;
}
};
template<>
@ -297,24 +380,17 @@ inline simd4_t<float,4> operator*(const simd4x4_t<float,4>& m, const simd4_t<flo
return mv;
}
template<>
inline simd4x4_t<float,4> operator*(const simd4x4_t<float,4>& m1, const simd4x4_t<float,4>& m2)
namespace simd4x4
{
simd4_t<float,4> row, col;
simd4x4_t<float,4> m;
for (int i=0; i<4; i++) {
simd4_t<float,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
simd4_t<float,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
m.m4x4()[i] = row.v4();
}
template<>
inline simd4x4_t<float,4> transpose<float>(simd4x4_t<float,4> m) {
_MM_TRANSPOSE4_PS(m.m4x4()[0], m.m4x4()[1], m.m4x4()[2], m.m4x4()[3]);
return m;
}
} /* namespace simd4x */
# endif
@ -431,6 +507,24 @@ public:
}
return *this;
}
simd4x4_t<double,4>& operator*=(const simd4x4_t<double,4>& m2) {
simd4x4_t<double,4> m1 = *this;
simd4_t<double,4> row, col;
for (int i=0; i<4; i++ ) {
simd4_t<double,4> col = m1.m4x4()[0];
row = col * m2.ptr()[i][0];
for (int j=1; j<4; j++) {
col = m1.m4x4()[j];
row += col * m2.ptr()[i][j];
}
simd4x4[i][0] = row.v4()[0];
simd4x4[i][1] = row.v4()[1];
}
return *this;
}
};
@ -448,25 +542,70 @@ inline simd4_t<double,4> operator*(const simd4x4_t<double,4>& m, const simd4_t<d
return mv;
}
template<>
inline simd4x4_t<double,4> operator*(const simd4x4_t<double,4>& m1, const simd4x4_t<double,4>& m2)
namespace simd4x4
{
simd4_t<double,4> row, col;
template<>
inline simd4x4_t<double,4> rotation_matrix<double>(double angle, const simd4_t<double,3>& axis)
{
double s = std::sin(angle), c = std::cos(angle), t = 1.0-c;
simd4_t<double,3> axt, at = axis*t, as = axis*s;
simd4x4_t<double,4> m;
for (int i=0; i<4; i++ ) {
simd4_t<double,4> col = m1.m4x4()[0];
row = col * m2.ptr()[i][0];
for (int j=1; j<4; j++) {
col = m1.m4x4()[j];
row += col * m2.ptr()[i][j];
}
m.m4x4()[i][0] = row.v4()[0];
m.m4x4()[i][1] = row.v4()[1];
}
simd4x4::unit(m);
axt = axis.ptr()[0]*at;
m.m4x4()[0][0] = axt.v4()[0];
m.m4x4()[0][1] = axt.v4()[1];
axt = axis.ptr()[1]*at;
m.ptr()[0][0] += c;
m.ptr()[0][1] += as.ptr()[2];
m.ptr()[0][2] -= as.ptr()[1];
m.m4x4()[1][0] = axt.v4()[0];
m.m4x4()[1][1] = axt.v4()[1];
axt = axis.ptr()[2]*at;
m.ptr()[1][0] -= as.ptr()[2];
m.ptr()[1][1] += c;
m.ptr()[1][2] += as.ptr()[0];
m.m4x4()[2][0] = axt.v4()[0];
m.m4x4()[2][1] = axt.v4()[1];
m.ptr()[2][0] += as.ptr()[1];
m.ptr()[2][1] -= as.ptr()[0];
m.ptr()[2][2] += c;
return m;
}
template<>
inline simd4x4_t<double,4> transpose<double>(simd4x4_t<double,4> m) {
simd4x4_t<double,4> mtx;
__m128d T0 = _mm_unpacklo_pd(m.m4x4()[0][0], m.m4x4()[0][1]);
__m128d T1 = _mm_unpacklo_pd(m.m4x4()[1][0], m.m4x4()[1][1]);
__m128d T2 = _mm_unpackhi_pd(m.m4x4()[0][0], m.m4x4()[0][1]);
__m128d T3 = _mm_unpackhi_pd(m.m4x4()[1][0], m.m4x4()[1][1]);
mtx.m4x4()[0][0] = _mm_unpacklo_pd(T0, T1);
mtx.m4x4()[1][0] = _mm_unpacklo_pd(T2, T3);
mtx.m4x4()[2][0] = _mm_unpackhi_pd(T0, T1);
mtx.m4x4()[3][0] = _mm_unpackhi_pd(T2, T3);
T0 = _mm_unpacklo_pd(m.m4x4()[2][0], m.m4x4()[2][1]);
T1 = _mm_unpacklo_pd(m.m4x4()[3][0], m.m4x4()[3][1]);
T2 = _mm_unpackhi_pd(m.m4x4()[2][0], m.m4x4()[2][1]);
T3 = _mm_unpackhi_pd(m.m4x4()[3][0], m.m4x4()[3][1]);
mtx.m4x4()[0][1] = _mm_unpacklo_pd(T0, T1);
mtx.m4x4()[1][1] = _mm_unpacklo_pd(T2, T3);
mtx.m4x4()[2][1] = _mm_unpackhi_pd(T0, T1);
mtx.m4x4()[3][1] = _mm_unpackhi_pd(T2, T3);
return mtx;
}
} /* namespace simd4x4 */
# endif
@ -563,6 +702,22 @@ public:
}
return *this;
}
simd4x4_t<int,4>& operator*=(const simd4x4_t<int,4>& m2) {
simd4x4_t<int,4> m1 = *this;
simd4_t<int,4> row, col;
for (int i=0; i<4; i++) {
simd4_t<int,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
simd4_t<int,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
simd4x4[i] = row.v4();
}
return *this;
}
};
template<>
@ -596,6 +751,26 @@ inline simd4x4_t<int,4> operator*(const simd4x4_t<int,4>& m1, const simd4x4_t<in
return m;
}
namespace simd4x4
{
template<>
inline simd4x4_t<int,4> transpose<int>(simd4x4_t<int,4> m) {
__m128i T0 = _mm_unpacklo_epi32(m.m4x4()[0], m.m4x4()[1]);
__m128i T1 = _mm_unpacklo_epi32(m.m4x4()[2], m.m4x4()[3]);
__m128i T2 = _mm_unpackhi_epi32(m.m4x4()[0], m.m4x4()[1]);
__m128i T3 = _mm_unpackhi_epi32(m.m4x4()[2], m.m4x4()[3]);
m.m4x4()[0] = _mm_unpacklo_epi64(T0, T1);
m.m4x4()[1] = _mm_unpackhi_epi64(T0, T1);
m.m4x4()[2] = _mm_unpacklo_epi64(T2, T3);
m.m4x4()[3] = _mm_unpackhi_epi64(T2, T3);
return m;
}
} /* namespace simd4x */
# endif
#endif /* __SIMD4X4_H__ */