Finish SIMD Vector code, do not change matrix operations yet since it seems to have a problem

This commit is contained in:
Erik Hofman 2016-11-25 13:31:54 +01:00
parent 16d62f93c8
commit e7f80cf5f3
4 changed files with 432 additions and 39 deletions

View File

@ -223,7 +223,7 @@ template<typename T>
inline
T
dot(SGVec2<T> v1, const SGVec2<T>& v2)
{ v1.simd2() *= v2.simd2(); return (v1(0)+v1(1)+v1(2)); }
{ v1.simd2() *= v2.simd2(); return (v1(0)+v1(1)); }
/// The euclidean norm of the vector, that is what most people call length
template<typename T>

View File

@ -20,7 +20,7 @@
#include <iosfwd>
#include <simgear/math/SGVec2.hxx>
#include "SGVec2.hxx"
#include <simgear/math/SGGeodesy.hxx>
#include "simd.hxx"

View File

@ -26,7 +26,7 @@ class simd4_t
{
private:
union {
T v4[4];
T _v4[4];
T vec[N];
};
@ -214,6 +214,36 @@ public:
}
};
template<typename T, int N>
inline simd4_t<T,N> operator+(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
v1 += v2;
return v1;
}
template<typename T, int N>
inline simd4_t<T,N> operator-(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
v1 -= v2;
return v1;
}
template<typename T, int N>
inline simd4_t<T,N> operator*(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
v1 *= v2;
return v1;
}
template<typename T, int N>
inline simd4_t<T,N> operator*(T f, simd4_t<T,N> v) {
v *= f;
return v;
}
template<typename T, int N>
inline simd4_t<T,N> operator*(simd4_t<T,N> v, T f) {
v *= f;
return v;
}
namespace simd4
{
template<typename T, int N>
@ -254,11 +284,9 @@ public:
simd4_t(const simd4_t<float,N>& v) {
simd4 = v.simd4;
}
#if 0
simd4_t(const simd4_t<double,N>& v) {
simd4 = _mm_movelh_ps(_mm_cvtpd_ps(v.v4()[0]), _mm_cvtpd_ps(v.v4()[1]));
simd4_t(const __m128& v) {
simd4 = v;
}
#endif
inline __m128 (&v4(void)) {
return simd4;
@ -350,7 +378,9 @@ inline simd4_t<float,N>abs(simd4_t<float,N> v) {
# endif
# ifdef __SSE2
# ifdef __SSE2__
# include <emmintrin.h>
template<int N>
class simd4_t<double,N>
{
@ -375,6 +405,10 @@ public:
simd4[0] = v.simd4[0];
simd4[1] = v.simd4[1];
}
simd4_t(const __m128d v[2]) {
simd4[0] = v[0];
simd4[1] = v[1];
}
inline __m128d (&v4(void))[2] {
return simd4;
@ -401,7 +435,7 @@ public:
}
inline simd4_t<double,N>& operator=(double d) {
simd4[0] = simd[1] = _mm_set1_pd(d);
simd4[0] = simd4[1] = _mm_set1_pd(d);
return *this;
}
inline simd4_t<double,N>& operator=(const __vec4d_t v) {
@ -414,6 +448,11 @@ public:
simd4[1] = v.simd4[1];
return *this;
}
inline simd4_t<double,N>& operator=(const __m128d v[2]) {
simd4[0] = v[0];
simd4[1] = v[1];
return *this;
}
inline simd4_t<double,N>& operator+=(double d) {
simd4_t<double,N> r(d);
@ -469,8 +508,8 @@ namespace simd4
template<int N>
inline simd4_t<double,N>abs(simd4_t<double,N> v) {
static const __m128d sign_mask = _mm_set1_pd(-0.); // -0. = 1 << 63
v.v4[0] = mm_andnot_ps(sign_mask, v4[0]);
v.v4[1] = _mm_andnot_ps(sign_mask, v4[1]);
v.v4()[0] = _mm_andnot_pd(sign_mask, v.v4()[0]);
v.v4()[1] = _mm_andnot_pd(sign_mask, v.v4()[1]);
return v;
}
};
@ -502,6 +541,17 @@ public:
simd4_t(const simd4_t<int,N>& v) {
simd4 = v.simd4;
}
simd4_t(const __m128i& v) {
simd4 = v;
}
inline __m128i (&v4(void)) {
return simd4;
}
inline const __m128i (&v4(void) const) {
return simd4;
}
inline const int (&ptr(void) const)[N] {
return vec;

View File

@ -20,6 +20,7 @@
#include <cstring>
#include "simd.hxx"
template<typename T, int N>
class simd4x4_t
@ -33,11 +34,11 @@ private:
public:
simd4x4_t(void) {}
explicit simd4x4_t(const T m[N][N]) {
std::memcpy(array, m, sizeof(T[N][N]));
simd4x4_t(const T m[N*N]) {
std::memcpy(array, m, sizeof(T[N*N]));
}
simd4x4_t(const simd4x4_t<T,N>& m) {
std::memcpy(array, m, sizeof(T[N][N]));
std::memcpy(array, m, sizeof(T[N*N]));
}
~simd4x4_t(void) {}
@ -57,12 +58,12 @@ public:
return array;
}
inline simd4x4_t<T,N>& operator=(const T m[N][N]) {
std::memcpy(array, m, sizeof(T[N][N]));
inline simd4x4_t<T,N>& operator=(const T m[N*N]) {
std::memcpy(array, m, sizeof(T[N*N]));
return *this;
}
inline simd4x4_t<T,N>& operator=(const simd4x4_t<T,N>& m) {
std::memcpy(array, m, sizeof(T[N][N]));
std::memcpy(array, m, sizeof(T[N*N]));
return *this;
}
@ -105,15 +106,33 @@ public:
return *this;
}
template<typename S>
inline simd4x4_t<T,N>& operator*=(S s) {
inline simd4x4_t<T,N>& operator*=(T s) {
for (int i=0; i<N*N; i++) {
array[i] *= s;
}
return *this;
}
inline simd4x4_t<T,N>& operator*=(const simd4x4_t<T,N>& m) {
//TODO
simd4x4_t<T,N>& operator*=(const simd4x4_t<T,N>& m1) {
simd4x4_t<T,N> m2 = *this;
simd4_t<T,N> row;
for (int j=0; j<N; j++) {
for (int r=0; r<N; r++) {
row[r] = m2.ptr()[r][0];
}
row *= m1.ptr()[0][j];
for (int r=0; r<N; r++) {
mtx[r][j] = row[r];
}
for (int i=1; i<N; i++) {
for (int r=0; r<N; r++) {
row[r] = m2.ptr()[r][i];
}
row *= m1.ptr()[i][j];
for (int r=0; r<N; r++) {
mtx[r][j] += row[r];
}
}
}
return *this;
}
@ -133,28 +152,43 @@ void zeros(simd4x4_t<T,N>& r) {
template<typename T, int N>
void unit(simd4x4_t<T,N>& r) {
zeros(r);
for (int i=0; i<4; i++) {
for (int i=0; i<N; i++) {
r.ptr()[i][i] = 1;
}
}
}
#if 0
template<typename S, int N>
simd4_t<T,N>
_pt4Matrix4_sse(const simd4x4_t<T,N>& m, const simd4_t<S,N>& vi)
{
simd4_t<T,N> r;
simd4_t<T,N> x(vi[0]);
r = x*m[0];
for (int i=1; i<N; i++ {
x = vi[i];
r += x*m[i];
}
template<typename T, int N>
inline simd4x4_t<T,N> operator-(simd4x4_t<T,N> m) {
simd4x4_t<T,N> r;
simd4x4::zeros(r);
r -= m;
return r;
}
#endif
template<typename T, int N>
inline simd4_t<T,N> operator*(const simd4x4_t<T,N>& m, const simd4_t<T,N>& vi)
{
simd4_t<T,N> mv;
simd4_t<T,N> row(m);
mv = vi.ptr()[0] * row;
for (int j=1; j<N; j++) {
simd4_t<T,N> row(m[j*N]);
mv += vi.ptr()[j] * row;
}
return mv;
}
template<typename T, int N>
inline simd4x4_t<T,N> operator*(const simd4x4_t<T,N>& m1, const simd4x4_t<T,N>& m2)
{
simd4x4_t<T,N> m = m1;
m *= m2;
return m;
}
# ifdef __SSE__
# include <xmmintrin.h>
@ -172,9 +206,15 @@ private:
public:
simd4x4_t(void) {}
simd4x4_t(const float m[4*4]) {
for (int i=0; i<4; i++) {
simd4x4[i] = simd4_t<float,4>((const float*)&m[4*i]).v4();
}
}
explicit simd4x4_t(const __mtx4f_t m) {
for (int i=0; i<4; i++) {
simd4x4[i] = _mm_loadu_ps(m[i]);
simd4x4[i] = simd4_t<float,4>(m[i]).v4();
}
}
simd4x4_t(const simd4x4_t<float,4>& m) {
@ -210,7 +250,7 @@ public:
inline simd4x4_t<float,4>& operator=(const __mtx4f_t m) {
for (int i=0; i<4; i++) {
simd4x4[i] = _mm_loadu_ps(m[i]);
simd4x4[i] = simd4_t<float,4>(m[i]).v4();
}
return *this;
}
@ -236,14 +276,317 @@ public:
}
inline simd4x4_t<float,4>& operator*=(float f) {
__m128 f4 = _mm_set1_ps(f);
simd4_t<float,4> f4(f);
for (int i=0; i<4; i++) {
simd4x4[i] *= f4;
simd4x4[i] *= f4.v4();
}
return *this;
}
};
template<>
inline simd4_t<float,4> operator*(const simd4x4_t<float,4>& m, const simd4_t<float,4>& vi)
{
simd4_t<float,4> mv(m);
mv *= vi.ptr()[0];
for (int i=1; i<4; i++) {
simd4_t<float,4> row(m.m4x4()[i]);
row *= vi.ptr()[i];
mv.v4() += row.v4();
}
return mv;
}
template<>
inline simd4x4_t<float,4> operator*(const simd4x4_t<float,4>& m1, const simd4x4_t<float,4>& m2)
{
simd4_t<float,4> row, col;
simd4x4_t<float,4> m;
for (int i=0; i<4; i++) {
simd4_t<float,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
simd4_t<float,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
m.m4x4()[i] = row.v4();
}
return m;
}
# endif
# ifdef __SSE2__
# include <emmintrin.h>
template<>
class simd4x4_t<double,4>
{
private:
typedef double __mtx4d_t[4][4];
union {
__m128d simd4x4[4][2];
__mtx4d_t mtx;
double array[4*4];
};
public:
simd4x4_t(void) {}
explicit simd4x4_t(const double m[4*4]) {
for (int i=0; i<4; i++) {
simd4x4[i][0] = simd4_t<double,4>((const double*)&m[4*i]).v4()[0];
simd4x4[i][1] = simd4_t<double,4>((const double*)&m[4*i+2]).v4()[1];
}
}
explicit simd4x4_t(const __mtx4d_t m) {
for (int i=0; i<4; i++) {
simd4x4[i][0] = simd4_t<double,4>(m[i]).v4()[0];
simd4x4[i][1] = simd4_t<double,4>(m[i]).v4()[1];
}
}
simd4x4_t(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i][0] = m.m4x4()[i][0];
simd4x4[i][1] = m.m4x4()[i][1];
}
}
~simd4x4_t(void) {}
inline __m128d (&m4x4(void))[4][2] {
return simd4x4;
}
inline const __m128d (&m4x4(void) const)[4][2] {
return simd4x4;
}
inline const double (&ptr(void) const)[4][4] {
return mtx;
}
inline double (&ptr(void))[4][4] {
return mtx;
}
inline operator const double*(void) const {
return array;
}
inline operator double*(void) {
return array;
}
inline simd4x4_t<double,4>& operator=(const __mtx4d_t m) {
for (int i=0; i<4; i++) {
simd4x4[i][0] = simd4_t<double,4>(m[i]).v4()[0];
simd4x4[i][1] = simd4_t<double,4>(m[i]).v4()[1];
}
return *this;
}
inline simd4x4_t<double,4>& operator=(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i][0] = m.m4x4()[i][0];
simd4x4[i][1] = m.m4x4()[i][1];
}
return *this;
}
inline simd4x4_t<double,4>& operator+=(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i][0] += m.m4x4()[i][0];
simd4x4[i][1] += m.m4x4()[i][1];
}
return *this;
}
inline simd4x4_t<double,4>& operator-=(const simd4x4_t<double,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i][0] -= m.m4x4()[i][0];
simd4x4[i][1] -= m.m4x4()[i][1];
}
return *this;
}
inline simd4x4_t<double,4>& operator*=(double f) {
simd4_t<double,4> f4(f);
for (int i=0; i<4; i++) {
simd4x4[i][0] *= f4.v4()[0];
simd4x4[i][1] *= f4.v4()[0];
}
return *this;
}
};
#if 0
template<>
inline simd4_t<double,4> operator*(const simd4x4_t<double,4>& m, const simd4_t<double,4>& vi)
{
simd4_t<double,4> mv(m.m4x4()[0]);
mv.v4()[0] *= vi.ptr()[0];
mv.v4()[1] *= vi.ptr()[2];
for (int i=1; i<4; i+=2) {
simd4_t<double,4> row = m.m4x4()[i];
row.v4()[0] *= vi.ptr()[i];
row.v4()[1] *= vi.ptr()[i+2];
mv += row;
}
return mv;
}
#endif
template<>
inline simd4x4_t<double,4> operator*(const simd4x4_t<double,4>& m1, const simd4x4_t<double,4>& m2)
{
simd4_t<double,4> row, col;
simd4x4_t<double,4> m;
for (int i=0; i<4; i++ ) {
simd4_t<double,4> col = m1.m4x4()[0];
row = col * m2.ptr()[i][0];
for (int j=1; j<4; j++) {
col = m1.m4x4()[j];
row += col * m2.ptr()[i][j];
}
m.m4x4()[i][0] = row.v4()[0];
m.m4x4()[i][1] = row.v4()[1];
}
return m;
}
# endif
# ifdef __SSE2__
# include <xmmintrin.h>
template<>
class simd4x4_t<int,4>
{
private:
typedef int __mtx4i_t[4][4];
union {
__m128i simd4x4[4];
__mtx4i_t mtx;
int array[4*4];
};
public:
simd4x4_t(void) {}
simd4x4_t(const int m[4*4]) {
for (int i=0; i<4; i++) {
simd4x4[i] = simd4_t<int,4>((const int*)&m[4*i]).v4();
}
}
explicit simd4x4_t(const __mtx4i_t m) {
for (int i=0; i<4; i++) {
simd4x4[i] = simd4_t<int,4>(m[i]).v4();
}
}
simd4x4_t(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i] = m.m4x4()[i];
}
}
~simd4x4_t(void) {}
inline __m128i (&m4x4(void))[4] {
return simd4x4;
}
inline const __m128i (&m4x4(void) const)[4] {
return simd4x4;
}
inline const int (&ptr(void) const)[4][4] {
return mtx;
}
inline int (&ptr(void))[4][4] {
return mtx;
}
inline operator const int*(void) const {
return array;
}
inline operator int*(void) {
return array;
}
inline simd4x4_t<int,4>& operator=(const __mtx4i_t m) {
for (int i=0; i<4; i++) {
simd4x4[i] = simd4_t<int,4>(m[i]).v4();
}
return *this;
}
inline simd4x4_t<int,4>& operator=(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i] = m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<int,4>& operator+=(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i] += m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<int,4>& operator-=(const simd4x4_t<int,4>& m) {
for (int i=0; i<4; i++) {
simd4x4[i] -= m.m4x4()[i];
}
return *this;
}
inline simd4x4_t<int,4>& operator*=(int f) {
simd4_t<int,4> f4(f);
for (int i=0; i<4; i++) {
simd4x4[i] *= f4.v4();
}
return *this;
}
};
template<>
inline simd4_t<int,4> operator*(const simd4x4_t<int,4>& m, const simd4_t<int,4>& vi)
{
simd4_t<int,4> mv(m);
mv *= vi.ptr()[0];
for (int i=1; i<4; i++) {
simd4_t<int,4> row(m.m4x4()[i]);
row *= vi.ptr()[i];
mv.v4() += row.v4();
}
return mv;
}
template<>
inline simd4x4_t<int,4> operator*(const simd4x4_t<int,4>& m1, const simd4x4_t<int,4>& m2)
{
simd4_t<int,4> row, col;
simd4x4_t<int,4> m;
for (int i=0; i<4; i++) {
simd4_t<int,4> col(m2.ptr()[i][0]);
row.v4() = m1.m4x4()[0] * col.v4();
for (int j=1; j<4; j++) {
simd4_t<int,4> col(m2.ptr()[i][j]);
row.v4() += m1.m4x4()[j] * col.v4();
}
m.m4x4()[i] = row.v4();
}
return m;
}
# endif
#endif /* __SIMD4X4_H__ */