Skip to content

Commit

Permalink
Merge pull request #3 from etheory/etheory-patch-gccNarrowingFix
Browse files Browse the repository at this point in the history
Etheory patch gcc narrowing fix + MSVC support
  • Loading branch information
romeric committed Apr 27, 2017
2 parents 61cb030 + 4343f20 commit e729add
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 8 deletions.
52 changes: 48 additions & 4 deletions fastapprox/src/fastonebigheader.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,49 @@ typedef __m128i v4si;
#define v4si_to_v4sf _mm_cvtepi32_ps
#define v4sf_to_v4si _mm_cvttps_epi32

// Broadcast-literal helpers.
// NOTE(review): these three definitions look like the removed side of this
// diff; the #if/#else block that follows defines the same macro names again
// -- confirm which set is live before relying on them.
// v4sfl(x): compound literal of type const v4sf with four copies of x.
#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
// v2dil(x): compound literal of type const v4si with two copies of x.
#define v2dil(x) ((const v4si) { (x), (x) })
// v4sil(x): computes (x << 32) | x as unsigned long long and hands it to v2dil.
#define v4sil(x) v2dil((((unsigned long long) (x)) << 32) | (x))
#if _MSC_VER && !__INTEL_COMPILER
// Returns, as a char, the byte at offset `index` inside the storage of
// `value` (the function's own by-value copy of the argument).
// `index` is not checked against sizeof(T).
template <class T>
__forceinline char GetChar(T value, size_t index)
{
  const char* const bytes = reinterpret_cast<const char*>(&value);
  return bytes[index];
}

// AS_4CHARS(a): expands to four comma-separated char expressions, namely
// bytes 0, 1, 2 and 3 of `a` converted to int32_t, each read through GetChar.
// The argument `a` is evaluated four times.
#define AS_4CHARS(a) \
GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)

// _MM_SETR_EPI32(a0, a1, a2, a3): braced list of 16 chars holding the bytes
// of a0, a1, a2, a3 in that order.
// NOTE(review): presumably relies on MSVC's __m128i being brace-initializable
// from a leading 16-element char array member -- confirm against the MSVC
// <emmintrin.h> definition.
#define _MM_SETR_EPI32(a0, a1, a2, a3) \
{ AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }

// v4sfl(x): const v4sf brace-initialized with four copies of x.
#define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
// v4sil(x): const v4si brace-initialized with the bytes of x repeated four times.
#define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))

// Binary arithmetic on v4sf, each forwarded to the matching SSE intrinsic.
__forceinline const v4sf operator+(const v4sf& a, const v4sf& b)
{
  return _mm_add_ps(a, b);
}
__forceinline const v4sf operator-(const v4sf& a, const v4sf& b)
{
  return _mm_sub_ps(a, b);
}
__forceinline const v4sf operator/(const v4sf& a, const v4sf& b)
{
  return _mm_div_ps(a, b);
}
__forceinline const v4sf operator*(const v4sf& a, const v4sf& b)
{
  return _mm_mul_ps(a, b);
}

// Unary plus: hands back a copy of the operand.
__forceinline const v4sf operator+(const v4sf& a)
{
  return a;
}
// Unary minus: XORs every 32-bit lane of the operand with 0x80000000.
__forceinline const v4sf operator-(const v4sf& a)
{
  const v4si sign_bits = _mm_set1_epi32(0x80000000);
  return _mm_xor_ps(a, _mm_castsi128_ps(sign_bits));
}

// Bitwise AND / OR / XOR on v4sf, forwarded to the _ps intrinsics.
__forceinline const v4sf operator&(const v4sf& a, const v4sf& b)
{
  return _mm_and_ps(a, b);
}
__forceinline const v4sf operator|(const v4sf& a, const v4sf& b)
{
  return _mm_or_ps(a, b);
}
__forceinline const v4sf operator^(const v4sf& a, const v4sf& b)
{
  return _mm_xor_ps(a, b);
}

// Bitwise AND / OR / XOR on v4si, forwarded to the _si128 intrinsics.
__forceinline const v4si operator&(const v4si& a, const v4si& b)
{
  return _mm_and_si128(a, b);
}
__forceinline const v4si operator|(const v4si& a, const v4si& b)
{
  return _mm_or_si128(a, b);
}
__forceinline const v4si operator^(const v4si& a, const v4si& b)
{
  return _mm_xor_si128(a, b);
}

// Compound assignment on v4sf: stores the result of the corresponding binary
// operator into `a` and returns a copy of the updated value.
__forceinline const v4sf operator+=(v4sf& a, const v4sf& b)
{
  a = a + b;
  return a;
}
__forceinline const v4sf operator-=(v4sf& a, const v4sf& b)
{
  a = a - b;
  return a;
}
__forceinline const v4sf operator*=(v4sf& a, const v4sf& b)
{
  a = a * b;
  return a;
}
__forceinline const v4sf operator/=(v4sf& a, const v4sf& b)
{
  a = a / b;
  return a;
}

// Compound assignment on v4si, same pattern as above.
__forceinline const v4si operator|=(v4si& a, const v4si& b)
{
  a = a | b;
  return a;
}
__forceinline const v4si operator&=(v4si& a, const v4si& b)
{
  a = a & b;
  return a;
}
__forceinline const v4si operator^=(v4si& a, const v4si& b)
{
  a = a ^ b;
  return a;
}
#else
// Broadcast-literal helpers for compilers that accept compound literals.
// v4sfl(x): const v4sf compound literal with four copies of x.
#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
// v2dil(x): const v4si compound literal with two copies of x.
#define v2dil(x) ((const v4si) { (x), (x) })
// v4sil(x): replicates x into both 32-bit halves of a 64-bit value and passes
// that to v2dil. x is first converted to unsigned int so that a negative
// argument is not sign-extended into the upper half (which would turn every
// other lane into all ones), and so that no negative value is left-shifted.
// The single explicit cast back to long long keeps the braced initializer
// free of narrowing conversions. x is evaluated twice.
#define v4sil(x) v2dil((long long) ((((unsigned long long) (unsigned int) (x)) << 32) | (unsigned int) (x)))
#endif

// Union overlaying one v4sf (`f`) with a four-element float array (`array`).
typedef union { v4sf f; float array[4]; } v4sfindexer;
#define v4sf_index(_findx, _findi) \
Expand All @@ -122,13 +162,17 @@ typedef union { v4si i; int array[4]; } v4siindexer;
})

// Union overlaying one v4sf (`f`) and one v4si (`i`) in the same storage.
typedef union { v4sf f; v4si i; } v4sfv4sipun;
#define v4sf_fabs(x) \
// v4sf_fabs(x): ANDs every 32-bit lane of x with 0x7FFFFFFF.
// The MSVC branch uses intrinsics only; the other branch goes through the
// v4sfv4sipun union inside a statement expression.
// The argument is parenthesized, and the temporary has a macro-specific name
// so that a caller expression mentioning a variable called `vx` is not
// captured by the macro's own local.
#if _MSC_VER && !__INTEL_COMPILER
#define v4sf_fabs(x) _mm_and_ps((x), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
#else
#define v4sf_fabs(x) \
({ \
  v4sfv4sipun v4sf_fabs_pun_; \
  v4sf_fabs_pun_.f = (x); \
  v4sf_fabs_pun_.i &= v4sil (0x7FFFFFFF); \
  v4sf_fabs_pun_.f; \
})
#endif

#ifdef __cplusplus
} // end namespace
Expand Down
52 changes: 48 additions & 4 deletions fastapprox/src/sse.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,49 @@ typedef __m128i v4si;
#define v4si_to_v4sf _mm_cvtepi32_ps
#define v4sf_to_v4si _mm_cvttps_epi32

// Broadcast-literal helpers.
// NOTE(review): these three definitions look like the removed side of this
// diff; the #if/#else block that follows defines the same macro names again
// -- confirm which set is live before relying on them.
// v4sfl(x): compound literal of type const v4sf with four copies of x.
#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
// v2dil(x): compound literal of type const v4si with two copies of x.
#define v2dil(x) ((const v4si) { (x), (x) })
// v4sil(x): computes (x << 32) | x as unsigned long long and hands it to v2dil.
#define v4sil(x) v2dil((((unsigned long long) (x)) << 32) | (x))
#if _MSC_VER && !__INTEL_COMPILER
// Returns, as a char, the byte at offset `index` inside the storage of
// `value` (the function's own by-value copy of the argument).
// `index` is not checked against sizeof(T).
template <class T>
__forceinline char GetChar(T value, size_t index)
{
  const char* const bytes = reinterpret_cast<const char*>(&value);
  return bytes[index];
}

// AS_4CHARS(a): expands to four comma-separated char expressions, namely
// bytes 0, 1, 2 and 3 of `a` converted to int32_t, each read through GetChar.
// The argument `a` is evaluated four times.
#define AS_4CHARS(a) \
GetChar(int32_t(a), 0), GetChar(int32_t(a), 1), \
GetChar(int32_t(a), 2), GetChar(int32_t(a), 3)

// _MM_SETR_EPI32(a0, a1, a2, a3): braced list of 16 chars holding the bytes
// of a0, a1, a2, a3 in that order.
// NOTE(review): presumably relies on MSVC's __m128i being brace-initializable
// from a leading 16-element char array member -- confirm against the MSVC
// <emmintrin.h> definition.
#define _MM_SETR_EPI32(a0, a1, a2, a3) \
{ AS_4CHARS(a0), AS_4CHARS(a1), AS_4CHARS(a2), AS_4CHARS(a3) }

// v4sfl(x): const v4sf brace-initialized with four copies of x.
#define v4sfl(x) (const v4sf { (x), (x), (x), (x) })
// v4sil(x): const v4si brace-initialized with the bytes of x repeated four times.
#define v4sil(x) (const v4si _MM_SETR_EPI32(x, x, x, x))

// Binary arithmetic on v4sf, each forwarded to the matching SSE intrinsic.
__forceinline const v4sf operator+(const v4sf& a, const v4sf& b)
{
  return _mm_add_ps(a, b);
}
__forceinline const v4sf operator-(const v4sf& a, const v4sf& b)
{
  return _mm_sub_ps(a, b);
}
__forceinline const v4sf operator/(const v4sf& a, const v4sf& b)
{
  return _mm_div_ps(a, b);
}
__forceinline const v4sf operator*(const v4sf& a, const v4sf& b)
{
  return _mm_mul_ps(a, b);
}

// Unary plus: hands back a copy of the operand.
__forceinline const v4sf operator+(const v4sf& a)
{
  return a;
}
// Unary minus: XORs every 32-bit lane of the operand with 0x80000000.
__forceinline const v4sf operator-(const v4sf& a)
{
  const v4si sign_bits = _mm_set1_epi32(0x80000000);
  return _mm_xor_ps(a, _mm_castsi128_ps(sign_bits));
}

// Bitwise AND / OR / XOR on v4sf, forwarded to the _ps intrinsics.
__forceinline const v4sf operator&(const v4sf& a, const v4sf& b)
{
  return _mm_and_ps(a, b);
}
__forceinline const v4sf operator|(const v4sf& a, const v4sf& b)
{
  return _mm_or_ps(a, b);
}
__forceinline const v4sf operator^(const v4sf& a, const v4sf& b)
{
  return _mm_xor_ps(a, b);
}

// Bitwise AND / OR / XOR on v4si, forwarded to the _si128 intrinsics.
__forceinline const v4si operator&(const v4si& a, const v4si& b)
{
  return _mm_and_si128(a, b);
}
__forceinline const v4si operator|(const v4si& a, const v4si& b)
{
  return _mm_or_si128(a, b);
}
__forceinline const v4si operator^(const v4si& a, const v4si& b)
{
  return _mm_xor_si128(a, b);
}

// Compound assignment on v4sf: stores the result of the corresponding binary
// operator into `a` and returns a copy of the updated value.
__forceinline const v4sf operator+=(v4sf& a, const v4sf& b)
{
  a = a + b;
  return a;
}
__forceinline const v4sf operator-=(v4sf& a, const v4sf& b)
{
  a = a - b;
  return a;
}
__forceinline const v4sf operator*=(v4sf& a, const v4sf& b)
{
  a = a * b;
  return a;
}
__forceinline const v4sf operator/=(v4sf& a, const v4sf& b)
{
  a = a / b;
  return a;
}

// Compound assignment on v4si, same pattern as above.
__forceinline const v4si operator|=(v4si& a, const v4si& b)
{
  a = a | b;
  return a;
}
__forceinline const v4si operator&=(v4si& a, const v4si& b)
{
  a = a & b;
  return a;
}
__forceinline const v4si operator^=(v4si& a, const v4si& b)
{
  a = a ^ b;
  return a;
}
#else
// Broadcast-literal helpers for compilers that accept compound literals.
// v4sfl(x): const v4sf compound literal with four copies of x.
#define v4sfl(x) ((const v4sf) { (x), (x), (x), (x) })
// v2dil(x): const v4si compound literal with two copies of x.
#define v2dil(x) ((const v4si) { (x), (x) })
// v4sil(x): replicates x into both 32-bit halves of a 64-bit value and passes
// that to v2dil. x is first converted to unsigned int so that a negative
// argument is not sign-extended into the upper half (which would turn every
// other lane into all ones), and so that no negative value is left-shifted.
// The single explicit cast back to long long keeps the braced initializer
// free of narrowing conversions. x is evaluated twice.
#define v4sil(x) v2dil((long long) ((((unsigned long long) (unsigned int) (x)) << 32) | (unsigned int) (x)))
#endif

// Union overlaying one v4sf (`f`) with a four-element float array (`array`).
typedef union { v4sf f; float array[4]; } v4sfindexer;
#define v4sf_index(_findx, _findi) \
Expand All @@ -73,13 +113,17 @@ typedef union { v4si i; int array[4]; } v4siindexer;
})

// Union overlaying one v4sf (`f`) and one v4si (`i`) in the same storage.
typedef union { v4sf f; v4si i; } v4sfv4sipun;
#define v4sf_fabs(x) \
// v4sf_fabs(x): ANDs every 32-bit lane of x with 0x7FFFFFFF.
// The MSVC branch uses intrinsics only; the other branch goes through the
// v4sfv4sipun union inside a statement expression.
// The argument is parenthesized, and the temporary has a macro-specific name
// so that a caller expression mentioning a variable called `vx` is not
// captured by the macro's own local.
#if _MSC_VER && !__INTEL_COMPILER
#define v4sf_fabs(x) _mm_and_ps((x), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))
#else
#define v4sf_fabs(x) \
({ \
  v4sfv4sipun v4sf_fabs_pun_; \
  v4sf_fabs_pun_.f = (x); \
  v4sf_fabs_pun_.i &= v4sil (0x7FFFFFFF); \
  v4sf_fabs_pun_.f; \
})
#endif

#ifdef __cplusplus
} // end namespace
Expand Down

0 comments on commit e729add

Please sign in to comment.