Compare commits
14 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | cf9ceb6bf9 |  |
|  | 7589ae0de5 |  |
|  | f46e5b37e9 |  |
|  | 560acd5017 |  |
|  | 2267f278d2 |  |
|  | 0feeef585c |  |
|  | 6211966c55 |  |
|  | 92f591b4bd |  |
|  | 29ceb42b7b |  |
|  | adaabe5993 |  |
|  | 6c392ee4a1 |  |
|  | 7699eda5ba |  |
|  | d8b5fd5409 |  |
|  | b37ffdbe85 |  |
@@ -933,7 +933,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
             EXPORT_MODULE(builtins_bitcode_generic_16_32bit);
         }
         else {
-            EXPORT_MODULE(builtins_bitcode_generic_4_64bit);
+            EXPORT_MODULE(builtins_bitcode_generic_16_64bit);
         }
         break;
     case 32:
10 ctx.cpp
@@ -344,6 +344,14 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
         AssertPos(currentPos, diSubprogramType.Verify());
     }
 
+#if defined(LLVM_3_4)
+    Assert(diSubprogramType.isCompositeType());
+    llvm::DICompositeType diSubprogramType_n =
+        static_cast<llvm::DICompositeType>(diSubprogramType);
+#else
+    llvm::DIType diSubprogramType_n = diSubprogramType;
+#endif
+
     std::string mangledName = llvmFunction->getName();
     if (mangledName == funSym->name)
         mangledName = "";
@@ -356,7 +364,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
     diSubprogram =
         m->diBuilder->createFunction(diFile /* scope */, funSym->name,
                                      mangledName, diFile,
-                                     firstLine, diSubprogramType,
+                                     firstLine, diSubprogramType_n,
                                      isStatic, true, /* is defn */
                                      firstLine,
                                      flags,
@@ -1,4 +1,20 @@
-=== v1.4.0 === (27 May 2013)
+=== v1.4.2 === (11 June 2013)
+
+A minor version update with a few important changes:
+
+* Stability fix for AVX2 target (Haswell) - the fix for the gather instruction problem was
+  released in LLVM 3.4; if you build with LLVM 3.2 or 3.3, it's available in our
+  repository (llvm_patches/r183327-AVX2-GATHER.patch) and needs to be applied
+  manually.
+
+* Stability fix for widespread issue on Win32 platform (#503).
+
+* Performance improvements for Xeon Phi related to mask representation.
+
+Also LLVM 3.3 has been released and now it's the recommended version for building ISPC.
+Precompiled binaries are also built with LLVM 3.3.
+
+=== v1.4.1 === (28 May 2013)
 
 A major new version of ispc has been released with stability and performance
 improvements on all supported platforms (Windows, Linux and MacOS).
@@ -20,7 +36,7 @@ Important bug fixes/changes:
 
 * FMA instructions are enabled for AVX2 instruction set.
 
-* Support of RDRAND instruction when availible via library function rdrand (Ivy Bridge).
+* Support of RDRAND instruction when available via library function rdrand (Ivy Bridge).
 
 Release also contains numerous bug fixes and minor improvements.
@@ -2,7 +2,18 @@
 ispc News
 =========
 
-ispc 1.4.0 is Released
+ispc 1.4.2 is Released
 ----------------------
 
+A minor update of ``ispc`` has been released with stability fix for AVX2
+(Haswell), fix for Win32 platform and performance improvements on Xeon Phi.
+As usual, it's available on all supported platforms (Windows, Linux and MacOS).
+This version supports LLVM 3.1, 3.2, 3.3 and 3.4, but now we are recommending
+to avoid 3.1, as it's known to contain a number of stability problems and we are
+planning to deprecate its support soon.
+The released binaries are built with 3.3.
+
+ispc 1.4.1 is Released
+----------------------
+
 A major new version of ``ispc`` has been released with stability and
@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER = 1.4.0
+PROJECT_NUMBER = 1.4.2
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
@@ -75,43 +75,7 @@ typedef int64_t __vec1_i64;
 
 struct __vec16_i32;
 
-typedef struct PRE_ALIGN(2) __vec16_i1 {
-    FORCEINLINE operator __mmask16() const { return m; }
-    FORCEINLINE __vec16_i1() { /* FIXME? __mm512_undef_mask(); */ }
-    FORCEINLINE __vec16_i1(const __mmask16 &in) : m(in) {}
-    FORCEINLINE __vec16_i1(const __vec16_i32 &in);
-    FORCEINLINE __vec16_i1(const __vec16_i1 &o) : m(o.m) {}
-    FORCEINLINE __vec16_i1& operator=(const __vec16_i1 &o) { m = o.m; return *this; }
-    FORCEINLINE __vec16_i1(uint32_t v00, uint32_t v01, uint32_t v02, uint32_t v03,
-                           uint32_t v04, uint32_t v05, uint32_t v06, uint32_t v07,
-                           uint32_t v08, uint32_t v09, uint32_t v10, uint32_t v11,
-                           uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15) {
-        m = (v00) |
-            ((v01) << 1) |
-            ((v02) << 2) |
-            ((v03) << 3) |
-            ((v04) << 4) |
-            ((v05) << 5) |
-            ((v06) << 6) |
-            ((v07) << 7) |
-            ((v08) << 8) |
-            ((v09) << 9) |
-            ((v10) << 10) |
-            ((v11) << 11) |
-            ((v12) << 12) |
-            ((v13) << 13) |
-            ((v14) << 14) |
-            ((v15) << 15);
-    }
-
-    union {
-        __mmask16 m;
-        struct {
-            __mmask8 m1;
-            __mmask8 m2;
-        } m8;
-    };
-} POST_ALIGN(2) __vec16_i1;
+typedef __mmask16 POST_ALIGN(2) __vec16_i1;
 
 typedef struct PRE_ALIGN(64) __vec16_f {
     FORCEINLINE operator __m512() const { return v; }
@@ -159,10 +123,6 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
     __m512i v;
 } POST_ALIGN(64) __vec16_i32;
 
-FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32 &in) {
-    m = _mm512_test_epi32_mask(in, in);
-}
-
 typedef struct PRE_ALIGN(64) __vec16_i64 {
     FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {}
     FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {}
@@ -325,7 +285,7 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
 
 static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
                                        __vec16_i1 b) {
-    return ((a.m & mask.m) | (b.m & ~mask.m));
+    return ((a & mask) | (b & ~mask));
     //return __or(__and(a, mask), __andnr(b, mask));
 }
 
@@ -335,7 +295,7 @@ static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) {
 
 
 static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) {
-    return (mask.m & (1 << index)) ? true : false;
+    return (mask & (1 << index)) ? true : false;
 }
 
 /*
@@ -351,13 +311,13 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
 template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
     const uint16_t *ptr = (const uint16_t *)p;
     __vec16_i1 r;
-    r.m = *ptr;
+    r = *ptr;
     return r;
 }
 
 template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
     uint16_t *ptr = (uint16_t *)p;
-    *ptr = v.m;
+    *ptr = v;
 }
 
 template <class RetVecType> RetVecType __smear_i1(int i);
@@ -556,7 +516,7 @@ static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a,
 
 static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask,
                                         __vec16_i32 a, __vec16_i32 b) {
-    return _mm512_mask_mov_epi32(b.v, mask.m, a.v);
+    return _mm512_mask_mov_epi32(b.v, mask, a.v);
 }
 
 static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) {
@@ -785,8 +745,8 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, con
 static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
                                         __vec16_i64 a, __vec16_i64 b) {
     __vec16_i64 ret;
-    ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask.m, a.v_hi);
-    ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask.m, a.v_lo);
+    ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi);
+    ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo);
     return ret;
 }
 
@@ -1077,113 +1037,134 @@ static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) {
 }
 
 static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmpeq_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmpeq_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b,
                                                       __vec16_i1 m) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_mask_cmpeq_pd_mask(m.m8.m1, a.v1, b.v1);
-    ret.m8.m2 = _mm512_mask_cmpeq_pd_mask(m.m8.m2, a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1);
+    __vec16_i1 tmp_m = m;
+    ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmpneq_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b,
                                                           __vec16_i1 m) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_mask_cmpneq_pd_mask(m.m8.m1, a.v1, b.v1);
-    ret.m8.m2 = _mm512_mask_cmpneq_pd_mask(m.m8.m2, a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    __vec16_i1 tmp_m = m;
+    ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1);
+    ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmplt_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b,
                                                           __vec16_i1 m) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_mask_cmplt_pd_mask(m.m8.m1, a.v1, b.v1);
-    ret.m8.m2 = _mm512_mask_cmplt_pd_mask(m.m8.m2, a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    __vec16_i1 tmp_m = m;
+    ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1);
+    ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmple_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmple_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmple_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b,
                                                            __vec16_i1 m) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_mask_cmple_pd_mask(m.m8.m1, a.v1, b.v1);
-    ret.m8.m2 = _mm512_mask_cmple_pd_mask(m.m8.m2, a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    __vec16_i1 tmp_m = m;
+    ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1);
+    ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmpnle_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b,
                                                              __vec16_i1 m) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_mask_cmpnle_pd_mask(m.m8.m1, a.v1, b.v1);
-    ret.m8.m2 = _mm512_mask_cmpnle_pd_mask(m.m8.m2, a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    __vec16_i1 tmp_m = m;
+    ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1);
+    ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b,
                                                               __vec16_i1 m) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_mask_cmpnlt_pd_mask(m.m8.m1, a.v1, b.v1);
-    ret.m8.m2 = _mm512_mask_cmpnlt_pd_mask(m.m8.m2, a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    __vec16_i1 tmp_m = m;
+    ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1);
+    ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmpord_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
-    __vec16_i1 ret;
-    ret.m8.m1 = _mm512_cmpunord_pd_mask(a.v1, b.v1);
-    ret.m8.m2 = _mm512_cmpunord_pd_mask(a.v2, b.v2);
-    return ret;
+    __vec16_i1 ret1;
+    __vec16_i1 ret2;
+    ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1);
+    ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2);
+    return _mm512_kmovlhb(ret1, ret2);
 }
 
 static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) {
     __vec16_d ret;
-    ret.v1 = _mm512_mask_mov_pd(b.v1, mask.m8.m1, a.v1);
-    ret.v2 = _mm512_mask_mov_pd(b.v2, mask.m8.m2, a.v2);
+    __vec16_i1 tmp_m = mask;
+    ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1);
+    ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2);
     return ret;
 }
 
@@ -1283,7 +1264,7 @@ static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1
 {
     __vec16_i32 ret = _mm512_setzero_epi32();
     __vec16_i32 one = _mm512_set1_epi32(1);
-    return _mm512_mask_mov_epi32(ret, val.m, one);
+    return _mm512_mask_mov_epi32(ret, val, one);
 }
 
 static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {
@@ -1573,33 +1554,35 @@ static FORCEINLINE float __reduce_max_double(__vec16_d v) {
 // Currently, when a pseudo_gather is converted into a masked load, it has to be unaligned
 static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    return _mm512_mask_load_epi32(__vec16_i32(), mask.m, p);
+    return _mm512_mask_load_epi32(__vec16_i32(), mask, p);
 #else
     __vec16_i32 tmp;
     tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     __vec16_i32 ret;
-    return _mm512_mask_mov_epi32(ret.v, mask.m, tmp.v);
+    return _mm512_mask_mov_epi32(ret.v, mask, tmp.v);
 #endif
 }
 
 static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    return _mm512_mask_load_ps(_mm512_undefined_ps(), mask.m,p);
+    return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p);
 #else
     __vec16_f tmp;
     tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     __vec16_f ret;
-    return _mm512_mask_mov_ps(ret.v, mask.m, tmp.v);
+    return _mm512_mask_mov_ps(ret.v, mask, tmp.v);
 #endif
 }
 
 static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
     __vec16_d ret;
-    ret.v1 = _mm512_mask_load_pd(ret.v1, mask.m8.m1, p);
-    ret.v2 = _mm512_mask_load_pd(ret.v2, mask.m8.m2, (uint8_t*)p+64);
+    __vec16_i1 tmp_m = mask;
+    tmp_m = _mm512_kswapb(tmp_m, tmp_m);
+    ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p);
+    ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64);
     return ret;
 #else
     __vec16_d tmp;
@@ -1608,20 +1591,22 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
     tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     __vec16_d ret;
-    ret.v1 = _mm512_mask_mov_pd(ret.v1, mask.m8.m1, tmp.v1);
-    ret.v2 = _mm512_mask_mov_pd(ret.v2, mask.m8.m2, tmp.v2);
+    __vec16_i1 tmp_m = mask;
+    tmp_m = _mm512_kswapb(tmp_m, tmp_m);
+    ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1);
+    ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2);
     return ret;
 #endif
 }
 
 static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    _mm512_mask_store_epi32(p, mask.m, val.v);
+    _mm512_mask_store_epi32(p, mask, val.v);
 #else
     __vec16_i32 tmp;
     tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
     tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
-    tmp.v = _mm512_mask_mov_epi32(tmp.v, mask.m, val.v);
+    tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v);
     _mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
 #endif
@@ -1630,12 +1615,12 @@ static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1
 static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
                                              __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    _mm512_mask_store_ps(p, mask.m, val.v);
+    _mm512_mask_store_ps(p, mask, val.v);
 #else
     __vec16_f tmp;
     tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
     tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
-    tmp.v = _mm512_mask_mov_ps(tmp.v, mask.m, val.v);
+    tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v);
     _mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
     _mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
 #endif
@@ -1644,16 +1629,20 @@ static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
 static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
                                               __vec16_i1 mask) {
 #ifdef ISPC_FORCE_ALIGNED_MEMORY
-    _mm512_mask_store_pd(p, mask.m8.m1, val.v1);
-    _mm512_mask_store_pd((uint8_t*)p+64, mask.m8.m2, val.v2);
+    __vec16_i1 tmp_m = mask;
+    tmp_m = _mm512_kswapb(tmp_m, tmp_m);
+    _mm512_mask_store_pd(p, mask, val.v1);
+    _mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2);
 #else
     __vec16_d tmp;
+    __vec16_i1 tmp_m = mask;
+    tmp_m = _mm512_kswapb(tmp_m, tmp_m);
     tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
     tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
-    tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask.m8.m1, val.v1);
-    tmp.v2 = _mm512_mask_mov_pd(tmp.v2, mask.m8.m2, val.v2);
+    tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1);
+    tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2);
     _mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
     _mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
@@ -1870,7 +1859,7 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
                                                  __vec16_i1 mask) {
     _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
     _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
-    return _mm_countbits_32(uint32_t(mask.m));
+    return _mm_countbits_32(uint32_t(mask));
 }
 
 ///////////////////////////////////////////////////////////////////////////
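Most of the hunks above are mechanical fallout from the change at the top of this file: in the Xeon Phi intrinsics implementation, `__vec16_i1` is no longer a struct wrapping a `__mmask16` (with `.m` and `.m8.m1`/`.m8.m2` views) but a plain `__mmask16` typedef, so the 8-lane halves needed by the double routines are now derived with the `_mm512_kswapb`/`_mm512_kmovlhb` mask intrinsics instead of field accesses. The sketch below is an editorial illustration of that layout only: it models the mask as a plain `uint16_t` so it compiles without the KNC toolchain, and `swap_halves`/`combine_low_high` are hypothetical stand-ins for how those two intrinsics are used in the diff, not their exact definitions.

```cpp
#include <cstdint>
#include <cstdio>

// One bit per lane: low byte covers lanes 0..7 (v1), high byte lanes 8..15 (v2).
using Mask16 = uint16_t;

// Stand-in for how _mm512_kswapb(m, m) is used above: bring the upper
// 8 lanes into the low byte so an 8-wide masked op on v2 can consume them.
static Mask16 swap_halves(Mask16 m) {
    return static_cast<Mask16>((m >> 8) | (m << 8));
}

// Stand-in for how _mm512_kmovlhb(lo, hi) is used above: keep lo's low byte
// and place hi's low byte into the high byte, rebuilding a 16-lane mask.
static Mask16 combine_low_high(Mask16 lo, Mask16 hi) {
    return static_cast<Mask16>((lo & 0x00FFu) | ((hi & 0x00FFu) << 8));
}

int main() {
    Mask16 full  = 0xABCD;                       // arbitrary 16-lane mask
    Mask16 low8  = full & 0x00FFu;               // mask bits for v1
    Mask16 high8 = swap_halves(full) & 0x00FFu;  // mask bits for v2
    std::printf("low=%02x high=%02x rejoined=%04x\n",
                (unsigned)low8, (unsigned)high8,
                (unsigned)combine_low_high(low8, high8));
    return 0;
}
```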
8 ispc.cpp
@@ -441,6 +441,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         if (g->opt.disableFMA == false)
             options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
 #endif // !LLVM_3_1
+
+#ifdef ISPC_IS_WINDOWS
+        if (strcmp("x86", arch) == 0) {
+            // Workaround for issue #503 (LLVM issue 14646).
+            // It's Win32 specific.
+            options.NoFramePointerElim = true;
+        }
+#endif
         m_targetMachine =
             m_target->createTargetMachine(triple, m_cpu, featuresString, options,
                                           relocModel);
2 ispc.h
@@ -38,7 +38,7 @@
 #ifndef ISPC_H
 #define ISPC_H
 
-#define ISPC_VERSION "1.4.0"
+#define ISPC_VERSION "1.4.2"
 
 #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4)
 #error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported"
54 llvm_patches/r183327-AVX2-GATHER.patch (Executable file)
@@ -0,0 +1,54 @@
This patch needs to be applied to LLVM 3.2/3.3 to fix a bunch of failures on the AVX2 target.
LLVM 3.4 contains this fix (r183327).

Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp	(revision 183626)
+++ lib/Target/X86/X86ISelDAGToDAG.cpp	(working copy)
@@ -2013,6 +2013,8 @@
   case Intrinsic::x86_avx2_gather_d_d_256:
   case Intrinsic::x86_avx2_gather_q_d:
   case Intrinsic::x86_avx2_gather_q_d_256: {
+    if (!Subtarget->hasAVX2())
+      break;
     unsigned Opc;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td	(revision 183626)
+++ lib/Target/X86/X86InstrSSE.td	(working copy)
@@ -8367,7 +8367,9 @@
                   []>, VEX_4VOp3, VEX_L;
 }
 
-let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+let mayLoad = 1, Constraints
+  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+  in {
   defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
   defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
   defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
Index: test/CodeGen/X86/avx2-gather.ll
===================================================================
--- test/CodeGen/X86/avx2-gather.ll	(revision 0)
+++ test/CodeGen/X86/avx2-gather.ll	(working copy)
@@ -0,0 +1,18 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
+                      <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1,
+                     <4 x i32> %idx, <4 x float> %mask) {
+  %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef,
+                            i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_avx2_gather_d_ps
+; CHECK: vgatherdps
+; CHECK-NOT: [[DST]]
+; CHECK: [[DST:%xmm[0-9]+]]{{$}}
+; CHECK: ret
4 type.cpp
@@ -2879,7 +2879,11 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const {
     for (int i = 0; i < GetNumParameters(); ++i) {
         const Type *t = GetParameterType(i);
         if (t == NULL)
+#if defined(LLVM_3_4)
+            return llvm::DICompositeType();
+#else
             return llvm::DIType();
+#endif
         retArgTypes.push_back(t->GetDIType(scope));
     }
 
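The ctx.cpp and type.cpp hunks guard the same LLVM 3.4 API change: the debug-info descriptor for a function type is an `llvm::DICompositeType` there, while earlier releases used the plain `llvm::DIType`. A hedged sketch of the shared pattern follows; it is a fragment meant for those two files (it assumes the LLVM debug-info headers they already include), and the alias name is illustrative, not part of this changeset.

```cpp
// Illustrative only -- not part of the diff. Pick the debug-info type that
// the LLVM version being built against expects for function types, so the
// rest of the code can use one name instead of repeating the #if blocks.
#if defined(LLVM_3_4)
typedef llvm::DICompositeType FunctionDIType;   // hypothetical alias name
#else
typedef llvm::DIType FunctionDIType;
#endif
```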