14 Commits

Author SHA1 Message Date
Dmitry Babokin
cf9ceb6bf9 Release 1.4.2, 11 June 2013 2013-06-11 17:18:54 +04:00
Dmitry Babokin
7589ae0de5 Merge pull request #512 from ifilippov/bug_34
Fix to track LLVM 3.4 ToT changes
2013-06-04 07:10:04 -07:00
jbrodman
f46e5b37e9 Merge pull request #511 from dbabokin/win32
Fix for #503 - avoid omitting frame pointer on Win32
2013-06-04 06:43:53 -07:00
Ilia Filippov
560acd5017 changes to support createFunction() with DICompositeType argument in LLVM_3_4 2013-06-04 15:48:39 +04:00
Dmitry Babokin
2267f278d2 Fix for #503 - avoid omitting frame pointer on Win32 2013-06-04 14:51:36 +04:00
jbrodman
0feeef585c Merge pull request #509 from jbrodman/master
Change generic-16's knc.h to use __mmask16 instead of a struct.
2013-05-30 13:21:23 -07:00
james.brodman
6211966c55 Change mask to use __mmask16 instead of a struct. 2013-05-30 16:04:44 -04:00
Dmitry Babokin
92f591b4bd Merge pull request #508 from dbabokin/master
Bumping version to 1.4.1dev
2013-05-28 08:59:13 -07:00
Dmitry Babokin
29ceb42b7b Bumping version to 1.4.1dev 2013-05-28 19:58:27 +04:00
Dmitry Babokin
adaabe5993 Merge pull request #507 from dbabokin/master
Bumping up to 1.4.1 version
2013-05-28 08:49:14 -07:00
Dmitry Babokin
6c392ee4a1 Changes for 1.4.1 release 2013-05-28 19:46:30 +04:00
jbrodman
7699eda5ba Merge pull request #506 from jbrodman/master
Typo Fix
2013-05-28 08:13:03 -07:00
james.brodman
d8b5fd5409 Typo fix. 2013-05-28 11:13:43 -04:00
Dmitry Babokin
b37ffdbe85 Merge pull request #505 from dbabokin/release
Changes for 1.4.0 release
2013-05-27 06:03:22 -07:00
10 changed files with 221 additions and 131 deletions


@@ -933,7 +933,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
EXPORT_MODULE(builtins_bitcode_generic_16_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_generic_4_64bit);
EXPORT_MODULE(builtins_bitcode_generic_16_64bit);
}
break;
case 32:

ctx.cpp

@@ -344,6 +344,14 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
AssertPos(currentPos, diSubprogramType.Verify());
}
#if defined(LLVM_3_4)
Assert(diSubprogramType.isCompositeType());
llvm::DICompositeType diSubprogramType_n =
static_cast<llvm::DICompositeType>(diSubprogramType);
#else
llvm::DIType diSubprogramType_n = diSubprogramType;
#endif
std::string mangledName = llvmFunction->getName();
if (mangledName == funSym->name)
mangledName = "";
@@ -356,7 +364,7 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
diSubprogram =
m->diBuilder->createFunction(diFile /* scope */, funSym->name,
mangledName, diFile,
firstLine, diSubprogramType,
firstLine, diSubprogramType_n,
isStatic, true, /* is defn */
firstLine,
flags,
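For readers following the LLVM API churn in this hunk (and the matching one in FunctionType::GetDIType at the end of this diff): a minimal, self-contained C++ sketch of the version-guard pattern, using stand-in types rather than the real llvm::DIType / llvm::DICompositeType so it compiles without LLVM headers. The names SubprogramTypeArg and createFunctionStub are illustrative, not ispc's.

// Stand-ins for LLVM's debug-info value wrappers (illustrative only; the
// real code uses llvm::DIType / llvm::DICompositeType and passes the result
// to DIBuilder::createFunction()).
struct DIType { void *node; };
struct DICompositeType : DIType {};

// The LLVM 3.4 trunk createFunction() wants the narrower composite type, so
// the argument's static type is selected at compile time and the call site
// itself stays unchanged (diSubprogramType_n in the hunk above).
#if defined(LLVM_3_4)
typedef DICompositeType SubprogramTypeArg;
#else
typedef DIType SubprogramTypeArg;
#endif

static void createFunctionStub(const SubprogramTypeArg &) { /* placeholder */ }

int main() {
    SubprogramTypeArg ty = SubprogramTypeArg();
    createFunctionStub(ty);
    return 0;
}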


@@ -1,4 +1,20 @@
=== v1.4.0 === (27 May 2013)
=== v1.4.2 === (11 June 2013)
A minor version update with a few important changes:
* Stability fix for the AVX2 target (Haswell): the problem with gather instructions is
fixed in LLVM 3.4; if you build with LLVM 3.2 or 3.3, the fix is available in our
repository (llvm_patches/r183327-AVX2-GATHER.patch) and needs to be
applied manually.
* Stability fix for a widespread issue on the Win32 platform (#503).
* Performance improvements for Xeon Phi related to mask representation.
Also, LLVM 3.3 has been released and is now the recommended version for building ISPC.
Precompiled binaries are also built with LLVM 3.3.
=== v1.4.1 === (28 May 2013)
A major new version of ispc has been released with stability and performance
improvements on all supported platforms (Windows, Linux and MacOS).
@@ -20,7 +36,7 @@ Important bug fixes/changes:
* FMA instructions are enabled for AVX2 instruction set.
* Support of RDRAND instruction when availible via library function rdrand (Ivy Bridge).
* Support of RDRAND instruction when available via library function rdrand (Ivy Bridge).
Release also contains numerous bug fixes and minor improvements.


@@ -2,7 +2,18 @@
ispc News
=========
ispc 1.4.0 is Released
ispc 1.4.2 is Released
----------------------
A minor update of ``ispc`` has been released with a stability fix for AVX2
(Haswell), a fix for the Win32 platform, and performance improvements on Xeon Phi.
As usual, it's available on all supported platforms (Windows, Linux and MacOS).
This version supports LLVM 3.1, 3.2, 3.3 and 3.4, but we now recommend
avoiding 3.1, as it's known to contain a number of stability problems, and we
plan to deprecate its support soon.
The released binaries are built with 3.3.
ispc 1.4.1 is Released
----------------------
A major new version of ``ispc`` has been released with stability and


@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.4.0
PROJECT_NUMBER = 1.4.2
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.


@@ -75,43 +75,7 @@ typedef int64_t __vec1_i64;
struct __vec16_i32;
typedef struct PRE_ALIGN(2) __vec16_i1 {
FORCEINLINE operator __mmask16() const { return m; }
FORCEINLINE __vec16_i1() { /* FIXME? __mm512_undef_mask(); */ }
FORCEINLINE __vec16_i1(const __mmask16 &in) : m(in) {}
FORCEINLINE __vec16_i1(const __vec16_i32 &in);
FORCEINLINE __vec16_i1(const __vec16_i1 &o) : m(o.m) {}
FORCEINLINE __vec16_i1& operator=(const __vec16_i1 &o) { m = o.m; return *this; }
FORCEINLINE __vec16_i1(uint32_t v00, uint32_t v01, uint32_t v02, uint32_t v03,
uint32_t v04, uint32_t v05, uint32_t v06, uint32_t v07,
uint32_t v08, uint32_t v09, uint32_t v10, uint32_t v11,
uint32_t v12, uint32_t v13, uint32_t v14, uint32_t v15) {
m = (v00) |
((v01) << 1) |
((v02) << 2) |
((v03) << 3) |
((v04) << 4) |
((v05) << 5) |
((v06) << 6) |
((v07) << 7) |
((v08) << 8) |
((v09) << 9) |
((v10) << 10) |
((v11) << 11) |
((v12) << 12) |
((v13) << 13) |
((v14) << 14) |
((v15) << 15);
}
union {
__mmask16 m;
struct {
__mmask8 m1;
__mmask8 m2;
} m8;
};
} POST_ALIGN(2) __vec16_i1;
typedef __mmask16 POST_ALIGN(2) __vec16_i1;
typedef struct PRE_ALIGN(64) __vec16_f {
FORCEINLINE operator __m512() const { return v; }
@@ -159,10 +123,6 @@ typedef struct PRE_ALIGN(64) __vec16_i32 {
__m512i v;
} POST_ALIGN(64) __vec16_i32;
FORCEINLINE __vec16_i1::__vec16_i1(const __vec16_i32 &in) {
m = _mm512_test_epi32_mask(in, in);
}
typedef struct PRE_ALIGN(64) __vec16_i64 {
FORCEINLINE __vec16_i64() : v_lo(_mm512_undefined_epi32()), v_hi(_mm512_undefined_epi32()) {}
FORCEINLINE __vec16_i64(const __vec16_i64 &o) : v_lo(o.v_lo), v_hi(o.v_hi) {}
@@ -325,7 +285,7 @@ static FORCEINLINE __vec16_i1 __or(__vec16_i1 a, __vec16_i1 b) {
static FORCEINLINE __vec16_i1 __select(__vec16_i1 mask, __vec16_i1 a,
__vec16_i1 b) {
return ((a.m & mask.m) | (b.m & ~mask.m));
return ((a & mask) | (b & ~mask));
//return __or(__and(a, mask), __andnr(b, mask));
}
@@ -335,7 +295,7 @@ static FORCEINLINE __vec16_i1 __select(bool cond, __vec16_i1 a, __vec16_i1 b) {
static FORCEINLINE bool __extract_element(__vec16_i1 mask, uint32_t index) {
return (mask.m & (1 << index)) ? true : false;
return (mask & (1 << index)) ? true : false;
}
/*
@@ -351,13 +311,13 @@ static FORCEINLINE void __insert_element(__vec16_i1 *vec, int index,
template <int ALIGN> static FORCEINLINE __vec16_i1 __load(const __vec16_i1 *p) {
const uint16_t *ptr = (const uint16_t *)p;
__vec16_i1 r;
r.m = *ptr;
r = *ptr;
return r;
}
template <int ALIGN> static FORCEINLINE void __store(__vec16_i1 *p, __vec16_i1 v) {
uint16_t *ptr = (uint16_t *)p;
*ptr = v.m;
*ptr = v;
}
template <class RetVecType> RetVecType __smear_i1(int i);
@@ -556,7 +516,7 @@ static FORCEINLINE __vec16_i1 __signed_greater_than_i32_and_mask(__vec16_i32 a,
static FORCEINLINE __vec16_i32 __select(__vec16_i1 mask,
__vec16_i32 a, __vec16_i32 b) {
return _mm512_mask_mov_epi32(b.v, mask.m, a.v);
return _mm512_mask_mov_epi32(b.v, mask, a.v);
}
static FORCEINLINE __vec16_i32 __select(bool cond, __vec16_i32 a, __vec16_i32 b) {
@@ -785,8 +745,8 @@ static FORCEINLINE __vec16_i1 __not_equal_i64_and_mask(const __vec16_i64 &a, con
static FORCEINLINE __vec16_i64 __select(__vec16_i1 mask,
__vec16_i64 a, __vec16_i64 b) {
__vec16_i64 ret;
ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask.m, a.v_hi);
ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask.m, a.v_lo);
ret.v_hi = _mm512_mask_mov_epi32(b.v_hi, mask, a.v_hi);
ret.v_lo = _mm512_mask_mov_epi32(b.v_lo, mask, a.v_lo);
return ret;
}
@@ -1077,113 +1037,134 @@ static FORCEINLINE __vec16_d __div(__vec16_d a, __vec16_d b) {
}
static FORCEINLINE __vec16_i1 __equal_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmpeq_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmpeq_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmpeq_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmpeq_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __equal_double_and_mask(__vec16_d a, __vec16_d b,
__vec16_i1 m) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_mask_cmpeq_pd_mask(m.m8.m1, a.v1, b.v1);
ret.m8.m2 = _mm512_mask_cmpeq_pd_mask(m.m8.m2, a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_mask_cmpeq_pd_mask(m, a.v1, b.v1);
__vec16_i1 tmp_m = m;
ret2 = _mm512_mask_cmpeq_pd_mask(_mm512_kswapb(tmp_m,tmp_m), a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __not_equal_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmpneq_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmpneq_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmpneq_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __not_equal_double_and_mask(__vec16_d a, __vec16_d b,
__vec16_i1 m) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_mask_cmpneq_pd_mask(m.m8.m1, a.v1, b.v1);
ret.m8.m2 = _mm512_mask_cmpneq_pd_mask(m.m8.m2, a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
__vec16_i1 tmp_m = m;
ret1 = _mm512_mask_cmpneq_pd_mask(m, a.v1, b.v1);
ret2 = _mm512_mask_cmpneq_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __less_than_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmplt_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmplt_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmplt_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __less_than_double_and_mask(__vec16_d a, __vec16_d b,
__vec16_i1 m) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_mask_cmplt_pd_mask(m.m8.m1, a.v1, b.v1);
ret.m8.m2 = _mm512_mask_cmplt_pd_mask(m.m8.m2, a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
__vec16_i1 tmp_m = m;
ret1 = _mm512_mask_cmplt_pd_mask(m, a.v1, b.v1);
ret2 = _mm512_mask_cmplt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __less_equal_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmple_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmple_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmple_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmple_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __less_equal_double_and_mask(__vec16_d a, __vec16_d b,
__vec16_i1 m) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_mask_cmple_pd_mask(m.m8.m1, a.v1, b.v1);
ret.m8.m2 = _mm512_mask_cmple_pd_mask(m.m8.m2, a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
__vec16_i1 tmp_m = m;
ret1 = _mm512_mask_cmple_pd_mask(m, a.v1, b.v1);
ret2 = _mm512_mask_cmple_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __greater_than_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmpnle_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmpnle_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmpnle_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __greater_than_double_and_mask(__vec16_d a, __vec16_d b,
__vec16_i1 m) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_mask_cmpnle_pd_mask(m.m8.m1, a.v1, b.v1);
ret.m8.m2 = _mm512_mask_cmpnle_pd_mask(m.m8.m2, a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
__vec16_i1 tmp_m = m;
ret1 = _mm512_mask_cmpnle_pd_mask(m, a.v1, b.v1);
ret2 = _mm512_mask_cmpnle_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __greater_equal_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmpnlt_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmpnlt_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __greater_equal_double_and_mask(__vec16_d a, __vec16_d b,
__vec16_i1 m) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_mask_cmpnlt_pd_mask(m.m8.m1, a.v1, b.v1);
ret.m8.m2 = _mm512_mask_cmpnlt_pd_mask(m.m8.m2, a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
__vec16_i1 tmp_m = m;
ret1 = _mm512_mask_cmpnlt_pd_mask(m, a.v1, b.v1);
ret2 = _mm512_mask_cmpnlt_pd_mask(_mm512_kswapb(tmp_m, tmp_m), a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __ordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmpord_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmpord_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmpord_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_i1 __unordered_double(__vec16_d a, __vec16_d b) {
__vec16_i1 ret;
ret.m8.m1 = _mm512_cmpunord_pd_mask(a.v1, b.v1);
ret.m8.m2 = _mm512_cmpunord_pd_mask(a.v2, b.v2);
return ret;
__vec16_i1 ret1;
__vec16_i1 ret2;
ret1 = _mm512_cmpunord_pd_mask(a.v1, b.v1);
ret2 = _mm512_cmpunord_pd_mask(a.v2, b.v2);
return _mm512_kmovlhb(ret1, ret2);
}
static FORCEINLINE __vec16_d __select(__vec16_i1 mask, __vec16_d a, __vec16_d b) {
__vec16_d ret;
ret.v1 = _mm512_mask_mov_pd(b.v1, mask.m8.m1, a.v1);
ret.v2 = _mm512_mask_mov_pd(b.v2, mask.m8.m2, a.v2);
__vec16_i1 tmp_m = mask;
ret.v1 = _mm512_mask_mov_pd(b.v1, mask, a.v1);
ret.v2 = _mm512_mask_mov_pd(b.v2, _mm512_kswapb(tmp_m, tmp_m), a.v2);
return ret;
}
@@ -1283,7 +1264,7 @@ static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1
{
__vec16_i32 ret = _mm512_setzero_epi32();
__vec16_i32 one = _mm512_set1_epi32(1);
return _mm512_mask_mov_epi32(ret, val.m, one);
return _mm512_mask_mov_epi32(ret, val, one);
}
static FORCEINLINE __vec16_f __cast_sitofp(__vec16_f, __vec16_i8 val) {
@@ -1573,33 +1554,35 @@ static FORCEINLINE float __reduce_max_double(__vec16_d v) {
// Currently, when a pseudo_gather is converted into a masked load, it has to be unaligned
static FORCEINLINE __vec16_i32 __masked_load_i32(void *p, __vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
return _mm512_mask_load_epi32(__vec16_i32(), mask.m, p);
return _mm512_mask_load_epi32(__vec16_i32(), mask, p);
#else
__vec16_i32 tmp;
tmp.v = _mm512_mask_extloadunpacklo_epi32(tmp.v, 0xFFFF, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
tmp.v = _mm512_mask_extloadunpackhi_epi32(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
__vec16_i32 ret;
return _mm512_mask_mov_epi32(ret.v, mask.m, tmp.v);
return _mm512_mask_mov_epi32(ret.v, mask, tmp.v);
#endif
}
static FORCEINLINE __vec16_f __masked_load_float(void *p, __vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
return _mm512_mask_load_ps(_mm512_undefined_ps(), mask.m,p);
return _mm512_mask_load_ps(_mm512_undefined_ps(), mask,p);
#else
__vec16_f tmp;
tmp.v = _mm512_mask_extloadunpacklo_ps(tmp.v, 0xFFFF, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
tmp.v = _mm512_mask_extloadunpackhi_ps(tmp.v, 0xFFFF, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
__vec16_f ret;
return _mm512_mask_mov_ps(ret.v, mask.m, tmp.v);
return _mm512_mask_mov_ps(ret.v, mask, tmp.v);
#endif
}
static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
__vec16_d ret;
ret.v1 = _mm512_mask_load_pd(ret.v1, mask.m8.m1, p);
ret.v2 = _mm512_mask_load_pd(ret.v2, mask.m8.m2, (uint8_t*)p+64);
__vec16_i1 tmp_m = mask;
tmp_m = _mm512_kswapb(tmp_m, tmp_m);
ret.v1 = _mm512_mask_load_pd(ret.v1, mask, p);
ret.v2 = _mm512_mask_load_pd(ret.v2, tmp_m, (uint8_t*)p+64);
return ret;
#else
__vec16_d tmp;
@@ -1608,20 +1591,22 @@ static FORCEINLINE __vec16_d __masked_load_double(void *p, __vec16_i1 mask) {
tmp.v2 = _mm512_mask_extloadunpacklo_pd(tmp.v2, 0xFF, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
tmp.v2 = _mm512_mask_extloadunpackhi_pd(tmp.v2, 0xFF, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
__vec16_d ret;
ret.v1 = _mm512_mask_mov_pd(ret.v1, mask.m8.m1, tmp.v1);
ret.v2 = _mm512_mask_mov_pd(ret.v2, mask.m8.m2, tmp.v2);
__vec16_i1 tmp_m = mask;
tmp_m = _mm512_kswapb(tmp_m, tmp_m);
ret.v1 = _mm512_mask_mov_pd(ret.v1, mask, tmp.v1);
ret.v2 = _mm512_mask_mov_pd(ret.v2, tmp_m, tmp.v2);
return ret;
#endif
}
static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
_mm512_mask_store_epi32(p, mask.m, val.v);
_mm512_mask_store_epi32(p, mask, val.v);
#else
__vec16_i32 tmp;
tmp.v = _mm512_extloadunpacklo_epi32(tmp.v, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
tmp.v = _mm512_extloadunpackhi_epi32(tmp.v, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE);
tmp.v = _mm512_mask_mov_epi32(tmp.v, mask.m, val.v);
tmp.v = _mm512_mask_mov_epi32(tmp.v, mask, val.v);
_mm512_extpackstorelo_epi32(p, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_epi32((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
#endif
@@ -1630,12 +1615,12 @@ static FORCEINLINE void __masked_store_i32(void *p, __vec16_i32 val, __vec16_i1
static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
__vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
_mm512_mask_store_ps(p, mask.m, val.v);
_mm512_mask_store_ps(p, mask, val.v);
#else
__vec16_f tmp;
tmp.v = _mm512_extloadunpacklo_ps(tmp.v, p, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
tmp.v = _mm512_extloadunpackhi_ps(tmp.v, (uint8_t*)p+64, _MM_UPCONV_PS_NONE, _MM_HINT_NONE);
tmp.v = _mm512_mask_mov_ps(tmp.v, mask.m, val.v);
tmp.v = _mm512_mask_mov_ps(tmp.v, mask, val.v);
_mm512_extpackstorelo_ps(p, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_ps((uint8_t*)p+64, tmp.v, _MM_DOWNCONV_PS_NONE, _MM_HINT_NONE);
#endif
@@ -1644,16 +1629,20 @@ static FORCEINLINE void __masked_store_float(void *p, __vec16_f val,
static FORCEINLINE void __masked_store_double(void *p, __vec16_d val,
__vec16_i1 mask) {
#ifdef ISPC_FORCE_ALIGNED_MEMORY
_mm512_mask_store_pd(p, mask.m8.m1, val.v1);
_mm512_mask_store_pd((uint8_t*)p+64, mask.m8.m2, val.v2);
__vec16_i1 tmp_m = mask;
tmp_m = _mm512_kswapb(tmp_m, tmp_m);
_mm512_mask_store_pd(p, mask, val.v1);
_mm512_mask_store_pd((uint8_t*)p+64, tmp_m, val.v2);
#else
__vec16_d tmp;
__vec16_i1 tmp_m = mask;
tmp_m = _mm512_kswapb(tmp_m, tmp_m);
tmp.v1 = _mm512_extloadunpacklo_pd(tmp.v1, p, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
tmp.v1 = _mm512_extloadunpackhi_pd(tmp.v1, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
tmp.v2 = _mm512_extloadunpacklo_pd(tmp.v2, (uint8_t*)p+64, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
tmp.v2 = _mm512_extloadunpackhi_pd(tmp.v2, (uint8_t*)p+128, _MM_UPCONV_PD_NONE, _MM_HINT_NONE);
tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask.m8.m1, val.v1);
tmp.v2 = _mm512_mask_mov_pd(tmp.v2, mask.m8.m2, val.v2);
tmp.v1 = _mm512_mask_mov_pd(tmp.v1, mask, val.v1);
tmp.v2 = _mm512_mask_mov_pd(tmp.v2, tmp_m, val.v2);
_mm512_extpackstorelo_pd(p, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
_mm512_extpackstorehi_pd((uint8_t*)p+64, tmp.v1, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
_mm512_extpackstorelo_pd((uint8_t*)p+64, tmp.v2, _MM_DOWNCONV_PD_NONE, _MM_HINT_NONE);
@@ -1870,7 +1859,7 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val,
__vec16_i1 mask) {
_mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
_mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE);
return _mm_countbits_32(uint32_t(mask.m));
return _mm_countbits_32(uint32_t(mask));
}
///////////////////////////////////////////////////////////////////////////
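As a side note on the knc.h change above: a minimal, self-contained sketch (not the actual knc.h code) of what collapsing __vec16_i1 into a plain __mmask16 means for callers; mask logic becomes ordinary bit manipulation on a 16-bit value instead of access through a struct member. uint16_t stands in for __mmask16 so the sketch compiles without the Xeon Phi intrinsics; select_mask and extract_lane are illustrative names.

#include <cassert>
#include <cstdint>

typedef uint16_t mask16;   // stand-in for __mmask16: one bit per lane

// Mirrors __select(__vec16_i1, __vec16_i1, __vec16_i1) above: blend two
// masks lane-by-lane under a third mask, using plain bitwise ops.
static inline mask16 select_mask(mask16 mask, mask16 a, mask16 b) {
    return (a & mask) | (b & ~mask);
}

// Mirrors __extract_element(__vec16_i1, uint32_t): test a single lane.
static inline bool extract_lane(mask16 mask, uint32_t index) {
    return (mask & (1u << index)) != 0;
}

int main() {
    mask16 m = 0x00FF;             // lanes 0..7 active
    mask16 a = 0xAAAA, b = 0x5555;
    assert(select_mask(m, a, b) == 0x55AA);
    assert(extract_lane(m, 3) && !extract_lane(m, 12));
    return 0;
}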


@@ -441,6 +441,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
if (g->opt.disableFMA == false)
options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
#endif // !LLVM_3_1
#ifdef ISPC_IS_WINDOWS
if (strcmp("x86", arch) == 0) {
// Workaround for issue #503 (LLVM issue 14646).
// It's Win32 specific.
options.NoFramePointerElim = true;
}
#endif
m_targetMachine =
m_target->createTargetMachine(triple, m_cpu, featuresString, options,
relocModel);
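A minimal sketch, assuming LLVM 3.2/3.3-era headers, of the workaround this hunk adds: frame-pointer elimination is switched off only for 32-bit Windows targets before the TargetMachine is created (LLVM issue 14646, ispc #503). The helper name makeTargetOptions is hypothetical; ispc sets the flag inline in Target::Target().

#include <cstring>
#include "llvm/Target/TargetOptions.h"   // LLVM 3.2/3.3 header layout assumed

// Hypothetical helper mirroring the guard in Target::Target(): keep the
// frame pointer on Win32 so EBP-based frames stay intact.
// ISPC_IS_WINDOWS is ispc's own platform macro.
static llvm::TargetOptions makeTargetOptions(const char *arch) {
    llvm::TargetOptions options;
#ifdef ISPC_IS_WINDOWS
    if (std::strcmp("x86", arch) == 0)
        options.NoFramePointerElim = true;
#endif
    return options;
}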

ispc.h

@@ -38,7 +38,7 @@
#ifndef ISPC_H
#define ISPC_H
#define ISPC_VERSION "1.4.0"
#define ISPC_VERSION "1.4.2"
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4)
#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported"


@@ -0,0 +1,54 @@
This patch needs to be applied to LLVM 3.2/3.3 to fix a number of failures on the AVX2 target.
LLVM 3.4 contains this fix (r183327).
Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp (revision 183626)
+++ lib/Target/X86/X86ISelDAGToDAG.cpp (working copy)
@@ -2013,6 +2013,8 @@
case Intrinsic::x86_avx2_gather_d_d_256:
case Intrinsic::x86_avx2_gather_q_d:
case Intrinsic::x86_avx2_gather_q_d_256: {
+ if (!Subtarget->hasAVX2())
+ break;
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic");
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td (revision 183626)
+++ lib/Target/X86/X86InstrSSE.td (working copy)
@@ -8367,7 +8367,9 @@
[]>, VEX_4VOp3, VEX_L;
}
-let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+let mayLoad = 1, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+ in {
defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
Index: test/CodeGen/X86/avx2-gather.ll
===================================================================
--- test/CodeGen/X86/avx2-gather.ll (revision 0)
+++ test/CodeGen/X86/avx2-gather.ll (working copy)
@@ -0,0 +1,18 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
+ <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1,
+ <4 x i32> %idx, <4 x float> %mask) {
+ %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef,
+ i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_avx2_gather_d_ps
+; CHECK: vgatherdps
+; CHECK-NOT: [[DST]]
+; CHECK: [[DST:%xmm[0-9]+]]{{$}}
+; CHECK: ret


@@ -2879,7 +2879,11 @@ FunctionType::GetDIType(llvm::DIDescriptor scope) const {
for (int i = 0; i < GetNumParameters(); ++i) {
const Type *t = GetParameterType(i);
if (t == NULL)
#if defined(LLVM_3_4)
return llvm::DICompositeType();
#else
return llvm::DIType();
#endif
retArgTypes.push_back(t->GetDIType(scope));
}