avx1-i32x4 implementation as sse4-i32x4 with avx target-feature flag

This commit is contained in:
Dmitry Babokin
2013-11-10 23:48:49 +04:00
parent fbab9874f6
commit ffc9a33933
3 changed files with 58 additions and 13 deletions

View File

@@ -942,11 +942,22 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
case Target::AVX: { case Target::AVX: {
switch (g->target->getVectorWidth()) { switch (g->target->getVectorWidth()) {
case 4: case 4:
if (runtime32) { if (g->target->getDataTypeWidth() == 32) {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); if (runtime32) {
} EXPORT_MODULE(builtins_bitcode_sse4_32bit);
else { }
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); else {
EXPORT_MODULE(builtins_bitcode_sse4_64bit);
}
} else if (g->target->getDataTypeWidth() == 64) {
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit);
}
else {
EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit);
}
} else {
FATAL("logic error in DefineStdlib");
} }
break; break;
case 8: case 8:

View File

@@ -169,7 +169,7 @@ static const char *supportedCPUs[] = {
, "core-avx-i", "core-avx2" , "core-avx-i", "core-avx2"
#endif // LLVM 3.2+ #endif // LLVM 3.2+
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3)
, "slm" , "slm"
#endif // LLVM 3.4+ #endif // LLVM 3.4+
}; };
@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_tf_attributes(NULL), m_tf_attributes(NULL),
#endif #endif
m_nativeVectorWidth(-1), m_nativeVectorWidth(-1),
m_dataTypeWidth(-1),
m_vectorWidth(-1), m_vectorWidth(-1),
m_generatePIC(pic), m_generatePIC(pic),
m_maskingIsFree(false), m_maskingIsFree(false),
@@ -308,6 +309,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x4")) { !strcasecmp(isa, "sse2-i32x4")) {
this->m_isa = Target::SSE2; this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -323,6 +325,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse2-i32x8")) { !strcasecmp(isa, "sse2-i32x8")) {
this->m_isa = Target::SSE2; this->m_isa = Target::SSE2;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -338,11 +341,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x4")) { !strcasecmp(isa, "sse4-i32x4")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
// TODO: why not sse42 and popcnt? // TODO: why not sse42 and popcnt?
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
#endif #endif
@@ -355,10 +359,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "sse4-i32x8")) { !strcasecmp(isa, "sse4-i32x8")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
#endif #endif
@@ -369,10 +374,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i8x16")) { else if (!strcasecmp(isa, "sse4-i8x16")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
#endif #endif
@@ -383,10 +389,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "sse4-i16x8")) { else if (!strcasecmp(isa, "sse4-i16x8")) {
this->m_isa = Target::SSE4; this->m_isa = Target::SSE4;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
",+sse4.1,-sse4.2" ",+sse4.1,-sse4.2"
#else #else
",+sse41,-sse42" ",+sse41,-sse42"
#endif #endif
@@ -457,11 +464,21 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
this->m_maskBitCount = 32; this->m_maskBitCount = 32;
} }
else if (!strcasecmp(isa, "avx1-i32x4")) {
this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
}
else if (!strcasecmp(isa, "avx") || else if (!strcasecmp(isa, "avx") ||
!strcasecmp(isa, "avx1") || !strcasecmp(isa, "avx1") ||
!strcasecmp(isa, "avx1-i32x8")) { !strcasecmp(isa, "avx1-i32x8")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -471,6 +488,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i64x4")) { !strcasecmp(isa, "avx1-i64x4")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -481,6 +499,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1-i32x16")) { !strcasecmp(isa, "avx1-i32x16")) {
this->m_isa = Target::AVX; this->m_isa = Target::AVX;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov"; this->m_attributes = "+avx,+popcnt,+cmov";
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
@@ -490,6 +509,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x8")) { !strcasecmp(isa, "avx1.1-i32x8")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -510,6 +530,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx1.1-i32x16")) { !strcasecmp(isa, "avx1.1-i32x16")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -517,7 +538,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
#else #else
",+rdrand" ",+rdrand"
#endif #endif
; ;
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
this->m_maskBitCount = 32; this->m_maskBitCount = 32;
this->m_hasHalf = true; this->m_hasHalf = true;
@@ -529,6 +550,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx1.1-i64x4")) { else if (!strcasecmp(isa, "avx1.1-i64x4")) {
this->m_isa = Target::AVX11; this->m_isa = Target::AVX11;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx,+popcnt,+cmov,+f16c" this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -536,7 +558,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
#else #else
",+rdrand" ",+rdrand"
#endif #endif
; ;
this->m_maskingIsFree = false; this->m_maskingIsFree = false;
this->m_maskBitCount = 64; this->m_maskBitCount = 64;
this->m_hasHalf = true; this->m_hasHalf = true;
@@ -549,6 +571,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x8")) { !strcasecmp(isa, "avx2-i32x8")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -573,6 +596,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "avx2-i32x16")) { !strcasecmp(isa, "avx2-i32x16")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -596,6 +620,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "avx2-i64x4")) { else if (!strcasecmp(isa, "avx2-i64x4")) {
this->m_isa = Target::AVX2; this->m_isa = Target::AVX2;
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
this->m_dataTypeWidth = 64;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
#if defined(LLVM_3_4) #if defined(LLVM_3_4)
@@ -620,6 +645,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i8x16")) { else if (!strcasecmp(isa, "neon-i8x16")) {
this->m_isa = Target::NEON8; this->m_isa = Target::NEON8;
this->m_nativeVectorWidth = 16; this->m_nativeVectorWidth = 16;
this->m_dataTypeWidth = 8;
this->m_vectorWidth = 16; this->m_vectorWidth = 16;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -629,6 +655,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
else if (!strcasecmp(isa, "neon-i16x8")) { else if (!strcasecmp(isa, "neon-i16x8")) {
this->m_isa = Target::NEON16; this->m_isa = Target::NEON16;
this->m_nativeVectorWidth = 8; this->m_nativeVectorWidth = 8;
this->m_dataTypeWidth = 16;
this->m_vectorWidth = 8; this->m_vectorWidth = 8;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??
@@ -639,6 +666,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
!strcasecmp(isa, "neon-i32x4")) { !strcasecmp(isa, "neon-i32x4")) {
this->m_isa = Target::NEON32; this->m_isa = Target::NEON32;
this->m_nativeVectorWidth = 4; this->m_nativeVectorWidth = 4;
this->m_dataTypeWidth = 32;
this->m_vectorWidth = 4; this->m_vectorWidth = 4;
this->m_attributes = "+neon,+fp16"; this->m_attributes = "+neon,+fp16";
this->m_hasHalf = true; // ?? this->m_hasHalf = true; // ??

8
ispc.h
View File

@@ -253,6 +253,8 @@ public:
int getNativeVectorWidth() const {return m_nativeVectorWidth;} int getNativeVectorWidth() const {return m_nativeVectorWidth;}
int getDataTypeWidth() const {return m_dataTypeWidth;}
int getVectorWidth() const {return m_vectorWidth;} int getVectorWidth() const {return m_vectorWidth;}
bool getGeneratePIC() const {return m_generatePIC;} bool getGeneratePIC() const {return m_generatePIC;}
@@ -319,10 +321,14 @@ private:
#endif #endif
/** Native vector width of the vector instruction set. Note that this /** Native vector width of the vector instruction set. Note that this
value is directly derived from the ISA Being used (e.g. it's 4 for value is directly derived from the ISA being used (e.g. it's 4 for
SSE, 8 for AVX, etc.) */ SSE, 8 for AVX, etc.) */
int m_nativeVectorWidth; int m_nativeVectorWidth;
/** Data type width in bits. Typically it's 32, but could be 8, 16 or 64.
For generic it's -1, which means undefined. */
int m_dataTypeWidth;
/** Actual vector width currently being compiled to. This may be an /** Actual vector width currently being compiled to. This may be an
integer multiple of the native vector width, for example if we're integer multiple of the native vector width, for example if we're
"doubling up" and compiling 8-wide on a 4-wide SSE system. */ "doubling up" and compiling 8-wide on a 4-wide SSE system. */