Merge branch 'master' of https://github.com/ispc/ispc
This commit is contained in:
28
ctx.cpp
28
ctx.cpp
@@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym,
|
|||||||
llvm::BasicBlock *offBB =
|
llvm::BasicBlock *offBB =
|
||||||
llvm::BasicBlock::Create(*g->ctx, "entry",
|
llvm::BasicBlock::Create(*g->ctx, "entry",
|
||||||
(llvm::Function *)offFunc, 0);
|
(llvm::Function *)offFunc, 0);
|
||||||
|
llvm::StoreInst *inst =
|
||||||
new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
|
new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
|
||||||
|
if (g->opt.forceAlignedMemory) {
|
||||||
|
inst->setAlignment(g->target->getNativeVectorAlignment());
|
||||||
|
}
|
||||||
llvm::ReturnInst::Create(*g->ctx, offBB);
|
llvm::ReturnInst::Create(*g->ctx, offBB);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
|
|||||||
if (name == NULL)
|
if (name == NULL)
|
||||||
name = LLVMGetName(ptr, "_load");
|
name = LLVMGetName(ptr, "_load");
|
||||||
|
|
||||||
llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
|
llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock);
|
||||||
|
|
||||||
|
if (g->opt.forceAlignedMemory &&
|
||||||
|
llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
|
||||||
|
inst->setAlignment(g->target->getNativeVectorAlignment());
|
||||||
|
}
|
||||||
|
|
||||||
AddDebugPos(inst);
|
AddDebugPos(inst);
|
||||||
return inst;
|
return inst;
|
||||||
}
|
}
|
||||||
@@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
|
|||||||
inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
|
inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);
|
||||||
|
|
||||||
// If no alignment was specified but we have an array of a uniform
|
// If no alignment was specified but we have an array of a uniform
|
||||||
// type, then align it to 4 * the native vector width; it's not
|
// type, then align it to the native vector alignment; it's not
|
||||||
// unlikely that this array will be loaded into varying variables with
|
// unlikely that this array will be loaded into varying variables with
|
||||||
// what will be aligned accesses if the uniform -> varying load is done
|
// what will be aligned accesses if the uniform -> varying load is done
|
||||||
// in regular chunks.
|
// in regular chunks.
|
||||||
@@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType,
|
|||||||
llvm::dyn_cast<llvm::ArrayType>(llvmType);
|
llvm::dyn_cast<llvm::ArrayType>(llvmType);
|
||||||
if (align == 0 && arrayType != NULL &&
|
if (align == 0 && arrayType != NULL &&
|
||||||
!llvm::isa<llvm::VectorType>(arrayType->getElementType()))
|
!llvm::isa<llvm::VectorType>(arrayType->getElementType()))
|
||||||
align = 4 * g->target->getNativeVectorWidth();
|
align = g->target->getNativeVectorAlignment();
|
||||||
|
|
||||||
if (align != 0)
|
if (align != 0)
|
||||||
inst->setAlignment(align);
|
inst->setAlignment(align);
|
||||||
@@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
|
llvm::PointerType *pt =
|
||||||
|
llvm::dyn_cast<llvm::PointerType>(ptr->getType());
|
||||||
|
AssertPos(currentPos, pt != NULL);
|
||||||
|
|
||||||
|
llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);
|
||||||
|
|
||||||
|
if (g->opt.forceAlignedMemory &&
|
||||||
|
llvm::dyn_cast<llvm::VectorType>(pt->getElementType())) {
|
||||||
|
inst->setAlignment(g->target->getNativeVectorAlignment());
|
||||||
|
}
|
||||||
|
|
||||||
AddDebugPos(inst);
|
AddDebugPos(inst);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -74,8 +74,8 @@ static void usage() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
unsigned int width = 1536;
|
unsigned int width = 1536 * 8;
|
||||||
unsigned int height = 1024;
|
unsigned int height = 1024 * 8;
|
||||||
float x0 = -2;
|
float x0 = -2;
|
||||||
float x1 = 1;
|
float x1 = 1;
|
||||||
float y0 = -1;
|
float y0 = -1;
|
||||||
|
|||||||
@@ -66,8 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) {
|
|||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
unsigned int width = 768;
|
unsigned int width = 768 * 4;
|
||||||
unsigned int height = 768;
|
unsigned int height = 768 * 4;
|
||||||
float x0 = -10;
|
float x0 = -10;
|
||||||
float x1 = 10;
|
float x1 = 10;
|
||||||
float y0 = -10;
|
float y0 = -10;
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) {
|
|||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
int Nx = 256, Ny = 256, Nz = 256;
|
int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2;
|
||||||
int width = 4;
|
int width = 4;
|
||||||
float *Aserial[2], *Aispc[2];
|
float *Aserial[2], *Aispc[2];
|
||||||
Aserial[0] = new float [Nx * Ny * Nz];
|
Aserial[0] = new float [Nx * Ny * Nz];
|
||||||
|
|||||||
26
ispc.cpp
26
ispc.cpp
@@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
m_tf_attributes(NULL),
|
m_tf_attributes(NULL),
|
||||||
#endif
|
#endif
|
||||||
m_nativeVectorWidth(-1),
|
m_nativeVectorWidth(-1),
|
||||||
|
m_nativeVectorAlignment(-1),
|
||||||
m_dataTypeWidth(-1),
|
m_dataTypeWidth(-1),
|
||||||
m_vectorWidth(-1),
|
m_vectorWidth(-1),
|
||||||
m_generatePIC(pic),
|
m_generatePIC(pic),
|
||||||
@@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "sse2-i32x4")) {
|
!strcasecmp(isa, "sse2-i32x4")) {
|
||||||
this->m_isa = Target::SSE2;
|
this->m_isa = Target::SSE2;
|
||||||
this->m_nativeVectorWidth = 4;
|
this->m_nativeVectorWidth = 4;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
|
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
|
||||||
@@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "sse2-i32x8")) {
|
!strcasecmp(isa, "sse2-i32x8")) {
|
||||||
this->m_isa = Target::SSE2;
|
this->m_isa = Target::SSE2;
|
||||||
this->m_nativeVectorWidth = 4;
|
this->m_nativeVectorWidth = 4;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
|
this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt"
|
||||||
@@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "sse4-i32x4")) {
|
!strcasecmp(isa, "sse4-i32x4")) {
|
||||||
this->m_isa = Target::SSE4;
|
this->m_isa = Target::SSE4;
|
||||||
this->m_nativeVectorWidth = 4;
|
this->m_nativeVectorWidth = 4;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
// TODO: why not sse42 and popcnt?
|
// TODO: why not sse42 and popcnt?
|
||||||
@@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "sse4-i32x8")) {
|
!strcasecmp(isa, "sse4-i32x8")) {
|
||||||
this->m_isa = Target::SSE4;
|
this->m_isa = Target::SSE4;
|
||||||
this->m_nativeVectorWidth = 4;
|
this->m_nativeVectorWidth = 4;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
|
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
|
||||||
@@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "sse4-i8x16")) {
|
else if (!strcasecmp(isa, "sse4-i8x16")) {
|
||||||
this->m_isa = Target::SSE4;
|
this->m_isa = Target::SSE4;
|
||||||
this->m_nativeVectorWidth = 16;
|
this->m_nativeVectorWidth = 16;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 8;
|
this->m_dataTypeWidth = 8;
|
||||||
this->m_vectorWidth = 16;
|
this->m_vectorWidth = 16;
|
||||||
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
|
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
|
||||||
@@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "sse4-i16x8")) {
|
else if (!strcasecmp(isa, "sse4-i16x8")) {
|
||||||
this->m_isa = Target::SSE4;
|
this->m_isa = Target::SSE4;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 16;
|
this->m_dataTypeWidth = 16;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
|
this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov"
|
||||||
@@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "generic-x4")) {
|
!strcasecmp(isa, "generic-x4")) {
|
||||||
this->m_isa = Target::GENERIC;
|
this->m_isa = Target::GENERIC;
|
||||||
this->m_nativeVectorWidth = 4;
|
this->m_nativeVectorWidth = 4;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_maskingIsFree = true;
|
this->m_maskingIsFree = true;
|
||||||
this->m_maskBitCount = 1;
|
this->m_maskBitCount = 1;
|
||||||
@@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "generic-x8")) {
|
!strcasecmp(isa, "generic-x8")) {
|
||||||
this->m_isa = Target::GENERIC;
|
this->m_isa = Target::GENERIC;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_maskingIsFree = true;
|
this->m_maskingIsFree = true;
|
||||||
this->m_maskBitCount = 1;
|
this->m_maskBitCount = 1;
|
||||||
@@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "generic-x16")) {
|
!strcasecmp(isa, "generic-x16")) {
|
||||||
this->m_isa = Target::GENERIC;
|
this->m_isa = Target::GENERIC;
|
||||||
this->m_nativeVectorWidth = 16;
|
this->m_nativeVectorWidth = 16;
|
||||||
|
this->m_nativeVectorAlignment = 64;
|
||||||
this->m_vectorWidth = 16;
|
this->m_vectorWidth = 16;
|
||||||
this->m_maskingIsFree = true;
|
this->m_maskingIsFree = true;
|
||||||
this->m_maskBitCount = 1;
|
this->m_maskBitCount = 1;
|
||||||
@@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "generic-x32")) {
|
!strcasecmp(isa, "generic-x32")) {
|
||||||
this->m_isa = Target::GENERIC;
|
this->m_isa = Target::GENERIC;
|
||||||
this->m_nativeVectorWidth = 32;
|
this->m_nativeVectorWidth = 32;
|
||||||
|
this->m_nativeVectorAlignment = 64;
|
||||||
this->m_vectorWidth = 32;
|
this->m_vectorWidth = 32;
|
||||||
this->m_maskingIsFree = true;
|
this->m_maskingIsFree = true;
|
||||||
this->m_maskBitCount = 1;
|
this->m_maskBitCount = 1;
|
||||||
@@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "generic-x64")) {
|
!strcasecmp(isa, "generic-x64")) {
|
||||||
this->m_isa = Target::GENERIC;
|
this->m_isa = Target::GENERIC;
|
||||||
this->m_nativeVectorWidth = 64;
|
this->m_nativeVectorWidth = 64;
|
||||||
|
this->m_nativeVectorAlignment = 64;
|
||||||
this->m_vectorWidth = 64;
|
this->m_vectorWidth = 64;
|
||||||
this->m_maskingIsFree = true;
|
this->m_maskingIsFree = true;
|
||||||
this->m_maskBitCount = 1;
|
this->m_maskBitCount = 1;
|
||||||
@@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "generic-x1")) {
|
!strcasecmp(isa, "generic-x1")) {
|
||||||
this->m_isa = Target::GENERIC;
|
this->m_isa = Target::GENERIC;
|
||||||
this->m_nativeVectorWidth = 1;
|
this->m_nativeVectorWidth = 1;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_vectorWidth = 1;
|
this->m_vectorWidth = 1;
|
||||||
this->m_maskingIsFree = false;
|
this->m_maskingIsFree = false;
|
||||||
this->m_maskBitCount = 32;
|
this->m_maskBitCount = 32;
|
||||||
@@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "avx1-i32x4")) {
|
else if (!strcasecmp(isa, "avx1-i32x4")) {
|
||||||
this->m_isa = Target::AVX;
|
this->m_isa = Target::AVX;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov";
|
this->m_attributes = "+avx,+popcnt,+cmov";
|
||||||
@@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx1-i32x8")) {
|
!strcasecmp(isa, "avx1-i32x8")) {
|
||||||
this->m_isa = Target::AVX;
|
this->m_isa = Target::AVX;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov";
|
this->m_attributes = "+avx,+popcnt,+cmov";
|
||||||
@@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx1-i64x4")) {
|
!strcasecmp(isa, "avx1-i64x4")) {
|
||||||
this->m_isa = Target::AVX;
|
this->m_isa = Target::AVX;
|
||||||
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
|
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 64;
|
this->m_dataTypeWidth = 64;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov";
|
this->m_attributes = "+avx,+popcnt,+cmov";
|
||||||
@@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx1-i32x16")) {
|
!strcasecmp(isa, "avx1-i32x16")) {
|
||||||
this->m_isa = Target::AVX;
|
this->m_isa = Target::AVX;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 16;
|
this->m_vectorWidth = 16;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov";
|
this->m_attributes = "+avx,+popcnt,+cmov";
|
||||||
@@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx1.1-i32x8")) {
|
!strcasecmp(isa, "avx1.1-i32x8")) {
|
||||||
this->m_isa = Target::AVX11;
|
this->m_isa = Target::AVX11;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
|
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
|
||||||
@@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx1.1-i32x16")) {
|
!strcasecmp(isa, "avx1.1-i32x16")) {
|
||||||
this->m_isa = Target::AVX11;
|
this->m_isa = Target::AVX11;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 16;
|
this->m_vectorWidth = 16;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
|
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
|
||||||
@@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "avx1.1-i64x4")) {
|
else if (!strcasecmp(isa, "avx1.1-i64x4")) {
|
||||||
this->m_isa = Target::AVX11;
|
this->m_isa = Target::AVX11;
|
||||||
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
|
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 64;
|
this->m_dataTypeWidth = 64;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
|
this->m_attributes = "+avx,+popcnt,+cmov,+f16c"
|
||||||
@@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx2-i32x8")) {
|
!strcasecmp(isa, "avx2-i32x8")) {
|
||||||
this->m_isa = Target::AVX2;
|
this->m_isa = Target::AVX2;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
|
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
|
||||||
@@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "avx2-i32x16")) {
|
!strcasecmp(isa, "avx2-i32x16")) {
|
||||||
this->m_isa = Target::AVX2;
|
this->m_isa = Target::AVX2;
|
||||||
this->m_nativeVectorWidth = 16;
|
this->m_nativeVectorWidth = 16;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 16;
|
this->m_vectorWidth = 16;
|
||||||
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
|
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
|
||||||
@@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "avx2-i64x4")) {
|
else if (!strcasecmp(isa, "avx2-i64x4")) {
|
||||||
this->m_isa = Target::AVX2;
|
this->m_isa = Target::AVX2;
|
||||||
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
|
this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
this->m_dataTypeWidth = 64;
|
this->m_dataTypeWidth = 64;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
|
this->m_attributes = "+avx2,+popcnt,+cmov,+f16c"
|
||||||
@@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "neon-i8x16")) {
|
else if (!strcasecmp(isa, "neon-i8x16")) {
|
||||||
this->m_isa = Target::NEON8;
|
this->m_isa = Target::NEON8;
|
||||||
this->m_nativeVectorWidth = 16;
|
this->m_nativeVectorWidth = 16;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 8;
|
this->m_dataTypeWidth = 8;
|
||||||
this->m_vectorWidth = 16;
|
this->m_vectorWidth = 16;
|
||||||
this->m_attributes = "+neon,+fp16";
|
this->m_attributes = "+neon,+fp16";
|
||||||
@@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
else if (!strcasecmp(isa, "neon-i16x8")) {
|
else if (!strcasecmp(isa, "neon-i16x8")) {
|
||||||
this->m_isa = Target::NEON16;
|
this->m_isa = Target::NEON16;
|
||||||
this->m_nativeVectorWidth = 8;
|
this->m_nativeVectorWidth = 8;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 16;
|
this->m_dataTypeWidth = 16;
|
||||||
this->m_vectorWidth = 8;
|
this->m_vectorWidth = 8;
|
||||||
this->m_attributes = "+neon,+fp16";
|
this->m_attributes = "+neon,+fp16";
|
||||||
@@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
!strcasecmp(isa, "neon-i32x4")) {
|
!strcasecmp(isa, "neon-i32x4")) {
|
||||||
this->m_isa = Target::NEON32;
|
this->m_isa = Target::NEON32;
|
||||||
this->m_nativeVectorWidth = 4;
|
this->m_nativeVectorWidth = 4;
|
||||||
|
this->m_nativeVectorAlignment = 16;
|
||||||
this->m_dataTypeWidth = 32;
|
this->m_dataTypeWidth = 32;
|
||||||
this->m_vectorWidth = 4;
|
this->m_vectorWidth = 4;
|
||||||
this->m_attributes = "+neon,+fp16";
|
this->m_attributes = "+neon,+fp16";
|
||||||
|
|||||||
9
ispc.h
9
ispc.h
@@ -260,6 +260,8 @@ public:
|
|||||||
|
|
||||||
int getNativeVectorWidth() const {return m_nativeVectorWidth;}
|
int getNativeVectorWidth() const {return m_nativeVectorWidth;}
|
||||||
|
|
||||||
|
int getNativeVectorAlignment() const {return m_nativeVectorAlignment;}
|
||||||
|
|
||||||
int getDataTypeWidth() const {return m_dataTypeWidth;}
|
int getDataTypeWidth() const {return m_dataTypeWidth;}
|
||||||
|
|
||||||
int getVectorWidth() const {return m_vectorWidth;}
|
int getVectorWidth() const {return m_vectorWidth;}
|
||||||
@@ -332,6 +334,13 @@ private:
|
|||||||
SSE, 8 for AVX, etc.) */
|
SSE, 8 for AVX, etc.) */
|
||||||
int m_nativeVectorWidth;
|
int m_nativeVectorWidth;
|
||||||
|
|
||||||
|
/** Native vector alignment in bytes. Theoretically this may be derived
|
||||||
|
from the vector size, but it's better to manage the alignment directly.
|
||||||
|
It allows easier experimenting and better fine tuning for a particular
|
||||||
|
platform. This information is primarily used when
|
||||||
|
--opt=force-aligned-memory is used. */
|
||||||
|
int m_nativeVectorAlignment;
|
||||||
|
|
||||||
/** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
|
/** Data type with in bits. Typically it's 32, but could be 8, 16 or 64.
|
||||||
For generic it's -1, which means undefined. */
|
For generic it's -1, which means undefined. */
|
||||||
int m_dataTypeWidth;
|
int m_dataTypeWidth;
|
||||||
|
|||||||
115
llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
Normal file
115
llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Michael Liao <michael.hliao@gmail.com>
|
||||||
|
Date: Mon, 21 Oct 2013 17:47:58 -0700
|
||||||
|
Subject: [PATCH] Fix PR17631
|
||||||
|
|
||||||
|
- Skip instructions added in prolog. For specific targets, prolog may
|
||||||
|
insert helper function calls (e.g. _chkstk will be called when
|
||||||
|
there're more than 4K bytes allocated on stack). However, these
|
||||||
|
helpers don't use/def YMM/XMM registers.
|
||||||
|
It also includes a second fix for the problem: r196261+r196391.
|
||||||
|
|
||||||
|
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
|
||||||
|
index 477f75a..0d37a7d 100644
|
||||||
|
--- lib/Target/X86/X86VZeroUpper.cpp
|
||||||
|
+++ lib/Target/X86/X86VZeroUpper.cpp
|
||||||
|
@@ -121,7 +121,7 @@
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool clobbersAllYmmRegs(const MachineOperand &MO) {
|
||||||
|
- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
|
||||||
|
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
|
||||||
|
if (!MO.clobbersPhysReg(reg))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
@@ -143,6 +143,21 @@
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
+/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
|
||||||
|
+/// instruction.
|
||||||
|
+static bool clobbersAnyYmmReg(MachineInstr *MI) {
|
||||||
|
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
|
||||||
|
+ const MachineOperand &MO = MI->getOperand(i);
|
||||||
|
+ if (!MO.isRegMask())
|
||||||
|
+ continue;
|
||||||
|
+ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
|
||||||
|
+ if (MO.clobbersPhysReg(reg))
|
||||||
|
+ return true;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ return false;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
|
||||||
|
/// vzero upper instructions before function calls.
|
||||||
|
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
|
||||||
|
@@ -226,8 +241,9 @@
|
||||||
|
bool BBHasCall = false;
|
||||||
|
|
||||||
|
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
|
||||||
|
+ DebugLoc dl = I->getDebugLoc();
|
||||||
|
MachineInstr *MI = I;
|
||||||
|
- DebugLoc dl = I->getDebugLoc();
|
||||||
|
+
|
||||||
|
bool isControlFlow = MI->isCall() || MI->isReturn();
|
||||||
|
|
||||||
|
// Shortcut: don't need to check regular instructions in dirty state.
|
||||||
|
@@ -246,6 +262,14 @@
|
||||||
|
if (!isControlFlow)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
+ // If the call won't clobber any YMM register, skip it as well. It usually
|
||||||
|
+ // happens on helper function calls (such as '_chkstk', '_ftol2') where
|
||||||
|
+ // standard calling convention is not used (RegMask is not used to mark
|
||||||
|
+ // register clobbered and register usage (def/imp-def/use) is well-dfined
|
||||||
|
+ // and explicitly specified.
|
||||||
|
+ if (MI->isCall() && !clobbersAnyYmmReg(MI))
|
||||||
|
+ continue;
|
||||||
|
+
|
||||||
|
BBHasCall = true;
|
||||||
|
|
||||||
|
// The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
|
||||||
|
diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000..a572ff2
|
||||||
|
--- /dev/null
|
||||||
|
+++ test/CodeGen/X86/pr17631.ll
|
||||||
|
@@ -0,0 +1,34 @@
|
||||||
|
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
|
||||||
|
+
|
||||||
|
+%struct_type = type { [64 x <8 x float>], <8 x float> }
|
||||||
|
+
|
||||||
|
+; Function Attrs: nounwind readnone
|
||||||
|
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
|
||||||
|
+
|
||||||
|
+; Function Attrs: nounwind
|
||||||
|
+define i32 @equal(<8 x i32> %A) {
|
||||||
|
+allocas:
|
||||||
|
+ %first_alloc = alloca [64 x <8 x i32>]
|
||||||
|
+ %second_alloc = alloca %struct_type
|
||||||
|
+
|
||||||
|
+ %A1 = bitcast <8 x i32> %A to <8 x float>
|
||||||
|
+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
|
||||||
|
+ ret i32 %A2
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; CHECK: equal
|
||||||
|
+; CHECK-NOT: vzeroupper
|
||||||
|
+; CHECK: _chkstk
|
||||||
|
+; CHECK: ret
|
||||||
|
+
|
||||||
|
+define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
|
||||||
|
+ %i = fptoui double %x to i64
|
||||||
|
+ store i64 %i, i64* %p
|
||||||
|
+ %ret = fadd <8 x float> %y, %y
|
||||||
|
+ ret <8 x float> %ret
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+; CHECK: foo
|
||||||
|
+; CHECK-NOT: vzeroupper
|
||||||
|
+; CHECK: _ftol2
|
||||||
|
+; CHECK: ret
|
||||||
|
--
|
||||||
|
1.8.1.2
|
||||||
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Michael Liao <michael.hliao@gmail.com>
|
|
||||||
Date: Mon, 21 Oct 2013 17:47:58 -0700
|
|
||||||
Subject: [PATCH] Fix PR17631
|
|
||||||
|
|
||||||
- Skip instructions added in prolog. For specific targets, prolog may
|
|
||||||
insert helper function calls (e.g. _chkstk will be called when
|
|
||||||
there're more than 4K bytes allocated on stack). However, these
|
|
||||||
helpers don't use/def YMM/XMM registers.
|
|
||||||
---
|
|
||||||
lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++-
|
|
||||||
test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++
|
|
||||||
2 files changed, 32 insertions(+), 1 deletion(-)
|
|
||||||
create mode 100644 test/CodeGen/X86/pr17631.ll
|
|
||||||
|
|
||||||
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
|
|
||||||
index 477f75a..0d37a7d 100644
|
|
||||||
--- lib/Target/X86/X86VZeroUpper.cpp
|
|
||||||
+++ lib/Target/X86/X86VZeroUpper.cpp
|
|
||||||
@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
|
|
||||||
bool BBHasCall = false;
|
|
||||||
|
|
||||||
for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
|
|
||||||
- MachineInstr *MI = I;
|
|
||||||
DebugLoc dl = I->getDebugLoc();
|
|
||||||
+ MachineInstr *MI = I;
|
|
||||||
+
|
|
||||||
+ // Don't need to check instructions added in prolog.
|
|
||||||
+ // In prolog, special function calls may be added for specific targets
|
|
||||||
+ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local
|
|
||||||
+ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM
|
|
||||||
+ // registers.
|
|
||||||
+ if (MI->getFlag(MachineInstr::FrameSetup))
|
|
||||||
+ continue;
|
|
||||||
+
|
|
||||||
bool isControlFlow = MI->isCall() || MI->isReturn();
|
|
||||||
|
|
||||||
// Shortcut: don't need to check regular instructions in dirty state.
|
|
||||||
diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..a572ff2
|
|
||||||
--- /dev/null
|
|
||||||
+++ test/CodeGen/X86/pr17631.ll
|
|
||||||
@@ -0,0 +1,22 @@
|
|
||||||
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
|
|
||||||
+
|
|
||||||
+%struct_type = type { [64 x <8 x float>], <8 x float> }
|
|
||||||
+
|
|
||||||
+; Function Attrs: nounwind readnone
|
|
||||||
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
|
|
||||||
+
|
|
||||||
+; Function Attrs: nounwind
|
|
||||||
+define i32 @equal(<8 x i32> %A) {
|
|
||||||
+allocas:
|
|
||||||
+ %first_alloc = alloca [64 x <8 x i32>]
|
|
||||||
+ %second_alloc = alloca %struct_type
|
|
||||||
+
|
|
||||||
+ %A1 = bitcast <8 x i32> %A to <8 x float>
|
|
||||||
+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
|
|
||||||
+ ret i32 %A2
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+; CHECK: equal
|
|
||||||
+; CHECK-NOT: vzeroupper
|
|
||||||
+; CHECK: _chkstk
|
|
||||||
+; CHECK: ret
|
|
||||||
--
|
|
||||||
1.8.1.2
|
|
||||||
|
|
||||||
16
opt.cpp
16
opt.cpp
@@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|||||||
lCopyMetadata(castPtr, callInst);
|
lCopyMetadata(castPtr, callInst);
|
||||||
int align;
|
int align;
|
||||||
if (g->opt.forceAlignedMemory)
|
if (g->opt.forceAlignedMemory)
|
||||||
align = 0;
|
align = g->target->getNativeVectorAlignment();
|
||||||
else
|
else
|
||||||
align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
|
align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
|
||||||
name = LLVMGetName(callInst->getArgOperand(0), "_load");
|
name = LLVMGetName(callInst->getArgOperand(0), "_load");
|
||||||
@@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|||||||
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
|
new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
|
||||||
int align;
|
int align;
|
||||||
if (g->opt.forceAlignedMemory)
|
if (g->opt.forceAlignedMemory)
|
||||||
align = 0;
|
align = g->target->getNativeVectorAlignment();
|
||||||
else
|
else
|
||||||
align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
|
align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
|
||||||
storeInst->setAlignment(align);
|
storeInst->setAlignment(align);
|
||||||
@@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
|
|||||||
lCopyMetadata(lvalue, callInst);
|
lCopyMetadata(lvalue, callInst);
|
||||||
llvm::Instruction *store =
|
llvm::Instruction *store =
|
||||||
new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
|
new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
|
||||||
g->opt.forceAlignedMemory ? 0 : info->align);
|
g->opt.forceAlignedMemory ?
|
||||||
|
g->target->getNativeVectorAlignment() : info->align);
|
||||||
lCopyMetadata(store, callInst);
|
lCopyMetadata(store, callInst);
|
||||||
llvm::ReplaceInstWithInst(callInst, store);
|
llvm::ReplaceInstWithInst(callInst, store);
|
||||||
return true;
|
return true;
|
||||||
@@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
|
|||||||
callInst);
|
callInst);
|
||||||
llvm::Instruction *load =
|
llvm::Instruction *load =
|
||||||
new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
|
new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
|
||||||
g->opt.forceAlignedMemory ? 0 : info->align,
|
g->opt.forceAlignedMemory ?
|
||||||
|
g->target->getNativeVectorAlignment() : info->align,
|
||||||
(llvm::Instruction *)NULL);
|
(llvm::Instruction *)NULL);
|
||||||
lCopyMetadata(load, callInst);
|
lCopyMetadata(load, callInst);
|
||||||
llvm::ReplaceInstWithInst(callInst, load);
|
llvm::ReplaceInstWithInst(callInst, load);
|
||||||
@@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
|
|||||||
}
|
}
|
||||||
case 4: {
|
case 4: {
|
||||||
// 4-wide vector load
|
// 4-wide vector load
|
||||||
|
if (g->opt.forceAlignedMemory) {
|
||||||
|
align = g->target->getNativeVectorAlignment();
|
||||||
|
}
|
||||||
llvm::VectorType *vt =
|
llvm::VectorType *vt =
|
||||||
llvm::VectorType::get(LLVMTypes::Int32Type, 4);
|
llvm::VectorType::get(LLVMTypes::Int32Type, 4);
|
||||||
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
|
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
|
||||||
@@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps,
|
|||||||
}
|
}
|
||||||
case 8: {
|
case 8: {
|
||||||
// 8-wide vector load
|
// 8-wide vector load
|
||||||
|
if (g->opt.forceAlignedMemory) {
|
||||||
|
align = g->target->getNativeVectorAlignment();
|
||||||
|
}
|
||||||
llvm::VectorType *vt =
|
llvm::VectorType *vt =
|
||||||
llvm::VectorType::get(LLVMTypes::Int32Type, 8);
|
llvm::VectorType::get(LLVMTypes::Int32Type, 8);
|
||||||
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
|
loadOps[i].load = lGEPAndLoad(basePtr, start, align,
|
||||||
|
|||||||
4
perf.ini
4
perf.ini
@@ -10,7 +10,7 @@
|
|||||||
%****************************************************************************************************
|
%****************************************************************************************************
|
||||||
AOBench
|
AOBench
|
||||||
aobench
|
aobench
|
||||||
10 512 512
|
3 2048 2048
|
||||||
#***
|
#***
|
||||||
Deferred Shading
|
Deferred Shading
|
||||||
deferred
|
deferred
|
||||||
@@ -41,7 +41,7 @@ options
|
|||||||
#***
|
#***
|
||||||
Ray Tracer
|
Ray Tracer
|
||||||
rt
|
rt
|
||||||
sponza
|
sponza --scale=6.0
|
||||||
#***
|
#***
|
||||||
3D Stencil
|
3D Stencil
|
||||||
stencil
|
stencil
|
||||||
|
|||||||
Reference in New Issue
Block a user