diff --git a/Makefile b/Makefile
index b83714c9..08e487f9 100644
--- a/Makefile
+++ b/Makefile
@@ -72,7 +72,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
 TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
-	generic-16
+	generic-16 generic-1
 BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
 	builtins/dispatch.ll
 BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
diff --git a/ast.cpp b/ast.cpp
index 746bc0ec..bfbc71f6 100644
--- a/ast.cpp
+++ b/ast.cpp
@@ -98,6 +98,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
     StmtList *sl;
     PrintStmt *ps;
     AssertStmt *as;
+    DeleteStmt *dels;
 
     if ((es = dynamic_cast<ExprStmt *>(node)) != NULL)
         es->expr = (Expr *)WalkAST(es->expr, preFunc, postFunc, data);
@@ -160,6 +161,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         ps->values = (Expr *)WalkAST(ps->values, preFunc, postFunc, data);
     else if ((as = dynamic_cast<AssertStmt *>(node)) != NULL)
         as->expr = (Expr *)WalkAST(as->expr, preFunc, postFunc, data);
+    else if ((dels = dynamic_cast<DeleteStmt *>(node)) != NULL)
+        dels->expr = (Expr *)WalkAST(dels->expr, preFunc, postFunc, data);
     else
         FATAL("Unhandled statement type in WalkAST()");
 }
@@ -180,6 +183,7 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
     DereferenceExpr *dre;
     SizeOfExpr *soe;
     AddressOfExpr *aoe;
+    NewExpr *newe;
 
     if ((ue = dynamic_cast<UnaryExpr *>(node)) != NULL)
         ue->expr = (Expr *)WalkAST(ue->expr, preFunc, postFunc, data);
@@ -223,6 +227,12 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc,
         soe->expr = (Expr *)WalkAST(soe->expr, preFunc, postFunc, data);
     else if ((aoe = dynamic_cast<AddressOfExpr *>(node)) != NULL)
         aoe->expr = (Expr *)WalkAST(aoe->expr, preFunc, postFunc, data);
+    else if ((newe = dynamic_cast<NewExpr *>(node)) != NULL) {
+        newe->countExpr = (Expr *)WalkAST(newe->countExpr, preFunc,
+                                          postFunc, data);
+        newe->initExpr = (Expr *)WalkAST(newe->initExpr, preFunc,
+                                         postFunc, data);
+    }
     else if (dynamic_cast<SymbolExpr *>(node) != NULL ||
              dynamic_cast<ConstExpr *>(node) != NULL ||
             dynamic_cast<FunctionSymbolExpr *>(node) != NULL ||
@@ -305,3 +315,116 @@ EstimateCost(ASTNode *root) {
 
     return cost;
 }
+
+
+/** Given an AST node, check to see if it's safe if we happen to run the
+    code for that node with the execution mask all off.
+ */
+static bool
+lCheckAllOffSafety(ASTNode *node, void *data) {
+    bool *okPtr = (bool *)data;
+
+    if (dynamic_cast<FunctionCallExpr *>(node) != NULL) {
+        // FIXME: If we could somehow determine that the function being
+        // called was safe (and all of the arg Exprs were safe), then it'd
+        // be nice to be able to return true here.  (Consider a call to
+        // e.g. floatbits() in the stdlib.)  Unfortunately for now we just
+        // have to be conservative.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<AssertStmt *>(node) != NULL) {
+        // While it's fine to run the assert for varying tests, it's not
+        // desirable to check an assert on a uniform variable if all of the
+        // lanes are off.
+        *okPtr = false;
+        return false;
+    }
+
+    if (dynamic_cast<NewExpr *>(node) != NULL ||
+        dynamic_cast<DeleteStmt *>(node) != NULL) {
+        // We definitely don't want to run the uniform variants of these if
+        // the mask is all off.  It's also worth skipping the overhead of
+        // executing the varying versions of them in the all-off mask case.
+        *okPtr = false;
+        return false;
+    }
+
+    if (g->target.allOffMaskIsSafe == true)
+        // Don't worry about memory accesses if we have a target that can
+        // safely run them with the mask all off
+        return true;
+
+    IndexExpr *ie;
+    if ((ie = dynamic_cast<IndexExpr *>(node)) != NULL && ie->baseExpr != NULL) {
+        const Type *type = ie->baseExpr->GetType();
+        if (type == NULL)
+            return true;
+        if (dynamic_cast<const ReferenceType *>(type) != NULL)
+            type = type->GetReferenceTarget();
+
+        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
+        if (ce == NULL) {
+            // indexing with a variable... -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const PointerType *pointerType =
+            dynamic_cast<const PointerType *>(type);
+        if (pointerType != NULL) {
+            // pointer[index] -> can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        const SequentialType *seqType =
+            dynamic_cast<const SequentialType *>(type);
+        Assert(seqType != NULL);
+        int nElements = seqType->GetElementCount();
+        if (nElements == 0) {
+            // Unsized array, so we can't be sure -> not safe
+            *okPtr = false;
+            return false;
+        }
+
+        int32_t indices[ISPC_MAX_NVEC];
+        int count = ce->AsInt32(indices);
+        for (int i = 0; i < count; ++i) {
+            if (indices[i] < 0 || indices[i] >= nElements) {
+                // Index is out of bounds -> not safe
+                *okPtr = false;
+                return false;
+            }
+        }
+
+        // All indices are in-bounds
+        return true;
+    }
+
+    MemberExpr *me;
+    if ((me = dynamic_cast<MemberExpr *>(node)) != NULL &&
+        me->dereferenceExpr) {
+        *okPtr = false;
+        return false;
+    }
+
+    DereferenceExpr *de;
+    if ((de = dynamic_cast<DereferenceExpr *>(node)) != NULL) {
+        const Type *exprType = de->expr->GetType();
+        if (dynamic_cast<const PointerType *>(exprType) != NULL) {
+            *okPtr = false;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+
+bool
+SafeToRunWithMaskAllOff(ASTNode *root) {
+    bool safe = true;
+    WalkAST(root, lCheckAllOffSafety, NULL, &safe);
+    return safe;
+}
diff --git a/ast.h b/ast.h
index 0c3d4b64..0f73677b 100644
--- a/ast.h
+++ b/ast.h
@@ -144,4 +144,8 @@ extern Stmt *TypeCheck(Stmt *);
     the given root. */
 extern int EstimateCost(ASTNode *root);
 
+/** Returns true if it would be safe to run the given code with an "all
+    off" mask. */
+extern bool SafeToRunWithMaskAllOff(ASTNode *root);
+
 #endif // ISPC_AST_H
diff --git a/builtins.cpp b/builtins.cpp
index 76ebdfa7..dd910c9a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -391,6 +391,8 @@ lSetInternalFunctions(llvm::Module *module) {
         "__count_trailing_zeros_i64",
         "__count_leading_zeros_i32",
         "__count_leading_zeros_i64",
+        "__delete_uniform",
+        "__delete_varying",
         "__do_assert_uniform",
         "__do_assert_varying",
         "__do_print",
@@ -449,6 +451,9 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
+        "__new_uniform",
+        "__new_varying32",
+        "__new_varying64",
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",
@@ -794,6 +799,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                                builtins_bitcode_generic_16_length, module,
                                symbolTable);
             break;
+        case 1:
+            extern unsigned char builtins_bitcode_generic_1[];
+            extern int builtins_bitcode_generic_1_length;
+            AddBitcodeToModule(builtins_bitcode_generic_1,
+                               builtins_bitcode_generic_1_length,
+                               module, symbolTable);
+            break;
         default:
             FATAL("logic error in DefineStdlib");
         }
@@ -829,7 +841,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     // If the user wants the standard library to be included, parse the
     // serialized version of the stdlib.ispc file to get its
     // definitions added.
-    if (g->target.isa == Target::GENERIC) {
+    if (g->target.isa == Target::GENERIC && g->target.vectorWidth != 1) { // 1-wide uses x86 stdlib
         extern char stdlib_generic_code[];
         yy_scan_string(stdlib_generic_code);
         yyparse();
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
new file mode 100755
index 00000000..ad911e64
--- /dev/null
+++ b/builtins/target-generic-1.ll
@@ -0,0 +1,935 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Define the standard library builtins for the NOVEC target

define(`MASK',`i32')
define(`WIDTH',`1')
include(`util.m4')

; Define some basics for a 1-wide target
stdlib_core()
packed_load_and_store()
scans()
int64minmax()
aossoa()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

gen_masked_store(1, i8, 8)
gen_masked_store(1, i16, 16)
gen_masked_store(1, i32, 32)
gen_masked_store(1, i64, 64)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

load_and_broadcast(1, i8, 8)
load_and_broadcast(1, i16, 16)
load_and_broadcast(1, i32, 32)
load_and_broadcast(1, i64, 64)

masked_load(1, i8, 8, 1)
masked_load(1, i16, 16, 2)
masked_load(1, i32, 32, 4)
masked_load(1, i64, 64, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

; define these with the macros from stdlib.m4

gen_gather(1, i8)
gen_gather(1, i16)
gen_gather(1, i32)
gen_gather(1, i64)

gen_scatter(1, i8)
gen_scatter(1, i16)
gen_scatter(1, i32)
gen_scatter(1, i64)


define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> ,
                              <1 x i32> %mask) nounwind readnone alwaysinline {
;  %mv = trunc <1 x i32> %mask to <1 x i8>
;  %notmask = xor <1 x i8> %mv, <i8 -1>
;  %cleared_old = and <1 x i8> %0, %notmask
;  %masked_new = and <1 x i8> %1, %mv
;  %new = or <1 x i8> %cleared_old, %masked_new
;  ret <1 x i8> %new

  ; not doing this the easy way because of problems with LLVM's scalarizer
;  %cmp = icmp eq <1 x i32> %mask, <i32 0>
;  %sel = select <1 x i1> %cmp, <1 x i8> %0, <1 x i8> %1
  %m = extractelement <1 x i32> %mask, i32 0
  %cmp = icmp eq i32 %m, 0
  %d0 = extractelement <1 x i8> %0, i32 0
  %d1 = extractelement <1 x i8> %1, i32 0
  %sel = select i1 %cmp, i8 %d0, i8 %d1
  %r = insertelement <1 x i8> undef, i8 %sel, i32 0
  ret <1 x i8> %r
}

define <1 x i16> @__vselect_i16(<1 x i16>, <1 x i16> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
;  %mv = trunc <1 x i32> %mask to <1 x i16>
;  %notmask = xor <1 x i16> %mv, <i16 -1>
;  %cleared_old = and <1 x i16> %0, %notmask
;  %masked_new = and <1 x i16> %1, %mv
;  %new = or <1 x i16> %cleared_old, %masked_new
;  ret <1 x i16> %new
;  %cmp = icmp eq <1 x i32> %mask, <i32 0>
;  %sel = select <1 x i1> %cmp, <1 x i16> %0, <1 x i16> %1
  %m = extractelement <1 x i32> %mask, i32 0
  %cmp = icmp eq i32 %m, 0
  %d0 = extractelement <1 x i16> %0, i32 0
  %d1 = extractelement <1 x i16> %1, i32 0
  %sel = select i1 %cmp, i16 %d0, i16 %d1
  %r = insertelement <1 x i16> undef, i16 %sel, i32 0
  ret <1 x i16> %r

;  ret <1 x i16> %sel
}


define <1 x i32> @__vselect_i32(<1 x i32>, <1 x i32> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
;  %notmask = xor <1 x i32> %mask, <i32 -1>
;  %cleared_old = and <1 x i32> %0, %notmask
;  %masked_new = and <1 x i32> %1, %mask
;  %new = or <1 x i32> %cleared_old, %masked_new
;  ret <1 x i32> %new
;  %cmp = icmp eq <1 x i32> %mask, <i32 0>
;  %sel = select <1 x i1> %cmp, <1 x i32> %0, <1 x i32> %1
;  ret <1 x i32> %sel
  %m = extractelement <1 x i32> %mask, i32 0
  %cmp = icmp eq i32 %m, 0
  %d0 = extractelement <1 x i32> %0, i32 0
  %d1 = extractelement <1 x i32> %1, i32 0
  %sel = select i1 %cmp, i32 %d0, i32 %d1
  %r = insertelement <1 x i32> undef, i32 %sel, i32 0
  ret <1 x i32> %r
}

define <1 x i64> @__vselect_i64(<1 x i64>, <1 x i64> ,
                                <1 x i32> %mask) nounwind readnone alwaysinline {
;  %newmask = zext <1 x i32> %mask to <1 x i64>
;  %notmask = xor <1 x i64> %newmask, <i64 -1>
;  %cleared_old = and <1 x i64> %0, %notmask
;  %masked_new = and <1 x i64> %1, %newmask
;  %new = or <1 x i64> %cleared_old, %masked_new
;  ret <1 x i64> %new
;  %cmp = icmp eq <1 x i32> %mask, <i32 0>
;  %sel = select <1 x i1> %cmp, <1 x i64> %0, <1 x i64> %1
;  ret <1 x i64> %sel
  %m = extractelement <1 x i32> %mask, i32 0
  %cmp = icmp eq i32 %m, 0
  %d0 = extractelement <1 x i64> %0, i32 0
  %d1 = extractelement <1 x i64> %1, i32 0
  %sel = select i1 %cmp, i64 %d0, i64 %d1
  %r = insertelement <1 x i64> undef, i64 %sel, i32 0
  ret <1 x i64> %r
}

define <1 x float> @__vselect_float(<1 x float>, <1 x float>,
                                    <1 x i32> %mask) nounwind readnone alwaysinline {
;  %v0 = bitcast <1 x float> %0 to <1 x i32>
;  %v1 = bitcast <1 x float> %1 to <1 x i32>
;  %r = call <1 x i32> @__vselect_i32(<1 x i32> %v0, <1 x i32> %v1, <1 x i32> %mask)
;  %rf = bitcast <1 x i32> %r to <1 x float>
;  ret <1 x float> %rf
;  %cmp = icmp eq <1 x i32> %mask, <i32 0>
;  %sel = select <1 x i1> %cmp, <1 x float> %0, <1 x float> %1
;  ret <1 x float> %sel
  %m = extractelement <1 x i32> %mask, i32 0
  %cmp = icmp eq i32 %m, 0
  %d0 = extractelement <1 x float> %0, i32 0
  %d1 = extractelement <1 x float> %1, i32 0
  %sel = select i1 %cmp, float %d0, float %d1
  %r = insertelement <1 x float> undef, float %sel, i32 0
  ret <1 x float> %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

define void @__masked_store_blend_8(<1 x i8>* nocapture, <1 x i8>,
                                    <1 x i32> %mask) nounwind alwaysinline {
  %val = load <1 x i8> * %0, align 4
  %newval = call <1 x i8> @__vselect_i8(<1 x i8> %val, <1 x i8> %1, <1 x i32> %mask)
  store <1 x i8> %newval, <1 x i8> * %0, align 4
  ret void
}

define void @__masked_store_blend_16(<1 x i16>* nocapture, <1 x i16>,
                                     <1 x i32> %mask) nounwind alwaysinline {
  %val = load <1 x i16> * %0, align 4
  %newval = call <1 x i16> @__vselect_i16(<1 x i16> %val, <1 x i16> %1, <1 x i32> %mask)
  store <1 x i16> %newval, <1 x i16> * %0, align 4
  ret void
}


define void @__masked_store_blend_32(<1 x i32>* nocapture, <1 x i32>,
                                     <1 x i32> %mask) nounwind alwaysinline {
  %val = load <1 x i32> * %0, align 4
  %newval = call <1 x i32> @__vselect_i32(<1 x i32> %val, <1 x i32> %1, <1 x i32> %mask)
  store <1 x i32> %newval, <1 x i32> * %0, align 4
  ret void
}

define void @__masked_store_blend_64(<1 x i64>* nocapture, <1 x i64>,
                                     <1 x i32> %mask) nounwind alwaysinline {
  %val = load <1 x i64> * %0, align 4
  %newval = call <1 x i64> @__vselect_i64(<1 x i64> %val, <1 x i64> %1, <1 x i32> %mask)
  store <1 x i64> %newval, <1 x i64> * %0, align 4
  ret void
}

define i32 @__movmsk(<1 x i32>) nounwind readnone alwaysinline {
  %item = extractelement <1 x i32> %0, i32 0
  %v = lshr i32 %item, 31
  ret i32 %v
}



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;;
;; There are not any rounding instructions in SSE2, so we have to emulate
;; the functionality with multiple instructions...
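
; (Why the code below works: the xor clears the sign bit so the value is
; non-negative; since a float has 23 mantissa bits, adding 2^23 (0x1.0p23f)
; to a value of magnitude less than 2^23 leaves no bits of precision below
; 1.0, so the addition rounds the value to an integer.  Subtracting 2^23
; again yields the rounded value, and the final xor restores the original
; sign.)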

; The code for __round_* is the result of compiling the following source
; code.
;
; export float Round(float x) {
;    unsigned int sign = signbits(x);
;    unsigned int ix = intbits(x);
;    ix ^= sign;
;    x = floatbits(ix);
;    x += 0x1.0p23f;
;    x -= 0x1.0p23f;
;    ix = intbits(x);
;    ix ^= sign;
;    x = floatbits(ix);
;    return x;
;}

define <1 x float> @__round_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %float_to_int_bitcast.i.i.i.i = bitcast <1 x float> %0 to <1 x i32>
  %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, <i32 -2147483648>
  %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float>
  %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, <float 8.388608e+06>
  %binop21.i = fadd <1 x float> %binop.i, <float -8.388608e+06>
  %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32>
  %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i
  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to <1 x float>
  ret <1 x float> %int_to_float_bitcast.i.i.i
}

;; Similarly, for implementations of the __floor* functions below, we have the
;; bitcode from compiling the following source code...

;export float Floor(float x) {
;    float y = Round(x);
;    unsigned int cmp = y > x ? 0xffffffff : 0;
;    float delta = -1.f;
;    unsigned int idelta = intbits(delta);
;    idelta &= cmp;
;    delta = floatbits(idelta);
;    return y + delta;
;}

define <1 x float> @__floor_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
  %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 -1082130432>
  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <1 x float> %binop.i
}

;; And here is the code we compiled to get the __ceil* functions below
;
;export uniform float Ceil(uniform float x) {
;    uniform float y = Round(x);
;    uniform int yltx = y < x ? 0xffffffff : 0;
;    uniform float delta = 1.f;
;    uniform int idelta = intbits(delta);
;    idelta &= yltx;
;    delta = floatbits(idelta);
;    return y + delta;
;}

define <1 x float> @__ceil_varying_float(<1 x float>) nounwind readonly alwaysinline {
  %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind
  %bincmp.i = fcmp olt <1 x float> %calltmp.i, %0
  %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32>
  %bitop.i = and <1 x i32> %val_to_boolvec32.i, <i32 1065353216>
  %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float>
  %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i
  ret <1 x float> %binop.i
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles

; expecting math lib to provide this
declare double @ceil (double) nounwind readnone
declare double @floor (double) nounwind readnone
declare double @round (double) nounwind readnone
;declare float @llvm.sqrt.f32(float %Val)
declare double @llvm.sqrt.f64(double %Val)
declare float @llvm.sin.f32(float %Val)
declare float @llvm.cos.f32(float %Val)
declare float @llvm.sqrt.f32(float %Val)
declare float @llvm.exp.f32(float %Val)
declare float @llvm.log.f32(float %Val)
declare float @llvm.pow.f32(float %f, float %e)




;; stuff that could be in builtins ...
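
; The unary1to1 macro below scalarizes a unary operation for this 1-wide
; target: it extracts the single lane from the <1 x type> argument, calls
; the given scalar function on it, and reinserts the result into a 1-wide
; vector for the return value.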

define(`unary1to1', `
  %v_0 = extractelement <1 x $1> %0, i32 0
  %r_0 = call $1 $2($1 %v_0)
  %ret_0 = insertelement <1 x $1> undef, $1 %r_0, i32 0
  ret <1 x $1> %ret_0
')



;; dummy 1 wide vector ops
define void
@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
                      <1 x float> %v3, <1 x float> * noalias %out0,
                      <1 x float> * noalias %out1, <1 x float> * noalias %out2,
                      <1 x float> * noalias %out3) nounwind alwaysinline {

  store <1 x float> %v0, <1 x float > * %out0
  store <1 x float> %v1, <1 x float > * %out1
  store <1 x float> %v2, <1 x float > * %out2
  store <1 x float> %v3, <1 x float > * %out3

  ret void
}

define void
@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2,
                      <1 x float> %v3, <1 x float> * noalias %out0,
                      <1 x float> * noalias %out1, <1 x float> * noalias %out2,
                      <1 x float> * noalias %out3) nounwind alwaysinline {
  call void @__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1,
      <1 x float> %v2, <1 x float> %v3, <1 x float> * %out0,
      <1 x float> * %out1, <1 x float> * %out2, <1 x float> * %out3)
  ret void
}

define void
@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
                      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
                      <1 x float> * %out2) {
  store <1 x float> %v0, <1 x float > * %out0
  store <1 x float> %v1, <1 x float > * %out1
  store <1 x float> %v2, <1 x float > * %out2

  ret void
}

define void
@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1,
                      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
                      <1 x float> * %out2) {
  call void @__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1,
      <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1,
      <1 x float> * %out2)
  ret void
}


;; end builtins


define <1 x double> @__round_varying_double(<1 x double>) nounwind readonly alwaysinline {
  unary1to1(double, @round)
}

define <1 x double> @__floor_varying_double(<1 x double>) nounwind readonly alwaysinline {
  unary1to1(double, @floor)
}


define <1 x double> @__ceil_varying_double(<1 x double>) nounwind readonly alwaysinline {
  unary1to1(double, @ceil)
}

; To do vector integer min and max, we do the vector compare and then sign
; extend the i1 vector result to an i32 mask.  The __vselect does the
; rest...

define <1 x i32> @__min_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %c = icmp slt <1 x i32> %0, %1
  %mask = sext <1 x i1> %c to <1 x i32>
  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
  ret <1 x i32> %v
}

define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp slt i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
}

define <1 x i32> @__max_varying_int32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %c = icmp sgt <1 x i32> %0, %1
  %mask = sext <1 x i1> %c to <1 x i32>
  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
  ret <1 x i32> %v
}

define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp sgt i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
}

; The functions for unsigned ints are similar, just with unsigned
; comparison functions...

define <1 x i32> @__min_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %c = icmp ult <1 x i32> %0, %1
  %mask = sext <1 x i1> %c to <1 x i32>
  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
  ret <1 x i32> %v
}

define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp ult i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
}

define <1 x i32> @__max_varying_uint32(<1 x i32>, <1 x i32>) nounwind readonly alwaysinline {
  %c = icmp ugt <1 x i32> %0, %1
  %mask = sext <1 x i1> %c to <1 x i32>
  %v = call <1 x i32> @__vselect_i32(<1 x i32> %1, <1 x i32> %0, <1 x i32> %mask)
  ret <1 x i32> %v
}

define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
  %c = icmp ugt i32 %0, %1
  %r = select i1 %c, i32 %0, i32 %1
  ret i32 %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
  %call = call i32 @llvm.ctpop.i32(i32 %0)
  ret i32 %call
}

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
  %call = call i64 @llvm.ctpop.i64(i64 %0)
  ret i64 %call
}


define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
  %r = extractelement <1 x float> %v, i32 0
  ret float %r
}

define float @__reduce_min_float(<1 x float>) nounwind readnone {
  %r = extractelement <1 x float> %0, i32 0
  ret float %r
}

define float @__reduce_max_float(<1 x float>) nounwind readnone {
  %r = extractelement <1 x float> %0, i32 0
  ret float %r
}

define i32 @__reduce_add_int32(<1 x i32> %v) nounwind readnone {
  %r = extractelement <1 x i32> %v, i32 0
  ret i32 %r
}

define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
  %r = extractelement <1 x i32> %0, i32 0
  ret i32 %r
}

define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
  %r = extractelement <1 x i32> %0, i32 0
  ret i32 %r
}

define i32 @__reduce_add_uint32(<1 x i32> %v) nounwind readnone {
  %r = call i32 @__reduce_add_int32(<1 x i32> %v)
  ret i32 %r
}

define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
  %r = extractelement <1 x i32> %0, i32 0
  ret i32 %r
}

define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
  %r = extractelement <1 x i32> %0, i32 0
  ret i32 %r
}


define double @__reduce_add_double(<1 x double>) nounwind readnone {
  %m = extractelement <1 x double> %0, i32 0
  ret double %m
}

define double @__reduce_min_double(<1 x double>) nounwind readnone {
  %m = extractelement <1 x double> %0, i32 0
  ret double %m
}

define double @__reduce_max_double(<1 x double>) nounwind readnone {
  %m = extractelement <1 x double> %0, i32 0
  ret double %m
}

define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
  %m = extractelement <1 x i64> %0, i32 0
  ret i64 %m
}

define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
  %m = extractelement <1 x i64> %0, i32 0
  ret i64 %m
}

define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
  %m = extractelement <1 x i64> %0, i32 0
  ret i64 %m
}

define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
  %m = extractelement <1 x i64> %0, i32 0
  ret i64 %m
}

define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
  %m = extractelement <1 x i64> %0, i32 0
  ret i64 %m
}
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %v = extractelement <1 x i32> %vv, i32 0
  store i32 %v, i32 * %samevalue
  ret i1 true
}

define i1 @__reduce_equal_float(<1 x float> %vv, float * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %v = extractelement <1 x float> %vv, i32 0
  store float %v, float * %samevalue
  ret i1 true
}

define i1 @__reduce_equal_int64(<1 x i64> %vv, i64 * %samevalue,
                                <1 x i32> %mask) nounwind alwaysinline {
  %v = extractelement <1 x i64> %vv, i32 0
  store i64 %v, i64 * %samevalue
  ret i1 true
}

define i1 @__reduce_equal_double(<1 x double> %vv, double * %samevalue,
                                 <1 x i32> %mask) nounwind alwaysinline {
  %v = extractelement <1 x double> %vv, i32 0
  store double %v, double * %samevalue
  ret i1 true
}

; extracting/reinserting elements because I want to be able to remove vectors later on

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

define <1 x float> @__rcp_varying_float(<1 x float>) nounwind readonly alwaysinline {
  ;%call = call <1 x float> @llvm.x86.sse.rcp.ps(<1 x float> %0)
  ; do one N-R iteration to improve precision
  ;  float iv = __rcp_v(v);
  ;  return iv * (2. - v * iv);
  ;%v_iv = fmul <1 x float> %0, %call
  ;%two_minus = fsub <1 x float> <float 2.>, %v_iv
  ;%iv_mul = fmul <1 x float> %call, %two_minus
  ;ret <1 x float> %iv_mul
  %d = extractelement <1 x float> %0, i32 0
  %r = fdiv float 1., %d
  %rv = insertelement <1 x float> undef, float %r, i32 0
  ret <1 x float> %rv
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt

define <1 x float> @__sqrt_varying_float(<1 x float>) nounwind readonly alwaysinline {
  ;%call = call <1 x float> @llvm.x86.sse.sqrt.ps(<1 x float> %0)
  ;ret <1 x float> %call
  %d = extractelement <1 x float> %0, i32 0
  %r = call float @llvm.sqrt.f32(float %d)
  %rv = insertelement <1 x float> undef, float %r, i32 0
  ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt

define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alwaysinline {
  ;  float is = __rsqrt_v(v);
  ;%is = call <1 x float> @llvm.x86.sse.rsqrt.ps(<1 x float> %v)
  ; Newton-Raphson iteration to improve precision
  ;  return 0.5 * is * (3. - (v * is) * is);
  ;%v_is = fmul <1 x float> %v, %is
  ;%v_is_is = fmul <1 x float> %v_is, %is
  ;%three_sub = fsub <1 x float> <float 3.>, %v_is_is
  ;%is_mul = fmul <1 x float> %is, %three_sub
  ;%half_scale = fmul <1 x float> <float 0.5>, %is_mul
  ;ret <1 x float> %half_scale
  %s = call <1 x float> @__sqrt_varying_float(<1 x float> %v)
  %r = call <1 x float> @__rcp_varying_float(<1 x float> %s)
  ret <1 x float> %r
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; svml stuff

define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
  ;%s = call float @llvm.sin.f32(float %r)
  ;%rv = insertelement <1 x float> undef, float %r, i32 0
  ;ret <1 x float> %rv
  unary1to1(float, @llvm.sin.f32)
}

define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
  ;%s = call float @llvm.cos.f32(float %r)
  ;%rv = insertelement <1 x float> undef, float %r, i32 0
  ;ret <1 x float> %rv
  unary1to1(float, @llvm.cos.f32)
}

define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
;  %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
;  store <1 x float> %s, <1 x float> * %1
;  ret void
  %sin = call <1 x float> @__svml_sin (<1 x float> %0)
  %cos = call <1 x float> @__svml_cos (<1 x float> %0)
  store <1 x float> %sin, <1 x float> * %1
  store <1 x float> %cos, <1 x float> * %2
  ret void
}

define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
  ;%s = call float @llvm_tan_f32(float %r)
  ;%rv = insertelement <1 x float> undef, float %r, i32 0
  ;ret <1 x float> %rv
  ;unasry1to1(float, @llvm.tan.f32)
  ; UNSUPPORTED!
  ret <1 x float > %0
}

define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
;  ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
  ;%s = call float @llvm_atan_f32(float %r)
  ;%rv = insertelement <1 x float> undef, float %r, i32 0
  ;ret <1 x float> %rv
  ;unsary1to1(float,@llvm.atan.f32)
  ;UNSUPPORTED!
  ret <1 x float > %0
}

define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
  ;ret <1 x float> %ret
  ;%y = extractelement <1 x float> %0, i32 0
  ;%x = extractelement <1 x float> %1, i32 0
  ;%q = fdiv float %y, %x
  ;%a = call float @llvm.atan.f32 (float %q)
  ;%rv = insertelement <1 x float> undef, float %a, i32 0
  ;ret <1 x float> %rv
  ; UNSUPPORTED!
  ret <1 x float > %0
}

define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
  ;ret <1 x float> %ret
  unary1to1(float, @llvm.exp.f32)
}

define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
  ;ret <1 x float> %ret
  unary1to1(float, @llvm.log.f32)
}

define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
  ;ret <1 x float> %ret
  %r = extractelement <1 x float> %0, i32 0
  %e = extractelement <1 x float> %1, i32 0
  %s = call float @llvm.pow.f32(float %r, float %e)
  %rv = insertelement <1 x float> undef, float %s, i32 0
  ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max

define <1 x float> @__max_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
;  %call = call <1 x float> @llvm.x86.sse.max.ps(<1 x float> %0, <1 x float> %1)
;  ret <1 x float> %call
  %a = extractelement <1 x float> %0, i32 0
  %b = extractelement <1 x float> %1, i32 0
  %d = fcmp ogt float %a, %b
  %r = select i1 %d, float %a, float %b
  %rv = insertelement <1 x float> undef, float %r, i32 0
  ret <1 x float> %rv
}

define <1 x float> @__min_varying_float(<1 x float>, <1 x float>) nounwind readonly alwaysinline {
;  %call = call <1 x float> @llvm.x86.sse.min.ps(<1 x float> %0, <1 x float> %1)
;  ret <1 x float> %call
  %a = extractelement <1 x float> %0, i32 0
  %b = extractelement <1 x float> %1, i32 0
  %d = fcmp olt float %a, %b
  %r = select i1 %d, float %a, float %b
  %rv = insertelement <1 x float> undef, float %r, i32 0
  ret <1 x float> %rv
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt

;declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone

define <1 x double> @__sqrt_varying_double(<1 x double>) nounwind alwaysinline {
  ;unarya2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
  ;ret <1 x double> %ret
  unary1to1(double, @llvm.sqrt.f64)
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max

;declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
;declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone

define <1 x double> @__min_varying_double(<1 x double>, <1 x double>) nounwind readnone {
  ;binarsy2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
  ;ret <1 x double> %ret
  %a = extractelement <1 x double> %0, i32 0
  %b = extractelement <1 x double> %1, i32 0
  %d = fcmp olt double %a, %b
  %r = select i1 %d, double %a, double %b
  %rv = insertelement <1 x double> undef, double %r, i32 0
  ret <1 x double> %rv
}

define <1 x double> @__max_varying_double(<1 x double>, <1 x double>) nounwind readnone {
  ;binary2sto4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
  ;ret <1 x double> %ret
  %a = extractelement <1 x double> %0, i32 0
  %b = extractelement <1 x double> %1, i32 0
  %d = fcmp ogt double %a, %b
  %r = select i1 %d, double %a, double %b
  %rv = insertelement <1 x double> undef, double %r, i32 0
  ret <1 x double> %rv
}


define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
;    uniform float iv = extract(__rcp_u(v), 0);
;    return iv * (2. - v * iv);
  %r = fdiv float 1., %0
  ret float %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

define float @__round_uniform_float(float) nounwind readonly alwaysinline {
  ; roundss, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
  ; the roundss intrinsic is a total mess--docs say:
  ;
  ;  __m128 _mm_round_ss (__m128 a, __m128 b, const int c)
  ;
  ;  b is a 128-bit parameter. The lowest 32 bits are the result of the rounding function
  ;  on b0. The higher order 96 bits are copied directly from input parameter a. The
  ;  return value is described by the following equations:
  ;
  ;  r0 = RND(b0)
  ;  r1 = a1
  ;  r2 = a2
  ;  r3 = a3
  ;
  ;  It doesn't matter what we pass as a, since we only need the r0 value
  ;  here.  So we pass the same register for both.
  %v = insertelement <1 x float> undef, float %0, i32 0
  %rv = call <1 x float> @__round_varying_float(<1 x float> %v)
  %r = extractelement <1 x float> %rv, i32 0
  ret float %r
}

define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
  %v = insertelement <1 x float> undef, float %0, i32 0
  %rv = call <1 x float> @__floor_varying_float(<1 x float> %v)
  %r = extractelement <1 x float> %rv, i32 0
  ret float %r
}

define float @__ceil_uniform_float(float) nounwind readonly alwaysinline {
  %v = insertelement <1 x float> undef, float %0, i32 0
  %rv = call <1 x float> @__ceil_varying_float(<1 x float> %v)
  %r = extractelement <1 x float> %rv, i32 0
  ret float %r
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles


define double @__round_uniform_double(double) nounwind readonly alwaysinline {
  %rs = call double @round(double %0)
  ret double %rs
}

define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
  %rs = call double @floor(double %0)
  ret double %rs
}

define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
  %rs = call double @ceil(double %0)
  ret double %rs
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; sqrt


define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
  %ret = call float @llvm.sqrt.f32(float %0)
  ret float %ret
}

define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
  %ret = call double @llvm.sqrt.f64(double %0)
  ret double %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt


define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
  %s = call float @__sqrt_uniform_float(float %0)
  %r = call float @__rcp_uniform_float(float %s)
  ret float %r
}




;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath


define void @__fastmath() nounwind alwaysinline {
  ; no-op
  ret void
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float min/max


define float @__max_uniform_float(float, float) nounwind readonly alwaysinline {
  %d = fcmp ogt float %0, %1
  %r = select i1 %d, float %0, float %1
  ret float %r
}

define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
  %d = fcmp olt float %0, %1
  %r = select i1 %d, float %0, float %1
  ret float %r
}

define double @__max_uniform_double(double, double) nounwind readonly alwaysinline {
  %d = fcmp ogt double %0, %1
  %r = select i1 %d, double %0, double %1
  ret double %r
}
define double @__min_uniform_double(double, double) nounwind readonly alwaysinline {
  %d = fcmp olt double %0, %1
  %r = select i1 %d, double %0, double %1
  ret double %r
}

define_shuffles()

ctlztz()

define_prefetches()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; half conversion routines

declare float @__half_to_float_uniform(i16 %v) nounwind readnone
declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
declare i16 @__float_to_half_uniform(float %v) nounwind readnone
declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone

diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 6e280ba6..e4c70aa4 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -98,6 +98,14 @@ declare void @__aos_to_soa4_float(float * noalias %p,
                                   <WIDTH x float> * noalias %out1,
                                   <WIDTH x float> * noalias %out2,
                                   <WIDTH x float> * noalias %out3) nounwind
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; math
diff --git a/builtins/util.m4 b/builtins/util.m4
index 36882491..7c022e94 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1805,10 +1805,69 @@ ok:
     ret void
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; new/delete
+
+declare i8 * @malloc(i64)
+declare void @free(i8 *)
+
+define i8 * @__new_uniform(i64 %size) {
+  %a = call i8 * @malloc(i64 %size)
+  ret i8 * %a
+}
+
+define <WIDTH x i64> @__new_varying32(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
+  %ret = alloca <WIDTH x i64>
+  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
+  %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
+
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %sz_LANE_ID = extractelement <WIDTH x i32> %size, i32 LANE
+  %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64
+  %ptr_LANE_ID = call i8 * @malloc(i64 %sz64_LANE_ID)
+  %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
+  %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
+  store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')
+
+  %r = load <WIDTH x i64> * %ret
+  ret <WIDTH x i64> %r
+}
+
+define <WIDTH x i64> @__new_varying64(<WIDTH x i64> %size, <WIDTH x MASK> %mask) {
+  %ret = alloca <WIDTH x i64>
+  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
+  %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
+
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %sz_LANE_ID = extractelement <WIDTH x i64> %size, i32 LANE
+  %ptr_LANE_ID = call i8 * @malloc(i64 %sz_LANE_ID)
+  %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64
+  %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
+  store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID')
+
+  %r = load <WIDTH x i64> * %ret
+  ret <WIDTH x i64> %r
+}
+
+define void @__delete_uniform(i8 * %ptr) {
+  call void @free(i8 * %ptr)
+  ret void
+}
+
+define void @__delete_varying(<WIDTH x i64> %ptr, <WIDTH x MASK> %mask) {
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+      %iptr_LANE_ID = extractelement <WIDTH x i64> %ptr, i32 LANE
+      %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 *
+      call void @free(i8 * %ptr_LANE_ID)
+  ')
+  ret void
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; read hw clock
 
-define i64 @__clock() nounwind uwtable ssp {
+define i64 @__clock() nounwind {
 entry:
   tail call void asm sideeffect "xorl %eax,%eax \0A    cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind
   %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind
@@ -2187,9 +2246,9 @@ return:
 define(`gen_masked_store', `
 define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>,
                               <$1 x i32>) nounwind alwaysinline {
   per_lane($1, <$1 x i32> %2, `
-      %ptr_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
-      %storeval_ID = extractelement <$1 x $2> %1, i32 LANE
-      store $2 %storeval_ID, $2 * %ptr_ID')
+      %ptr_LANE_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
+      %storeval_LANE_ID = extractelement <$1 x $2> %1, i32 LANE
+      store $2 %storeval_LANE_ID, $2 * %ptr_LANE_ID')
   ret void
 }
 ')
@@ -2644,7 +2703,7 @@ pl_known_mask:
 pl_all_on:
   ;; the mask is all on--just expand the code for each lane sequentially
   forloop(i, 0, eval($1-1),
-          `patsubst(`$3', `ID\|LANE', i)')
+          `patsubst(`$3', `LANE', i)')
   br label %pl_done
 
 pl_unknown_mask:
@@ -2806,11 +2865,11 @@ define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs,
                                 <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   %ret_ptr = alloca <$1 x $2>
   per_lane($1, <$1 x i32> %vecmask, `
-  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
-  %val_ID = load $2 * %ptr_ID
-  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
-  store $2 %val_ID, $2 * %store_ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = load $2 * %ptr_LANE_ID
+  %store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
+  store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
  ')
 
   %ret = load <$1 x $2> * %ret_ptr
@@ -2822,11 +2881,11 @@ define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
                                 <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   %ret_ptr = alloca <$1 x $2>
   per_lane($1, <$1 x i32> %vecmask, `
-  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
-  %val_ID = load $2 * %ptr_ID
-  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
-  store $2 %val_ID, $2 * %store_ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = load $2 * %ptr_LANE_ID
+  %store_ptr_LANE_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
+  store $2 %val_LANE_ID, $2 * %store_ptr_LANE_ID
  ')
 
   %ret = load <$1 x $2> * %ret_ptr
@@ -2910,10 +2969,10 @@ define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, i32 %of
 define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
                             <$1 x i32> %mask) nounwind alwaysinline {
   per_lane($1, <$1 x i32> %mask, `
-  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
-  %val_ID = extractelement <$1 x $2> %values, i32 LANE
-  store $2 %val_ID, $2 * %ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i32> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
+  store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
  ')
   ret void
 }
@@ -2922,10 +2981,10 @@ define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
 define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values,
                             <$1 x i32> %mask) nounwind alwaysinline {
   per_lane($1, <$1 x i32> %mask, `
-  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
-  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
-  %val_ID = extractelement <$1 x $2> %values, i32 LANE
-  store $2 %val_ID, $2 * %ptr_ID
+  %iptr_LANE_ID = extractelement <$1 x i64> %ptrs, i32 LANE
+  %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $2 *
+  %val_LANE_ID = extractelement <$1 x $2> %values, i32 LANE
+  store $2 %val_LANE_ID, $2 * %ptr_LANE_ID
  ')
   ret void
 }
diff --git a/cbackend.cpp b/cbackend.cpp
index b800d4ac..314b53d6 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -2114,7 +2114,8 @@ bool CWriter::doInitialization(Module &M) {
           I->getName() == "memset" || I->getName() == "memset_pattern16" ||
           I->getName() == "puts" || I->getName() == "printf" ||
           I->getName() == "putchar" ||
-          I->getName() == "fflush")
+          I->getName() == "fflush" || I->getName() == "malloc" ||
+          I->getName() == "free")
         continue;
 
       // Don't redeclare ispc's own intrinsics
@@ -3437,6 +3438,9 @@ void CWriter::visitCallInst(CallInst &I) {
       Callee = RF;
     }
 
+    if (Callee->getName() == "malloc")
+      Out << "(uint8_t *)";
+
     if (NeedsCast) {
       // Ok, just cast the pointer type.
       Out << "((";
diff --git a/ctx.cpp b/ctx.cpp
index 0a7dd6d0..41178a5b 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -642,12 +642,12 @@ FunctionEmitContext::inSwitchStatement() const {
 
 void
 FunctionEmitContext::Break(bool doCoherenceCheck) {
-    Assert(controlFlowInfo.size() > 0);
     if (breakTarget == NULL) {
         Error(currentPos, "\"break\" statement is illegal outside of "
               "for/while/do loops and \"switch\" statements.");
        return;
     }
+    Assert(controlFlowInfo.size() > 0);
 
     if (bblock == NULL)
         return;
@@ -721,6 +721,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
               "for/while/do/foreach loops.");
         return;
     }
+    Assert(controlFlowInfo.size() > 0);
 
     if (ifsInCFAllUniform(CFInfo::Loop) ||
         GetInternalMask() == LLVMMaskAllOn) {
         // Similarly to 'break' statements, we can immediately jump to the
@@ -1279,7 +1280,11 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
 
 llvm::Value *
 FunctionEmitContext::GetStringPtr(const std::string &str) {
+#ifdef LLVM_3_1svn
+    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
+#else
     llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str);
+#endif
     llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
     llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(),
                                                     true /*isConst*/,
@@ -1329,7 +1334,11 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
 
 static llvm::Value *
 lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
+#ifdef LLVM_3_1svn
+    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s);
+#else
     llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s);
+#endif
     llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(),
                                                  true /* const */,
                                                  llvm::GlobalValue::InternalLinkage,
@@ -2923,7 +2932,7 @@ FunctionEmitContext::SyncInst() {
 
 
 /** When we gathering from or scattering to a varying atomic type, we need
-    to add an appropraite offset to the final address for each lane right
+    to add an appropriate offset to the final address for each lane right
     before we use it.  Given a varying pointer we're about to use and its
     type, this function determines whether these offsets are needed and
     returns an updated pointer that incorporates these offsets if needed.
diff --git a/decl.cpp b/decl.cpp
index 5ec58462..f5c0eb88 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -113,6 +113,12 @@ DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) {
 
 const Type *
 DeclSpecs::GetBaseType(SourcePos pos) const {
     const Type *bt = baseType;
+
+    if (bt == NULL) {
+        Warning(pos, "No type specified in declaration.  Assuming int32.");
+        bt = AtomicType::UnboundInt32;
+    }
+
     if (vectorSize > 0) {
         const AtomicType *atomicType = dynamic_cast<const AtomicType *>(bt);
         if (atomicType == NULL) {
@@ -171,6 +177,11 @@ Declarator::Declarator(DeclaratorKind dk, SourcePos p)
 void
 Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
     const Type *t = GetType(ds);
+    if (t == NULL) {
+        Assert(m->errorCount > 0);
+        return;
+    }
+
     Symbol *sym = GetSymbol();
     if (sym != NULL) {
         sym->type = t;
@@ -248,8 +259,10 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
     // already have been added to the symbol table by AddGlobal() by the
     // time we get here.)
     Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (funSym != NULL)
+    if (funSym == NULL)
+        // May be NULL due to error earlier in compilation
+        Assert(m->errorCount > 0);
+    else
         funSym->pos = pos;
 
     // Walk down to the declarator for the function.  (We have to get past
@@ -262,11 +275,18 @@ Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
 
     for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
         Symbol *sym = d->GetSymbolForFunctionParameter(i);
-        sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+        if (sym->type == NULL) {
+            Assert(m->errorCount > 0);
+            continue;
+        }
+        else
+            sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
+
         funArgs->push_back(sym);
     }
 
-    funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
+    if (funSym != NULL)
+        funSym->type = funSym->type->ResolveUnboundVariability(Type::Varying);
 
     return funSym;
 }
@@ -331,6 +351,16 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
         break;
 
     case DK_ARRAY:
+        if (type == AtomicType::Void) {
+            Error(pos, "Arrays of \"void\" type are illegal.");
+            return NULL;
+        }
+        if (dynamic_cast<const ReferenceType *>(type)) {
+            Error(pos, "Arrays of references (type \"%s\") are illegal.",
+                  type->GetString().c_str());
+            return NULL;
+        }
+
         type = new ArrayType(type, arraySize);
         if (child)
             return child->GetType(type, ds);
@@ -357,6 +387,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                       "function parameter declaration for parameter \"%s\".",
                       lGetStorageClassName(d->declSpecs->storageClass),
                       sym->name.c_str());
 
+            if (sym->type == AtomicType::Void) {
+                Error(sym->pos, "Parameter with type \"void\" illegal in function "
+                      "parameter list.");
+                sym->type = NULL;
+            }
 
             const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
             if (at != NULL) {
@@ -368,8 +403,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
                 // report this differently than it was originally declared
                 // in the function, but it's not clear that this is a
                 // significant problem.)
-                sym->type = PointerType::GetUniform(at->GetElementType());
+                if (at->GetElementType() == NULL) {
+                    Assert(m->errorCount > 0);
+                    return NULL;
+                }
+                sym->type = PointerType::GetUniform(at->GetElementType());
 
                 // Make sure there are no unsized arrays (other than the
                 // first dimension) in function parameter lists.
                 at = dynamic_cast<const ArrayType *>(at->GetElementType());
@@ -413,6 +452,10 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             Error(pos, "No return type provided in function declaration.");
             return NULL;
         }
+        if (dynamic_cast<const FunctionType *>(returnType) != NULL) {
+            Error(pos, "Illegal to return function type from function.");
+            return NULL;
+        }
 
         bool isExported = ds && (ds->storageClass == SC_EXPORT);
         bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
@@ -434,6 +477,11 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             return NULL;
         }
 
+        if (child == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+
         const Type *functionType =
             new FunctionType(returnType, args, argNames, argDefaults,
                              argPos, isTask, isExported, isExternC);
@@ -536,14 +584,23 @@ Declaration::GetVariableDeclarations() const {
 
     for (unsigned int i = 0; i < declarators.size(); ++i) {
         Declarator *decl = declarators[i];
-        if (decl == NULL)
+        if (decl == NULL) {
+            // Ignore earlier errors
+            Assert(m->errorCount > 0);
             continue;
+        }
 
         Symbol *sym = decl->GetSymbol();
+        if (sym == NULL || sym->type == NULL) {
+            // Ignore errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
         sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
 
-        if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
+        if (sym->type == AtomicType::Void)
+            Error(sym->pos, "\"void\" type variable illegal in declaration.");
+        else if (dynamic_cast<const FunctionType *>(sym->type) == NULL) {
             m->symbolTable->AddVariable(sym);
             vars.push_back(VariableDeclaration(sym, decl->initExpr));
         }
@@ -558,11 +615,18 @@ Declaration::DeclareFunctions() {
 
     for (unsigned int i = 0; i < declarators.size(); ++i) {
         Declarator *decl = declarators[i];
-        if (decl == NULL)
+        if (decl == NULL) {
+            // Ignore earlier errors
+            Assert(m->errorCount > 0);
             continue;
+        }
 
         Symbol *sym = decl->GetSymbol();
+        if (sym == NULL || sym->type == NULL) {
+            // Ignore errors
+            Assert(m->errorCount > 0);
+            continue;
+        }
         sym->type = sym->type->ResolveUnboundVariability(Type::Varying);
 
         if (dynamic_cast<const FunctionType *>(sym->type) == NULL)
@@ -610,6 +674,9 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
 
         Symbol *sym = d->GetSymbol();
 
+        if (sym->type == AtomicType::Void)
+            Error(d->pos, "\"void\" type illegal for struct member.");
+
         const ArrayType *arrayType =
             dynamic_cast<const ArrayType *>(sym->type);
         if (arrayType != NULL && arrayType->GetElementCount() == 0) {
diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt
index 14b42c25..5e67e901 100644
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -1,3 +1,42 @@
+=== v1.1.4 === (4 February 2012)
+
+There are two major bugfixes for Windows in this release.  First, a number
+of failures in AVX code generation on Windows have been fixed; AVX on
+Windows now has no known issues.  Second, a longstanding bug in parsing
+64-bit integer constants on Windows has been fixed.
+
+This release features a new experimental scalar target, contributed by
+Gabe Weisz.  This target ("--target=generic-1") compiles gangs of single
+program instances (i.e. programCount == 1); it can be useful for debugging
+ispc programs.
+
+The compiler now supports dynamic memory allocation in ispc programs (with
+"new" and "delete" operators based on C++).  See
+http://ispc.github.com/ispc.html#dynamic-memory-allocation in the
+documentation for more information.
+
+ispc now performs "short circuit" evaluation of the || and && logical
+operators and the ? : selection operator.  (This represents the correction
+of a major incompatibility with C.)
Code like "(index < arraySize && +array[index] == 1)" thus now executes as in C, where "array[index]" won't +be evaluated unless "index" is less than "arraySize". + +The standard library now provides "local" atomic operations, which are +atomic across the gang of program instances (but not across other gangs or +other hardware threads. See the updated documentation on atomics for more +information: +http://ispc.github.com/ispc.html#atomic-operations-and-memory-fences. + +The standard library now offers a clock() function, which returns a uniform +int64 value that counts processor cycles; it can be used for +fine-resolution timing measurements. + +Finally (of limited interest now): ispc now supports the forthcoming AVX2 +instruction set, due with Haswell-generation CPUs. All tests and examples +compile and execute correctly with AVX2. (Thanks specifically to Craig +Topper and Nadav Rotem for work on AVX2 support in LLVM, which made this +possible.) + === v1.1.3 === (20 January 2012) With this release, the language now supports "switch" statements, with the diff --git a/docs/ispc.rst b/docs/ispc.rst index c0dcd6df..aa15158d 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -96,6 +96,9 @@ Contents: + `Declarations and Initializers`_ + `Expressions`_ + + * `Dynamic Memory Allocation`_ + + `Control Flow`_ * `Conditional Statements: "if"`_ @@ -1148,6 +1151,7 @@ in C: * Structs and arrays * Support for recursive function calls * Support for separate compilation of source files +* "Short-circuit" evaluation of ``||``, ``&&`` and ``? :`` operators * The preprocessor ``ispc`` adds a number of features from C++ and C99 to this base: @@ -1162,6 +1166,7 @@ in C: * The ``inline`` qualifier to indicate that a function should be inlined * Function overloading by parameter type * Hexadecimal floating-point constants +* Dynamic memory allocation with ``new`` and ``delete``. ``ispc`` also adds a number of new features that aren't in C89, C99, or C++: @@ -1180,7 +1185,6 @@ C++: There are a number of features of C89 that are not supported in ``ispc`` but are likely to be supported in future releases: -* Short circuiting of logical operations * There are no types named ``char``, ``short``, or ``long`` (or ``long double``). However, there are built-in ``int8``, ``int16``, and ``int64`` types @@ -1965,19 +1969,137 @@ operator also work as expected. (*fp).a = 0; fp->b = 1; +As in C and C++, evaluation of the ``||`` and ``&&`` logical operators as +well as the selection operator ``? :`` is "short-circuited"; the right hand +side won't be evaluated if the value from the left-hand side determines the +logical operator's value. For example, in the following code, +``array[index]`` won't be evaluated for values of ``index`` that are +greater than or equal to ``NUM_ITEMS``. + +:: + + if (index < NUM_ITEMS && array[index] > 0) { + // ... + } + + +Dynamic Memory Allocation +------------------------- + +``ispc`` programs can dynamically allocate (and free) memory, using syntax +based on C++'s ``new`` and ``delete`` operators: + +:: + + int count = ...; + int *ptr = new uniform int[count]; + // use ptr... + delete[] ptr; + +In the above code, each program instance allocates its own ``count`-sized +array of ``uniform int`` values, uses that memory, and then deallocates +that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are +serviced by corresponding calls the system C library's ``malloc()`` and +``free()`` functions. + +After a pointer has been deleted, it is illegal to access the memory it +points to. 
+However, note that deletion happens on a per-program-instance basis.  In
+other words, consider the following code:
+
+::
+
+    int *ptr = new uniform int[count];
+    // use ptr
+    if (count > 1000)
+        delete[] ptr;
+    // ...
+
+Here, the program instances where ``count`` is greater than 1000 have
+deleted the dynamically allocated memory pointed to by ``ptr``, but the
+other program instances have not.  As such, it's illegal for the former set
+of program instances to access ``*ptr``, but it's perfectly fine for the
+latter set to continue to use the memory ``ptr`` points to.  Note that it
+is illegal to delete a pointer value returned by ``new`` more than one
+time.
+
+Sometimes, it's useful to be able to do a single allocation for the entire
+gang of program instances.  A ``new`` statement can be qualified with
+``uniform`` to indicate a single memory allocation:
+
+::
+
+    float * uniform ptr = uniform new float[10];
+
+While a regular call to ``new`` returns a ``varying`` pointer (i.e. a
+distinct pointer to separately-allocated memory for each program instance),
+a ``uniform new`` performs a single allocation and returns a ``uniform``
+pointer.
+
+When using ``uniform new``, it's important to be aware of a subtlety: if
+the returned pointer is stored in a varying pointer variable (as may be
+appropriate and useful for the particular program being written), then the
+varying pointer may inadvertently be passed to a subsequent ``delete``
+statement, which is an error.  Effectively:
+
+::
+
+    float *ptr = uniform new float[10];
+    // use ptr...
+    delete ptr;   // ERROR: varying pointer is deleted
+
+In this case, ``ptr`` will be deleted multiple times, once for each
+executing program instance, which is an error (unless it happens that only
+a single program instance is active in the above code.)
+
+When using ``new`` statements, it's important to make an appropriate choice
+of ``uniform`` or ``varying`` (as always, the default), for both the
+``new`` operator itself as well as the type of data being allocated, based
+on the program's needs.  Consider the following four memory allocations:
+
+::
+
+    uniform float * uniform p1 = uniform new uniform float[10];
+    float * uniform p2 = uniform new float[10];
+    uniform float * p3 = new uniform float[10];
+    float * p4 = new float[10];
+
+Assuming that a ``float`` is 4 bytes in memory and that the gang size is 8
+program instances, the first allocation represents a single allocation of
+40 bytes, the second is a single allocation of 8*4*10 = 320 bytes, the
+third is 8 allocations of 40 bytes, and the last performs 8 allocations of
+320 bytes each.
+
+Note in particular that varying allocations of varying data types (like
+``p4`` above) are rarely desirable in practice: each program instance
+performs a separate allocation of ``varying float`` memory, but each
+instance is likely to access only a single element of each ``varying
+float``, which is wasteful.
+
+Although ``ispc`` doesn't support constructors or destructors like C++, it
+is possible to provide initializer values with ``new`` statements:
+
+::
+
+    struct Point { float x, y, z; };
+    Point *pptr = new Point(10, 20, 30);
+
+Here for example, the "x" element of the returned ``Point`` is initialized
+to have the value 10, and so forth.  In general, initializer values
+provided in ``new`` statements are used to initialize complex data types
+following the same rules as initializers for variables, described in
+`Declarations and Initializers`_.
+
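+As a minimal sketch that ties these pieces together (the ``usePoint``
+function name here is illustrative only), each program instance can
+allocate, initialize, use, and free its own structure:
+
+::
+
+    struct Point { float x, y, z; };
+
+    void usePoint() {
+        Point *pptr = new Point(10, 20, 30);  // per-instance allocation
+        float sum = pptr->x + pptr->y + pptr->z;
+        // ... use sum ...
+        delete pptr;  // each instance deletes the pointer it allocated
+    }
+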
 Control Flow
 ------------

 ``ispc`` supports most of C's control flow constructs, including ``if``,
-``for``, ``while``, ``do``. It also supports variants of C's control flow
+``switch``, ``for``, ``while``, ``do``. It has limited support for
+``goto``, detailed below. It also supports variants of C's control flow
 constructs that provide hints about the expected runtime coherence of the
 control flow at that statement. It also provides parallel looping
 constructs, ``foreach`` and ``foreach_tiled``, all of which will be
 detailed in this section.

-``ispc`` does not currently support ``switch`` statements or ``goto``.
-
 Conditional Statements: "if"
 ----------------------------

@@ -3267,24 +3389,53 @@ Systems Programming Support

 Atomic Operations and Memory Fences
 -----------------------------------

-The usual range of atomic memory operations are provided in ``ispc``,
-including variants to handle both uniform and varying types. As a first
-example, consider on variant of the 32-bit integer atomic add routine:
+The standard range of atomic memory operations is provided by the ``ispc``
+standard library, including variants to handle both uniform and varying
+types as well as "local" and "global" atomics.
+
+Local atomics provide atomic behavior across the program instances in a
+gang, but not across multiple gangs or memory operations in different
+hardware threads. To see why they are needed, consider a histogram
+calculation where each program instance in the gang computes which bucket a
+value lies in and then increments a corresponding counter. If the code is
+written like this:

 ::

-    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)
+    uniform int count[N_BUCKETS] = ...;
+    float value = ...;
+    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS-1);
+    ++count[bucket]; // ERROR: undefined behavior if collisions

-The semantics are the expected ones for an atomic add function: the pointer
-points to a single location in memory (the same one for all program
-instances), and for each executing program instance, the value stored in
-the location that ``ptr`` points to has that program instance's value
-"delta" added to it atomically, and the old value at that location is
-returned from the function. (Thus, if multiple processors simultaneously
-issue atomic adds to the same memory location, the adds will be serialized
-by the hardware so that the correct result is computed in the end.
-Furthermore, the atomic adds are serialized across the running program
-instances.)
+then the program's behavior is undefined: whenever multiple program
+instances have values that map to the same value of ``bucket``, the effect
+of the increment is undefined. (See the discussion in the `Data
+Races Within a Gang`_ section; in the case here, there isn't a sequence
+point between one program instance updating ``count[bucket]`` and the other
+program instance reading its value.)
+
+The ``atomic_add_local()`` function can be used in this case; as a local
+atomic, it is atomic across the gang of program instances, such that the
+expected result is computed.
+
+::
+
+    ...
+    int bucket = clamp(value / N_BUCKETS, 0, N_BUCKETS-1);
+    atomic_add_local(&count[bucket], 1);
+
+This call uses the following variant of the 32-bit integer atomic add
+routine:
+
+::
+
+    int32 atomic_add_local(uniform int32 * uniform ptr, int32 delta)
+
+The semantics of this routine are typical for an atomic add function: the
+pointer here points to a single location in memory (the same one for all
+program instances), and for each executing program instance, the value
+stored in the location that ``ptr`` points to has that program instance's
+value "delta" added to it atomically, and the old value at that location is
+returned from the function.

 One thing to note is that the type of the value being added to is a
 ``uniform`` integer, while the increment amount and the return value are
@@ -3295,45 +3446,76 @@
 atomics for the running program instances may be issued in arbitrary order;
 it's not guaranteed that they will be issued in ``programIndex`` order, for
 example.

-Here are the declarations of the ``int32`` variants of these functions.
-There are also ``int64`` equivalents as well as variants that take
-``unsigned`` ``int32`` and ``int64`` values. (The ``atomic_swap_global()``
-function can be used with ``float`` and ``double`` types as well.)
+Global atomics are more powerful than local atomics; they are atomic across
+both the program instances in the gang as well as atomic across different
+gangs and different hardware threads. For example, for the global variant
+of the atomic used above,

 ::

-    int32 atomic_add_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_subtract_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_min_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_max_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_and_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_or_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_xor_global(uniform int32 * uniform ptr, int32 value)
-    int32 atomic_swap_global(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_add_global(uniform int32 * uniform ptr, int32 delta)

-There are also variants of these functions that take ``uniform`` values for
-the operand and return a ``uniform`` result. These correspond to a single
+if multiple processors simultaneously issue atomic adds to the same memory
+location, the adds will be serialized by the hardware so that the correct
+result is computed in the end.
+
+Here are the declarations of the ``int32`` variants of these functions.
+There are also ``int64`` equivalents as well as variants that take
+``unsigned`` ``int32`` and ``int64`` values.
+
+::
+
+    int32 atomic_add_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_min_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_max_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_and_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_or_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, int32 value)
+    int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, int32 value)
+
+Support for ``float`` and ``double`` types is also available. For local
+atomics, all but the logical operations are available. (There are
+corresponding ``double`` variants of these, not listed here.)
+ +:: + + float atomic_add_local(uniform float * uniform ptr, float value) + float atomic_subtract_local(uniform float * uniform ptr, float value) + float atomic_min_local(uniform float * uniform ptr, float value) + float atomic_max_local(uniform float * uniform ptr, float value) + float atomic_swap_local(uniform float * uniform ptr, float value) + +For global atomics, only atomic swap is available for these types: + +:: + + float atomic_swap_global(uniform float * uniform ptr, float value) + double atomic_swap_global(uniform double * uniform ptr, double value) + +There are also variants of the atomic that take ``uniform`` values for the +operand and return a ``uniform`` result. These correspond to a single atomic operation being performed for the entire gang of program instances, rather than one per program instance. :: - uniform int32 atomic_add_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_subtract_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_min_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_max_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_and_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_or_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_xor_global(uniform int32 * uniform ptr, - uniform int32 value) - uniform int32 atomic_swap_global(uniform int32 * uniform ptr, - uniform int32 newval) + uniform int32 atomic_add_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_subtract_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_min_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_max_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_and_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_or_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_xor_{local,global}(uniform int32 * uniform ptr, + uniform int32 value) + uniform int32 atomic_swap_{local,global}(uniform int32 * uniform ptr, + uniform int32 newval) Be careful that you use the atomic function that you mean to; consider the following code: @@ -3357,8 +3539,7 @@ will cause the desired atomic add function to be called. :: extern uniform int32 counter; - int32 one = 1; - int32 myCounter = atomic_add_global(&counter, one); + int32 myCounter = atomic_add_global(&counter, (varying int32)1); There is a third variant of each of these atomic functions that takes a ``varying`` pointer; this allows each program instance to issue an atomic @@ -3368,30 +3549,27 @@ the same location in memory!) 
:: - int32 atomic_add_global(uniform int32 * varying ptr, int32 value) - int32 atomic_subtract_global(uniform int32 * varying ptr, int32 value) - int32 atomic_min_global(uniform int32 * varying ptr, int32 value) - int32 atomic_max_global(uniform int32 * varying ptr, int32 value) - int32 atomic_and_global(uniform int32 * varying ptr, int32 value) - int32 atomic_or_global(uniform int32 * varying ptr, int32 value) - int32 atomic_xor_global(uniform int32 * varying ptr, int32 value) - int32 atomic_swap_global(uniform int32 * varying ptr, int32 value) + int32 atomic_add_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_subtract_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_min_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_max_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_and_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_or_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_xor_{local,global}(uniform int32 * varying ptr, int32 value) + int32 atomic_swap_{local,global}(uniform int32 * varying ptr, int32 value) -There are also atomic swap and "compare and exchange" functions. -Compare and exchange atomically compares the value in "val" to -"compare"--if they match, it assigns "newval" to "val". In either case, -the old value of "val" is returned. (As with the other atomic operations, -there are also ``unsigned`` and 64-bit variants of this function. -Furthermore, there are ``float`` and ``double`` variants as well.) +There are also atomic "compare and exchange" functions. Compare and +exchange atomically compares the value in "val" to "compare"--if they +match, it assigns "newval" to "val". In either case, the old value of +"val" is returned. (As with the other atomic operations, there are also +``unsigned`` and 64-bit variants of this function. Furthermore, there are +``float`` and ``double`` variants as well.) :: - int32 atomic_swap_global(uniform int32 * uniform ptr, int32 newvalue) - uniform int32 atomic_swap_global(uniform int32 * uniform ptr, - uniform int32 newvalue) - int32 atomic_compare_exchange_global(uniform int32 * uniform ptr, - int32 compare, int32 newval) - uniform int32 atomic_compare_exchange_global(uniform int32 * uniform ptr, + int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr, + int32 compare, int32 newval) + uniform int32 atomic_compare_exchange_{local,global}(uniform int32 * uniform ptr, uniform int32 compare, uniform int32 newval) ``ispc`` also has a standard library routine that inserts a memory barrier diff --git a/doxygen.cfg b/doxygen.cfg index 75d925df..30c097de 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.1.3 +PROJECT_NUMBER = 1.1.4 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. 
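As a sketch of what the compare-and-exchange functions documented above
enable (this is an illustration, not something the patch itself adds; the
helper name is invented, and it assumes the ``float`` variant of
``atomic_compare_exchange_global()`` mentioned in the text), a global
floating-point atomic add can be emulated with the usual retry loop, since
only swap and compare-exchange are provided globally for ``float``:

::

    // Illustrative sketch; not part of the patch. Retry loop: keep
    // attempting the update until no other writer has raced with us
    // between the read and the compare-and-exchange.
    static inline float
    atomicAddGlobalFloat(uniform float * uniform ptr, float delta) {
        float oldVal, newVal;
        do {
            oldVal = *ptr;
            newVal = oldVal + delta;
        } while (atomic_compare_exchange_global(ptr, oldVal, newVal) != oldVal);
        return oldVal;
    }

Each program instance retries until its own exchange succeeds; because the
atomic operations are serialized across program instances, every pending
addition is eventually applied.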
diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc index ffd85d29..61c2dc7d 100644 --- a/examples/aobench/ao.ispc +++ b/examples/aobench/ao.ispc @@ -212,104 +212,44 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, RNGState rngstate; seed_rng(&rngstate, y0); + float invSamples = 1.f / nsubsamples; - // Compute the mapping between the 'programCount'-wide program - // instances running in parallel and samples in the image. - // - // For now, we'll always take four samples per pixel, so start by - // initializing du and dv with offsets into subpixel samples. We'll - // take care of further updating du and dv for the case where we're - // doing more than 4 program instances in parallel shortly. - uniform float uSteps[4] = { 0, 1, 0, 1 }; - uniform float vSteps[4] = { 0, 0, 1, 1 }; - float du = uSteps[programIndex % 4] / nsubsamples; - float dv = vSteps[programIndex % 4] / nsubsamples; + foreach_tiled(y = y0 ... y1, x = 0 ... w, + u = 0 ... nsubsamples, v = 0 ... nsubsamples) { + float du = (float)u * invSamples, dv = (float)v * invSamples; - // Now handle the case where we are able to do more than one pixel's - // worth of work at once. nx records the number of pixels in the x - // direction we do per iteration and ny the number in y. - uniform int nx = 1, ny = 1; + // Figure out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; - // FIXME: We actually need ny to be 1 regardless of the decomposition, - // since the task decomposition is one scanline high. + ray.org = 0.f; - if (programCount == 8) { - // Do two pixels at once in the x direction - nx = 2; - if (programIndex >= 4) - // And shift the offsets for the second pixel's worth of work - ++du; - } - else if (programCount == 16) { - nx = 4; - ny = 1; - if (programIndex >= 4 && programIndex < 8) - ++du; - if (programIndex >= 8 && programIndex < 12) - du += 2; - if (programIndex >= 12) - du += 3; - } + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); - // Now loop over all of the pixels, stepping in x and y as calculated - // above. (Assumes that ny divides y and nx divides x...) - for (uniform int y = y0; y < y1; y += ny) { - for (uniform int x = 0; x < w; x += nx) { - // Figure out x,y pixel in NDC - float px = (x + du - (w / 2.0f)) / (w / 2.0f); - float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); - float ret = 0.f; - Ray ray; - Isect isect; + isect.t = 1.0e+17; + isect.hit = 0; - ray.org = 0.f; + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); - // Poor man's perspective projection - ray.dir.x = px; - ray.dir.y = py; - ray.dir.z = -1.0; - vnormalize(ray.dir); + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) { + ret = ambient_occlusion(isect, plane, spheres, rngstate); + ret *= invSamples * invSamples; - isect.t = 1.0e+17; - isect.hit = 0; - - for (uniform int snum = 0; snum < 3; ++snum) - ray_sphere_intersect(isect, ray, spheres[snum]); - ray_plane_intersect(isect, ray, plane); - - // Note use of 'coherent' if statement; the set of rays we - // trace will often all hit or all miss the scene - cif (isect.hit) - ret = ambient_occlusion(isect, plane, spheres, rngstate); - - // This is a little grungy; we have results for - // programCount-worth of values. 
Because we're doing 2x2 - // subsamples, we need to peel them off in groups of four, - // average the four values for each pixel, and update the - // output image. - // - // Store the varying value to a uniform array of the same size. - // See the discussion about communication among program - // instances in the ispc user's manual for more discussion on - // this idiom. - uniform float retArray[programCount]; - retArray[programIndex] = ret; - - // offset to the first pixel in the image - uniform int offset = 3 * (y * w + x); - for (uniform int p = 0; p < programCount; p += 4, offset += 3) { - // Get the four sample values for this pixel - uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + - retArray[p+3]; - - // Normalize by number of samples taken - sumret /= nsubsamples * nsubsamples; - - // Store result in the image - image[offset+0] = sumret; - image[offset+1] = sumret; - image[offset+2] = sumret; - } + int offset = 3 * (y * w + x); + atomic_add_local(&image[offset], ret); + atomic_add_local(&image[offset+1], ret); + atomic_add_local(&image[offset+2], ret); } } } diff --git a/examples/common.mk b/examples/common.mk index 5c5377c0..a79e3b93 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -14,7 +14,7 @@ CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o) $(TASK_OBJ)) default: $(EXAMPLE) -all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 +all: $(EXAMPLE) $(EXAMPLE)-sse4 $(EXAMPLE)-generic16 $(EXAMPLE)-scalar .PHONY: dirs clean @@ -57,3 +57,9 @@ objs/$(ISPC_SRC:.ispc=)_generic16.o: objs/$(ISPC_SRC:.ispc=)_generic16.cpp $(EXAMPLE)-generic16: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_generic16.o $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) + +objs/$(ISPC_SRC:.ispc=)_scalar.o: $(ISPC_SRC) + $(ISPC) $< -o $@ --target=generic-1 + +$(EXAMPLE)-scalar: $(CPP_OBJS) objs/$(ISPC_SRC:.ispc=)_scalar.o + $(CXX) $(CXXFLAGS) -o $@ $^ $(LIBS) diff --git a/examples/deferred/kernels.ispc b/examples/deferred/kernels.ispc index 8117e8a9..ae0542b2 100644 --- a/examples/deferred/kernels.ispc +++ b/examples/deferred/kernels.ispc @@ -158,38 +158,22 @@ IntersectLightsWithTileMinMax( uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; - // Parallize across frustum planes. - // We really only have four side planes here, but write the code to - // handle programCount > 4 robustly - uniform float frustumPlanes_xy[programCount]; - uniform float frustumPlanes_z[programCount]; + uniform float frustumPlanes_xy[4] = { + -(cameraProj_11 * gBufferScale_x), + (cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y), + -(cameraProj_22 * gBufferScale_y) }; + uniform float frustumPlanes_z[4] = { + tileEndX - gBufferScale_x, + -tileStartX + gBufferScale_x, + tileEndY - gBufferScale_y, + -tileStartY + gBufferScale_y }; - // TODO: If programIndex < 4 here? Don't care about masking off the - // rest but if interleaving ("x2" modes) the other lanes should ideally - // not be emitted... - { - // This one is totally constant over the whole screen... worth pulling it up at all? 
- float frustumPlanes_xy_v; - frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x)); - frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_11 * gBufferScale_x)); - frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 2, (cameraProj_22 * gBufferScale_y)); - frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 3, -(cameraProj_22 * gBufferScale_y)); - - float frustumPlanes_z_v; - frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileEndX - gBufferScale_x); - frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, -tileStartX + gBufferScale_x); - frustumPlanes_z_v = insert(frustumPlanes_z_v, 2, tileEndY - gBufferScale_y); - frustumPlanes_z_v = insert(frustumPlanes_z_v, 3, -tileStartY + gBufferScale_y); - - // Normalize - float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + - frustumPlanes_z_v * frustumPlanes_z_v); - frustumPlanes_xy_v *= norm; - frustumPlanes_z_v *= norm; - - // Save out for uniform use later - frustumPlanes_xy[programIndex] = frustumPlanes_xy_v; - frustumPlanes_z[programIndex] = frustumPlanes_z_v; + for (uniform int i = 0; i < 4; ++i) { + uniform float norm = rsqrt(frustumPlanes_xy[i] * frustumPlanes_xy[i] + + frustumPlanes_z[i] * frustumPlanes_z[i]); + frustumPlanes_xy[i] *= norm; + frustumPlanes_z[i] *= norm; } uniform int32 tileNumLights = 0; @@ -601,30 +585,20 @@ SplitTileMinMax( uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; uniform float gBufferScale_y = 0.5f * (float)gBufferHeight; - // Parallize across frustum planes - // Only have 2 frustum split planes here so may not be worth it, but - // we'll do it for now for consistency - uniform float frustumPlanes_xy[programCount]; - uniform float frustumPlanes_z[programCount]; - - // This one is totally constant over the whole screen... worth pulling it up at all? 
- float frustumPlanes_xy_v; - frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 0, -(cameraProj_11 * gBufferScale_x)); - frustumPlanes_xy_v = insert(frustumPlanes_xy_v, 1, (cameraProj_22 * gBufferScale_y)); - - float frustumPlanes_z_v; - frustumPlanes_z_v = insert(frustumPlanes_z_v, 0, tileMidX - gBufferScale_x); - frustumPlanes_z_v = insert(frustumPlanes_z_v, 1, tileMidY - gBufferScale_y); + uniform float frustumPlanes_xy[2] = { -(cameraProj_11 * gBufferScale_x), + (cameraProj_22 * gBufferScale_y) }; + uniform float frustumPlanes_z[2] = { tileMidX - gBufferScale_x, + tileMidY - gBufferScale_y }; // Normalize - float norm = rsqrt(frustumPlanes_xy_v * frustumPlanes_xy_v + - frustumPlanes_z_v * frustumPlanes_z_v); - frustumPlanes_xy_v *= norm; - frustumPlanes_z_v *= norm; - - // Save out for uniform use later - frustumPlanes_xy[programIndex] = frustumPlanes_xy_v; - frustumPlanes_z[programIndex] = frustumPlanes_z_v; + uniform float norm[2] = { rsqrt(frustumPlanes_xy[0] * frustumPlanes_xy[0] + + frustumPlanes_z[0] * frustumPlanes_z[0]), + rsqrt(frustumPlanes_xy[1] * frustumPlanes_xy[1] + + frustumPlanes_z[1] * frustumPlanes_z[1]) }; + frustumPlanes_xy[0] *= norm[0]; + frustumPlanes_xy[1] *= norm[1]; + frustumPlanes_z[0] *= norm[0]; + frustumPlanes_z[1] *= norm[1]; // Initialize uniform int32 subtileLightOffset[4]; diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 7418f5d6..861db2a4 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1106,7 +1106,7 @@ GATHER_GENERAL(__vec16_i64, int64_t, __vec16_i64, __gather64_i64) // scatter -#define SCATTER_BASE_VARYINGOFFSET(VTYPE, STYPE, OTYPE, FUNC) \ +#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ uint32_t scale, OTYPE constOffset, \ VTYPE val, __vec16_i1 mask) { \ diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 7a3af6ad..c6299893 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -941,10 +941,8 @@ static FORCEINLINE __vec4_i1 __not_equal(__vec4_i32 a, __vec4_i32 b) { } static FORCEINLINE __vec4_i1 __unsigned_less_equal(__vec4_i32 a, __vec4_i32 b) { - a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); - b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); - return _mm_or_si128(_mm_cmplt_epi32(a.v, b.v), - _mm_cmpeq_epi32(a.v, b.v)); + // a<=b == (min(a,b) == a) + return _mm_cmpeq_epi32(_mm_min_epu32(a.v, b.v), a.v); } static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) { @@ -953,10 +951,8 @@ static FORCEINLINE __vec4_i1 __signed_less_equal(__vec4_i32 a, __vec4_i32 b) { } static FORCEINLINE __vec4_i1 __unsigned_greater_equal(__vec4_i32 a, __vec4_i32 b) { - a.v = _mm_xor_si128(a.v, _mm_set1_epi32(0x80000000)); - b.v = _mm_xor_si128(b.v, _mm_set1_epi32(0x80000000)); - return _mm_or_si128(_mm_cmpgt_epi32(a.v, b.v), - _mm_cmpeq_epi32(a.v, b.v)); + // a>=b == (max(a,b) == a) + return _mm_cmpeq_epi32(_mm_max_epu32(a.v, b.v), a.v); } static FORCEINLINE __vec4_i1 __signed_greater_equal(__vec4_i32 a, __vec4_i32 b) { diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index 92dc50f0..4ce5d354 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -273,7 +273,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { #else void *result; #if (ISPC_POINTER_BYTES == 4) - __asm__ __volatile__("lock\ncmpxchgd %2,%1" + __asm__ __volatile__("lock\ncmpxchgl %2,%1" : "=a"(result), "=m"(*v) : "q"(newValue), 
"0"(oldValue) : "memory"); diff --git a/expr.cpp b/expr.cpp index c424f3c2..ec242fea 100644 --- a/expr.cpp +++ b/expr.cpp @@ -211,6 +211,21 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, return false; } + if (dynamic_cast(fromType)) { + if (!failureOk) + Error(pos, "Can't convert function type \"%s\" to \"%s\" for %s.", + fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return false; + } + if (dynamic_cast(toType)) { + if (!failureOk) + Error(pos, "Can't convert from type \"%s\" to function type \"%s\" " + "for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return false; + } + const ArrayType *toArrayType = dynamic_cast(toType); const ArrayType *fromArrayType = dynamic_cast(fromType); const VectorType *toVectorType = dynamic_cast(toType); @@ -504,6 +519,153 @@ TypeConvertExpr(Expr *expr, const Type *toType, const char *errorMsgBase) { } +bool +PossiblyResolveFunctionOverloads(Expr *expr, const Type *type) { + FunctionSymbolExpr *fse = NULL; + const FunctionType *funcType = NULL; + if (dynamic_cast(type) != NULL && + (funcType = dynamic_cast(type->GetBaseType())) && + (fse = dynamic_cast(expr)) != NULL) { + // We're initializing a function pointer with a function symbol, + // which in turn may represent an overloaded function. So we need + // to try to resolve the overload based on the type of the symbol + // we're initializing here. + std::vector paramTypes; + for (int i = 0; i < funcType->GetNumParameters(); ++i) + paramTypes.push_back(funcType->GetParameterType(i)); + + if (fse->ResolveOverloads(expr->pos, paramTypes) == false) + return false; + } + return true; +} + + + +/** Utility routine that emits code to initialize a symbol given an + initializer expression. + + @param lvalue Memory location of storage for the symbol's data + @param symName Name of symbol (used in error messages) + @param symType Type of variable being initialized + @param initExpr Expression for the initializer + @param ctx FunctionEmitContext to use for generating instructions + @param pos Source file position of the variable being initialized +*/ +void +InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr, + FunctionEmitContext *ctx, SourcePos pos) { + if (initExpr == NULL) + // leave it uninitialized + return; + + // If the initializer is a straight up expression that isn't an + // ExprList, then we'll see if we can type convert it to the type of + // the variable. + if (dynamic_cast(initExpr) == NULL) { + if (PossiblyResolveFunctionOverloads(initExpr, symType) == false) + return; + initExpr = TypeConvertExpr(initExpr, symType, "initializer"); + + if (initExpr != NULL) { + llvm::Value *initializerValue = initExpr->GetValue(ctx); + if (initializerValue != NULL) + // Bingo; store the value in the variable's storage + ctx->StoreInst(initializerValue, lvalue); + return; + } + } + + // Atomic types and enums can't be initialized with { ... } initializer + // expressions, so print an error and return if that's what we've got + // here.. 
+ if (dynamic_cast(symType) != NULL || + dynamic_cast(symType) != NULL || + dynamic_cast(symType) != NULL) { + ExprList *elist = dynamic_cast(initExpr); + if (elist != NULL) { + if (elist->exprs.size() == 1) + InitSymbol(lvalue, symType, elist->exprs[0], ctx, pos); + else + Error(initExpr->pos, "Expression list initializers can't be used " + "with type \"%s\".", symType->GetString().c_str()); + } + return; + } + + const ReferenceType *rt = dynamic_cast(symType); + if (rt) { + if (!Type::Equal(initExpr->GetType(), rt)) { + Error(initExpr->pos, "Initializer for reference type \"%s\" must have same " + "reference type itself. \"%s\" is incompatible.", + rt->GetString().c_str(), initExpr->GetType()->GetString().c_str()); + return; + } + + llvm::Value *initializerValue = initExpr->GetValue(ctx); + if (initializerValue) + ctx->StoreInst(initializerValue, lvalue); + return; + } + + // There are two cases for initializing structs, arrays and vectors; + // either a single initializer may be provided (float foo[3] = 0;), in + // which case all of the elements are initialized to the given value, + // or an initializer list may be provided (float foo[3] = { 1,2,3 }), + // in which case the elements are initialized with the corresponding + // values. + const CollectionType *collectionType = + dynamic_cast(symType); + if (collectionType != NULL) { + std::string name; + if (dynamic_cast(symType) != NULL) + name = "struct"; + else if (dynamic_cast(symType) != NULL) + name = "array"; + else if (dynamic_cast(symType) != NULL) + name = "vector"; + else + FATAL("Unexpected CollectionType in InitSymbol()"); + + ExprList *exprList = dynamic_cast(initExpr); + if (exprList != NULL) { + // The { ... } case; make sure we have the same number of + // expressions in the ExprList as we have struct members + int nInits = exprList->exprs.size(); + if (nInits != collectionType->GetElementCount()) { + Error(initExpr->pos, "Initializer for %s type \"%s\" requires " + "%d values; %d provided.", name.c_str(), + symType->GetString().c_str(), + collectionType->GetElementCount(), nInits); + return; + } + + // Initialize each element with the corresponding value from + // the ExprList + for (int i = 0; i < nInits; ++i) { + llvm::Value *ep; + if (dynamic_cast(symType) != NULL) + ep = ctx->AddElementOffset(lvalue, i, NULL, "element"); + else + ep = ctx->GetElementPtrInst(lvalue, LLVMInt32(0), LLVMInt32(i), + PointerType::GetUniform(collectionType->GetElementType(i)), + "gep"); + + InitSymbol(ep, collectionType->GetElementType(i), + exprList->exprs[i], ctx, pos); + } + } + else + Error(initExpr->pos, "Can't assign type \"%s\" to \"%s\".", + initExpr->GetType()->GetString().c_str(), + collectionType->GetString().c_str()); + return; + } + + FATAL("Unexpected Type in InitSymbol()"); +} + + /////////////////////////////////////////////////////////////////////////// /** Given an atomic or vector type, this returns a boolean type with the @@ -1258,13 +1420,275 @@ BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p) } +/** Emit code for a && or || logical operator. In particular, the code + here handles "short-circuit" evaluation, where the second expression + isn't evaluated if the value of the first one determines the value of + the result. 
+*/ +llvm::Value * +lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1, + FunctionEmitContext *ctx, SourcePos pos) { + + const Type *type0 = arg0->GetType(), *type1 = arg1->GetType(); + if (type0 == NULL || type1 == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + + // There is overhead (branches, etc.), to short-circuiting, so if the + // right side of the expression is a) relatively simple, and b) can be + // safely executed with an all-off execution mask, then we just + // evaluate both sides and then the logical operator in that case. + // FIXME: not sure what we should do about vector types here... + bool shortCircuit = (EstimateCost(arg1) > PREDICATE_SAFE_IF_STATEMENT_COST || + SafeToRunWithMaskAllOff(arg1) == false || + dynamic_cast(type0) != NULL || + dynamic_cast(type1) != NULL); + if (shortCircuit == false) { + // If one of the operands is uniform but the other is varying, + // promote the uniform one to varying + if (type0->IsUniformType() && type1->IsVaryingType()) { + arg0 = TypeConvertExpr(arg0, AtomicType::VaryingBool, lOpString(op)); + Assert(arg0 != NULL); + } + if (type1->IsUniformType() && type0->IsVaryingType()) { + arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, lOpString(op)); + Assert(arg1 != NULL); + } + + llvm::Value *value0 = arg0->GetValue(ctx); + llvm::Value *value1 = arg1->GetValue(ctx); + if (value0 == NULL || value1 == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + + if (op == BinaryExpr::LogicalAnd) + return ctx->BinaryOperator(llvm::Instruction::And, value0, value1, + "logical_and"); + else { + Assert(op == BinaryExpr::LogicalOr); + return ctx->BinaryOperator(llvm::Instruction::Or, value0, value1, + "logical_or"); + } + } + + // Allocate temporary storage for the return value + const Type *retType = Type::MoreGeneralType(type0, type1, pos, lOpString(op)); + LLVM_TYPE_CONST llvm::Type *llvmRetType = retType->LLVMType(g->ctx); + llvm::Value *retPtr = ctx->AllocaInst(llvmRetType, "logical_op_mem"); + + llvm::BasicBlock *bbSkipEvalValue1 = ctx->CreateBasicBlock("skip_eval_1"); + llvm::BasicBlock *bbEvalValue1 = ctx->CreateBasicBlock("eval_1"); + llvm::BasicBlock *bbLogicalDone = ctx->CreateBasicBlock("logical_op_done"); + + // Evaluate the first operand + llvm::Value *value0 = arg0->GetValue(ctx); + if (value0 == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + + if (type0->IsUniformType()) { + // Check to see if the value of the first operand is true or false + llvm::Value *value0True = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + value0, LLVMTrue); + + if (op == BinaryExpr::LogicalOr) { + // For ||, if value0 is true, then we skip evaluating value1 + // entirely. + ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, value0True); + + // If value0 is true, the complete result is true (either + // uniform or varying) + ctx->SetCurrentBasicBlock(bbSkipEvalValue1); + llvm::Value *trueValue = retType->IsUniformType() ? LLVMTrue : + LLVMMaskAllOn; + ctx->StoreInst(trueValue, retPtr); + ctx->BranchInst(bbLogicalDone); + } + else { + Assert(op == BinaryExpr::LogicalAnd); + + // Conversely, for &&, if value0 is false, we skip evaluating + // value1. + ctx->BranchInst(bbEvalValue1, bbSkipEvalValue1, value0True); + + // In this case, the complete result is false (again, either a + // uniform or varying false). + ctx->SetCurrentBasicBlock(bbSkipEvalValue1); + llvm::Value *falseValue = retType->IsUniformType() ? 
LLVMFalse :
+                LLVMMaskAllOff;
+            ctx->StoreInst(falseValue, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+        }
+
+        // Both || and && are in the same situation if the first operand's
+        // value didn't resolve the final result: they need to evaluate the
+        // value of the second operand, which in turn gives the value for
+        // the full expression.
+        ctx->SetCurrentBasicBlock(bbEvalValue1);
+        if (type1->IsUniformType() && retType->IsVaryingType()) {
+            arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, "logical op");
+            Assert(arg1 != NULL);
+        }
+
+        llvm::Value *value1 = arg1->GetValue(ctx);
+        if (value1 == NULL) {
+            Assert(m->errorCount > 0);
+            return NULL;
+        }
+        ctx->StoreInst(value1, retPtr);
+        ctx->BranchInst(bbLogicalDone);
+
+        // In all cases, we end up at the bbLogicalDone basic block;
+        // loading the value stored in retPtr in turn gives the overall
+        // result.
+        ctx->SetCurrentBasicBlock(bbLogicalDone);
+        return ctx->LoadInst(retPtr);
+    }
+    else {
+        // Otherwise, the first operand is varying... Save the current
+        // value of the mask so that we can restore it at the end.
+        llvm::Value *oldMask = ctx->GetInternalMask();
+        llvm::Value *oldFullMask = ctx->GetFullMask();
+
+        // Convert the second operand to be varying as well, so that we can
+        // perform logical vector ops with its value.
+        if (type1->IsUniformType()) {
+            arg1 = TypeConvertExpr(arg1, AtomicType::VaryingBool, "logical op");
+            Assert(arg1 != NULL);
+            type1 = arg1->GetType();
+        }
+
+        if (op == BinaryExpr::LogicalOr) {
+            // See if value0 is true for all currently executing
+            // lanes--i.e. if (value0 & mask) == mask. If so, we don't
+            // need to evaluate the second operand of the expression.
+            llvm::Value *value0AndMask =
+                ctx->BinaryOperator(llvm::Instruction::And, value0,
+                                    oldFullMask, "op&mask");
+            llvm::Value *equalsMask =
+                ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
+                             value0AndMask, oldFullMask, "value0&mask==mask");
+            equalsMask = ctx->I1VecToBoolVec(equalsMask);
+            llvm::Value *allMatch = ctx->All(equalsMask);
+            ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, allMatch);
+
+            // value0 is true for all running lanes, so it can be used for
+            // the final result
+            ctx->SetCurrentBasicBlock(bbSkipEvalValue1);
+            ctx->StoreInst(value0, retPtr);
+            ctx->BranchInst(bbLogicalDone);
+
+            // Otherwise, we need to evaluate arg1. However, first we need
+            // to set the execution mask to be (oldMask & ~value0); in other
+            // words, only execute the instances where value0 is false.
+            // For the instances where value0 was true, we need to inhibit
+            // execution.
+ ctx->SetCurrentBasicBlock(bbEvalValue1); + llvm::Value *not0 = ctx->NotOperator(value0); + ctx->SetInternalMaskAnd(oldMask, not0); + + llvm::Value *value1 = arg1->GetValue(ctx); + if (value1 == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + + // We need to compute the result carefully, since vector + // elements that were computed when the corresponding lane was + // disabled have undefined values: + // result = (value0 & old_mask) | (value1 & current_mask) + llvm::Value *value1AndMask = + ctx->BinaryOperator(llvm::Instruction::And, value1, + ctx->GetInternalMask(), "op&mask"); + llvm::Value *result = + ctx->BinaryOperator(llvm::Instruction::Or, value0AndMask, + value1AndMask, "or_result"); + ctx->StoreInst(result, retPtr); + ctx->BranchInst(bbLogicalDone); + } + else { + Assert(op == BinaryExpr::LogicalAnd); + + // If value0 is false for all currently running lanes, the + // overall result must be false: this corresponds to checking + // if (mask & ~value0) == mask. + llvm::Value *notValue0 = ctx->NotOperator(value0, "not_value0"); + llvm::Value *notValue0AndMask = + ctx->BinaryOperator(llvm::Instruction::And, notValue0, + oldFullMask, "not_value0&mask"); + llvm::Value *equalsMask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + notValue0AndMask, oldFullMask, "not_value0&mask==mask"); + equalsMask = ctx->I1VecToBoolVec(equalsMask); + llvm::Value *allMatch = ctx->All(equalsMask); + ctx->BranchInst(bbSkipEvalValue1, bbEvalValue1, allMatch); + + // value0 was false for all running lanes, so use its value as + // the overall result. + ctx->SetCurrentBasicBlock(bbSkipEvalValue1); + ctx->StoreInst(value0, retPtr); + ctx->BranchInst(bbLogicalDone); + + // Otherwise we need to evaluate value1, but again with the + // mask set to only be on for the lanes where value0 was true. + // For the lanes where value0 was false, execution needs to be + // disabled: mask = (mask & value0). + ctx->SetCurrentBasicBlock(bbEvalValue1); + ctx->SetInternalMaskAnd(oldMask, value0); + + llvm::Value *value1 = arg1->GetValue(ctx); + if (value1 == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + + // And as in the || case, we compute the overall result by + // masking off the valid lanes before we AND them together: + // result = (value0 & old_mask) & (value1 & current_mask) + llvm::Value *value0AndMask = + ctx->BinaryOperator(llvm::Instruction::And, value0, + oldFullMask, "op&mask"); + llvm::Value *value1AndMask = + ctx->BinaryOperator(llvm::Instruction::And, value1, + ctx->GetInternalMask(), "value1&mask"); + llvm::Value *result = + ctx->BinaryOperator(llvm::Instruction::And, value0AndMask, + value1AndMask, "or_result"); + ctx->StoreInst(result, retPtr); + ctx->BranchInst(bbLogicalDone); + } + + // And finally we always end up in bbLogicalDone, where we restore + // the old mask and return the computed result + ctx->SetCurrentBasicBlock(bbLogicalDone); + ctx->SetInternalMask(oldMask); + return ctx->LoadInst(retPtr); + } +} + + llvm::Value * BinaryExpr::GetValue(FunctionEmitContext *ctx) const { - if (!arg0 || !arg1) + if (!arg0 || !arg1) { + Assert(m->errorCount > 0); return NULL; + } + + // Handle these specially, since we want to short-circuit their evaluation... 
+ if (op == LogicalAnd || op == LogicalOr) + return lEmitLogicalOp(op, arg0, arg1, ctx, pos); llvm::Value *value0 = arg0->GetValue(ctx); llvm::Value *value1 = arg1->GetValue(ctx); + if (value0 == NULL || value1 == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + ctx->SetDebugPos(pos); switch (op) { @@ -1294,12 +1718,6 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); } - case LogicalAnd: - return ctx->BinaryOperator(llvm::Instruction::And, value0, value1, - "logical_and"); - case LogicalOr: - return ctx->BinaryOperator(llvm::Instruction::Or, value0, value1, - "logical_or"); case Comma: return value1; default: @@ -1447,7 +1865,8 @@ lConstFoldBinLogicalOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *c /** Constant fold binary arithmetic ops. */ template static ConstExpr * -lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { +lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0, + SourcePos pos) { T result[ISPC_MAX_NVEC]; int count = carg0->Count(); @@ -1455,7 +1874,16 @@ lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *car FOLD_OP(BinaryExpr::Add, +); FOLD_OP(BinaryExpr::Sub, -); FOLD_OP(BinaryExpr::Mul, *); - FOLD_OP(BinaryExpr::Div, /); + case BinaryExpr::Div: + for (int i = 0; i < count; ++i) { + if (v1[i] == 0) { + Error(pos, "Division by zero encountered in expression."); + result[i] = 0; + } + else + result[i] = (v0[i] / v1[i]); + } + break; default: return NULL; } @@ -1571,7 +1999,7 @@ BinaryExpr::Optimize() { constArg0->AsFloat(v0); constArg1->AsFloat(v1); ConstExpr *ret; - if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL) return ret; else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) return ret; @@ -1583,7 +2011,7 @@ BinaryExpr::Optimize() { constArg0->AsDouble(v0); constArg1->AsDouble(v1); ConstExpr *ret; - if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL) return ret; else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) return ret; @@ -1595,7 +2023,7 @@ BinaryExpr::Optimize() { constArg0->AsInt32(v0); constArg1->AsInt32(v1); ConstExpr *ret; - if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL) return ret; else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL) return ret; @@ -1610,7 +2038,7 @@ BinaryExpr::Optimize() { constArg0->AsUInt32(v0); constArg1->AsUInt32(v1); ConstExpr *ret; - if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0, pos)) != NULL) return ret; else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL) return ret; @@ -1796,7 +2224,8 @@ BinaryExpr::TypeCheck() { return NULL; } - const Type *promotedType = Type::MoreGeneralType(type0, type1, arg0->pos, + const Type *promotedType = Type::MoreGeneralType(type0, type1, + Union(arg0->pos, arg1->pos), lOpString(op)); if (promotedType == NULL) return NULL; @@ -1859,12 +2288,15 @@ BinaryExpr::TypeCheck() { } case LogicalAnd: case LogicalOr: { - // We need to type convert to a boolean type of the more general - // shape of the two types - bool isUniform = (type0->IsUniformType() && type1->IsUniformType()); - const AtomicType *boolType = 
isUniform ? AtomicType::UniformBool : - AtomicType::VaryingBool; - const Type *destType = NULL; + // For now, we just type convert to boolean types, of the same + // variability as the original types. (When generating code, it's + // useful to have preserved the uniform/varying distinction.) + const AtomicType *boolType0 = type0->IsUniformType() ? + AtomicType::UniformBool : AtomicType::VaryingBool; + const AtomicType *boolType1 = type1->IsUniformType() ? + AtomicType::UniformBool : AtomicType::VaryingBool; + + const Type *destType0 = NULL, *destType1 = NULL; const VectorType *vtype0 = dynamic_cast(type0); const VectorType *vtype1 = dynamic_cast(type1); if (vtype0 && vtype1) { @@ -1874,17 +2306,24 @@ BinaryExpr::TypeCheck() { "different sizes (%d vs. %d).", lOpString(op), sz0, sz1); return NULL; } - destType = new VectorType(boolType, sz0); + destType0 = new VectorType(boolType0, sz0); + destType1 = new VectorType(boolType1, sz1); + } + else if (vtype0 != NULL) { + destType0 = new VectorType(boolType0, vtype0->GetElementCount()); + destType1 = new VectorType(boolType1, vtype0->GetElementCount()); + } + else if (vtype1 != NULL) { + destType0 = new VectorType(boolType0, vtype1->GetElementCount()); + destType1 = new VectorType(boolType1, vtype1->GetElementCount()); + } + else { + destType0 = boolType0; + destType1 = boolType1; } - else if (vtype0) - destType = new VectorType(boolType, vtype0->GetElementCount()); - else if (vtype1) - destType = new VectorType(boolType, vtype1->GetElementCount()); - else - destType = boolType; - arg0 = TypeConvertExpr(arg0, destType, lOpString(op)); - arg1 = TypeConvertExpr(arg1, destType, lOpString(op)); + arg0 = TypeConvertExpr(arg0, destType0, lOpString(op)); + arg1 = TypeConvertExpr(arg1, destType1, lOpString(op)); if (arg0 == NULL || arg1 == NULL) return NULL; return this; @@ -2160,6 +2599,11 @@ AssignExpr::TypeCheck() { } const Type *lhsType = lvalue->GetType(); + if (lhsType == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + if (lhsType->IsConstType()) { Error(lvalue->pos, "Can't assign to type \"%s\" on left-hand side of " "expression.", lhsType->GetString().c_str()); @@ -2199,6 +2643,14 @@ AssignExpr::TypeCheck() { if (rvalue == NULL) return NULL; + if (lhsType->IsFloatType() == true && + (op == ShlAssign || op == ShrAssign || op == AndAssign || + op == XorAssign || op == OrAssign)) { + Error(pos, "Illegal to use %s operator with floating-point " + "operands.", lOpString(op)); + return NULL; + } + // Make sure we're not assigning to a struct that has a constant member const StructType *st = dynamic_cast(lhsType); if (st != NULL && lCheckForConstStructMember(pos, st, st)) @@ -2262,6 +2714,34 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, } +static void +lEmitSelectExprCode(FunctionEmitContext *ctx, llvm::Value *testVal, + llvm::Value *oldMask, llvm::Value *fullMask, + Expr *expr, llvm::Value *exprPtr) { + llvm::BasicBlock *bbEval = ctx->CreateBasicBlock("select_eval_expr"); + llvm::BasicBlock *bbDone = ctx->CreateBasicBlock("select_done"); + + // Check to see if the test was true for any of the currently executing + // program instances. 
+ llvm::Value *testAndFullMask = + ctx->BinaryOperator(llvm::Instruction::And, testVal, fullMask, + "test&mask"); + llvm::Value *anyOn = ctx->Any(testAndFullMask); + ctx->BranchInst(bbEval, bbDone, anyOn); + + ctx->SetCurrentBasicBlock(bbEval); + llvm::Value *testAndMask = + ctx->BinaryOperator(llvm::Instruction::And, testVal, oldMask, + "test&mask"); + ctx->SetInternalMask(testAndMask); + llvm::Value *exprVal = expr->GetValue(ctx); + ctx->StoreInst(exprVal, exprPtr); + ctx->BranchInst(bbDone); + + ctx->SetCurrentBasicBlock(bbDone); +} + + llvm::Value * SelectExpr::GetValue(FunctionEmitContext *ctx) const { if (!expr1 || !expr2 || !test) @@ -2309,18 +2789,58 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const { return ret; } else if (dynamic_cast(testType) == NULL) { - // if the test is a varying bool type, then evaluate both of the - // value expressions with the mask set appropriately and then do an - // element-wise select to get the result + // the test is a varying bool type llvm::Value *testVal = test->GetValue(ctx); Assert(testVal->getType() == LLVMTypes::MaskType); llvm::Value *oldMask = ctx->GetInternalMask(); - ctx->SetInternalMaskAnd(oldMask, testVal); - llvm::Value *expr1Val = expr1->GetValue(ctx); - ctx->SetInternalMaskAndNot(oldMask, testVal); - llvm::Value *expr2Val = expr2->GetValue(ctx); - ctx->SetInternalMask(oldMask); + llvm::Value *fullMask = ctx->GetFullMask(); + // We don't want to incur the overhead for short-circuit evaluation + // for expressions that are both computationally simple and safe to + // run with an "all off" mask. + bool shortCircuit1 = + (::EstimateCost(expr1) > PREDICATE_SAFE_IF_STATEMENT_COST || + SafeToRunWithMaskAllOff(expr1) == false); + bool shortCircuit2 = + (::EstimateCost(expr2) > PREDICATE_SAFE_IF_STATEMENT_COST || + SafeToRunWithMaskAllOff(expr2) == false); + + Debug(expr1->pos, "%sshort circuiting evaluation for select expr", + shortCircuit1 ? "" : "Not "); + Debug(expr2->pos, "%sshort circuiting evaluation for select expr", + shortCircuit2 ? "" : "Not "); + + // Temporary storage to store the values computed for each + // expression, if any. (These stay as uninitialized memory if we + // short circuit around the corresponding expression.) + LLVM_TYPE_CONST llvm::Type *exprType = + expr1->GetType()->LLVMType(g->ctx); + llvm::Value *expr1Ptr = ctx->AllocaInst(exprType); + llvm::Value *expr2Ptr = ctx->AllocaInst(exprType); + + if (shortCircuit1) + lEmitSelectExprCode(ctx, testVal, oldMask, fullMask, expr1, + expr1Ptr); + else { + ctx->SetInternalMaskAnd(oldMask, testVal); + llvm::Value *expr1Val = expr1->GetValue(ctx); + ctx->StoreInst(expr1Val, expr1Ptr); + } + + if (shortCircuit2) { + llvm::Value *notTest = ctx->NotOperator(testVal); + lEmitSelectExprCode(ctx, notTest, oldMask, fullMask, expr2, + expr2Ptr); + } + else { + ctx->SetInternalMaskAndNot(oldMask, testVal); + llvm::Value *expr2Val = expr2->GetValue(ctx); + ctx->StoreInst(expr2Val, expr2Ptr); + } + + ctx->SetInternalMask(oldMask); + llvm::Value *expr1Val = ctx->LoadInst(expr1Ptr); + llvm::Value *expr2Val = ctx->LoadInst(expr2Ptr); return lEmitVaryingSelect(ctx, testVal, expr1Val, expr2Val, type); } else { @@ -2388,7 +2908,27 @@ Expr * SelectExpr::Optimize() { if (test == NULL || expr1 == NULL || expr2 == NULL) return NULL; - return this; + + ConstExpr *constTest = dynamic_cast(test); + if (constTest == NULL) + return this; + + // The test is a constant; see if we can resolve to one of the + // expressions.. 
+ bool bv[ISPC_MAX_NVEC]; + int count = constTest->AsBool(bv); + if (count == 1) + // Uniform test value; return the corresponding expression + return (bv[0] == true) ? expr1 : expr2; + else { + // Varying test: see if all of the values are the same; if so, then + // return the corresponding expression + bool first = bv[0]; + for (int i = 0; i < count; ++i) + if (bv[i] != first) + return this; + return (bv[0] == true) ? expr1 : expr2; + } } @@ -2678,12 +3218,12 @@ FunctionCallExpr::TypeCheck() { const Type *fptrType = func->GetType(); if (fptrType == NULL) return NULL; - - Assert(dynamic_cast(fptrType) != NULL); - const FunctionType *funcType = - dynamic_cast(fptrType->GetBaseType()); - if (funcType == NULL) { - Error(pos, "Must provide function name or function pointer for " + + // Make sure we do in fact have a function to call + const FunctionType *funcType; + if (dynamic_cast(fptrType) == NULL || + (funcType = dynamic_cast(fptrType->GetBaseType())) == NULL) { + Error(func->pos, "Must provide function name or function pointer for " "function call expression."); return NULL; } @@ -3065,8 +3605,10 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const { if (baseValue == NULL || indexValue == NULL) return NULL; ctx->SetDebugPos(pos); - return ctx->GetElementPtrInst(baseValue, indexValue, - baseExprType, "ptr_offset"); + llvm::Value *ptr = ctx->GetElementPtrInst(baseValue, indexValue, + baseExprType, "ptr_offset"); + ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType()); + return ptr; } // Otherwise it's an array or vector @@ -4042,6 +4584,53 @@ ConstExpr::ConstExpr(ConstExpr *old, double *v) } +ConstExpr::ConstExpr(ConstExpr *old, SourcePos p) + : Expr(p) { + type = old->type; + + AtomicType::BasicType basicType = getBasicType(); + + switch (basicType) { + case AtomicType::TYPE_BOOL: + memcpy(boolVal, old->boolVal, Count() * sizeof(bool)); + break; + case AtomicType::TYPE_INT8: + memcpy(int8Val, old->int8Val, Count() * sizeof(int8_t)); + break; + case AtomicType::TYPE_UINT8: + memcpy(uint8Val, old->uint8Val, Count() * sizeof(uint8_t)); + break; + case AtomicType::TYPE_INT16: + memcpy(int16Val, old->int16Val, Count() * sizeof(int16_t)); + break; + case AtomicType::TYPE_UINT16: + memcpy(uint16Val, old->uint16Val, Count() * sizeof(uint16_t)); + break; + case AtomicType::TYPE_INT32: + memcpy(int32Val, old->int32Val, Count() * sizeof(int32_t)); + break; + case AtomicType::TYPE_UINT32: + memcpy(uint32Val, old->uint32Val, Count() * sizeof(uint32_t)); + break; + case AtomicType::TYPE_FLOAT: + memcpy(floatVal, old->floatVal, Count() * sizeof(float)); + break; + case AtomicType::TYPE_DOUBLE: + memcpy(doubleVal, old->doubleVal, Count() * sizeof(double)); + break; + case AtomicType::TYPE_INT64: + memcpy(int64Val, old->int64Val, Count() * sizeof(int64_t)); + break; + case AtomicType::TYPE_UINT64: + memcpy(uint64Val, old->uint64Val, Count() * sizeof(uint64_t)); + break; + default: + FATAL("unimplemented const type"); + } + +} + + AtomicType::BasicType ConstExpr::getBasicType() const { const AtomicType *at = dynamic_cast(type); @@ -5565,28 +6154,15 @@ llvm::Constant * TypeCastExpr::GetConstant(const Type *constType) const { // We don't need to worry about most the basic cases where the type // cast can resolve to a constant here, since the - // TypeCastExpr::Optimize() method ends up doing the type conversion - // and returning a ConstExpr, which in turn will have its GetConstant() - // method called. 
Thus, the only case we do need to worry about here - // is converting a uniform function pointer to a varying function - // pointer of the same type. + // TypeCastExpr::Optimize() method generally ends up doing the type + // conversion and returning a ConstExpr, which in turn will have its + // GetConstant() method called. However, because ConstExpr currently + // can't represent pointer values, we have to handle two cases here: + // 1. Null pointers (NULL, 0) valued initializers, and + // 2. Converting a uniform function pointer to a varying function + // pointer of the same type. Assert(Type::Equal(constType, type)); - const FunctionType *ft = NULL; - if (dynamic_cast(type) == NULL || - (ft = dynamic_cast(type->GetBaseType())) == NULL) - return NULL; - - llvm::Constant *ec = expr->GetConstant(expr->GetType()); - if (ec == NULL) - return NULL; - - ec = llvm::ConstantExpr::getPtrToInt(ec, LLVMTypes::PointerIntType); - - Assert(type->IsVaryingType()); - std::vector smear; - for (int i = 0; i < g->target.vectorWidth; ++i) - smear.push_back(ec); - return llvm::ConstantVector::get(smear); + return expr->GetConstant(constType); } @@ -5744,8 +6320,18 @@ DereferenceExpr::GetType() const { Expr * DereferenceExpr::TypeCheck() { - if (expr == NULL) + if (expr == NULL) { + Assert(m->errorCount > 0); return NULL; + } + + if (dynamic_cast(expr->GetType()) == NULL && + dynamic_cast(expr->GetType()) == NULL) { + Error(pos, "Illegal to dereference non-pointer or reference " + "type \"%s\".", expr->GetType()->GetString().c_str()); + return NULL; + } + return this; } @@ -5985,7 +6571,7 @@ SymbolExpr::Optimize() { return NULL; else if (symbol->constValue != NULL) { Assert(GetType()->IsConstType()); - return symbol->constValue; + return new ConstExpr(symbol->constValue, pos); } else return this; @@ -6081,13 +6667,30 @@ FunctionSymbolExpr::Print() const { llvm::Constant * FunctionSymbolExpr::GetConstant(const Type *type) const { - Assert(type->IsUniformType()); - Assert(GetType()->IsUniformType()); - - if (Type::EqualIgnoringConst(type, GetType()) == false) + if (matchingFunc == NULL || matchingFunc->function == NULL) return NULL; - return matchingFunc ? 
matchingFunc->function : NULL; + const FunctionType *ft; + if (dynamic_cast(type) == NULL || + (ft = dynamic_cast(type->GetBaseType())) == NULL) + return NULL; + + LLVM_TYPE_CONST llvm::Type *llvmUnifType = + type->GetAsUniformType()->LLVMType(g->ctx); + if (llvmUnifType != matchingFunc->function->getType()) + return NULL; + + if (type->IsUniformType()) + return matchingFunc->function; + else { + llvm::Constant *intPtr = + llvm::ConstantExpr::getPtrToInt(matchingFunc->function, + LLVMTypes::PointerIntType); + std::vector smear; + for (int i = 0; i < g->target.vectorWidth; ++i) + smear.push_back(intPtr); + return llvm::ConstantVector::get(smear); + } } @@ -6513,6 +7116,22 @@ NullPointerExpr::Optimize() { } +llvm::Constant * +NullPointerExpr::GetConstant(const Type *type) const { + const PointerType *pt = dynamic_cast(type); + if (pt == NULL) + return NULL; + + LLVM_TYPE_CONST llvm::Type *llvmType = type->LLVMType(g->ctx); + if (llvmType == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + + return llvm::Constant::getNullValue(llvmType); +} + + void NullPointerExpr::Print() const { printf("NULL"); @@ -6525,3 +7144,211 @@ NullPointerExpr::EstimateCost() const { return 0; } + +/////////////////////////////////////////////////////////////////////////// +// NewExpr + +NewExpr::NewExpr(int typeQual, const Type *t, Expr *init, Expr *count, + SourcePos tqPos, SourcePos p) + : Expr(p) { + allocType = t; + if (allocType != NULL && allocType->HasUnboundVariability()) + allocType = allocType->ResolveUnboundVariability(Type::Varying); + + initExpr = init; + countExpr = count; + + /* (The below cases actually should be impossible, since the parser + doesn't allow more than a single type qualifier before a "new".) */ + if ((typeQual & ~(TYPEQUAL_UNIFORM | TYPEQUAL_VARYING)) != 0) { + Error(tqPos, "Illegal type qualifiers in \"new\" expression (only " + "\"uniform\" and \"varying\" are allowed."); + isVarying = false; + } + else if ((typeQual & TYPEQUAL_UNIFORM) != 0 && + (typeQual & TYPEQUAL_VARYING) != 0) { + Error(tqPos, "Illegal to provide both \"uniform\" and \"varying\" " + "qualifiers to \"new\" expression."); + isVarying = false; + } + else + // If no type qualifier is given before the 'new', treat it as a + // varying new. + isVarying = (typeQual == 0) || (typeQual & TYPEQUAL_VARYING); +} + + +llvm::Value * +NewExpr::GetValue(FunctionEmitContext *ctx) const { + bool do32Bit = (g->target.is32Bit || g->opt.force32BitAddressing); + + // Determine how many elements we need to allocate. Note that this + // will be a varying value if this is a varying new. + llvm::Value *countValue; + if (countExpr != NULL) { + countValue = countExpr->GetValue(ctx); + if (countValue == NULL) { + Assert(m->errorCount > 0); + return NULL; + } + } + else { + if (isVarying) { + if (do32Bit) countValue = LLVMInt32Vector(1); + else countValue = LLVMInt64Vector(1); + } + else { + if (do32Bit) countValue = LLVMInt32(1); + else countValue = LLVMInt64(1); + } + } + + // Compute the total amount of memory to allocate, allocSize, as the + // product of the number of elements to allocate and the size of a + // single element. 
+    llvm::Value *eltSize = g->target.SizeOf(allocType->LLVMType(g->ctx),
+                                            ctx->GetCurrentBasicBlock());
+    if (isVarying)
+        eltSize = ctx->SmearUniform(eltSize, "smear_size");
+    llvm::Value *allocSize = ctx->BinaryOperator(llvm::Instruction::Mul, countValue,
+                                                 eltSize, "alloc_size");
+
+    // Determine which allocation builtin function to call: uniform or
+    // varying, and taking 32-bit or 64-bit allocation counts.
+    llvm::Function *func;
+    if (isVarying) {
+        if (do32Bit)
+            func = m->module->getFunction("__new_varying32");
+        else
+            func = m->module->getFunction("__new_varying64");
+    }
+    else {
+        if (allocSize->getType() != LLVMTypes::Int64Type)
+            allocSize = ctx->SExtInst(allocSize, LLVMTypes::Int64Type,
+                                      "alloc_size64");
+        func = m->module->getFunction("__new_uniform");
+    }
+    Assert(func != NULL);
+
+    // Make the call for the actual allocation.
+    llvm::Value *ptrValue = ctx->CallInst(func, NULL, allocSize, "new");
+
+    // Now handle initializers and return the right type for the result.
+    const Type *retType = GetType();
+    if (retType == NULL)
+        return NULL;
+    if (isVarying) {
+        if (g->target.is32Bit)
+            // Convert i64 vector values to i32 if we are compiling to a
+            // 32-bit target.
+            ptrValue = ctx->TruncInst(ptrValue, LLVMTypes::VoidPointerVectorType,
+                                      "ptr_to_32bit");
+
+        if (initExpr != NULL) {
+            // If we have an initializer expression, emit code that checks
+            // to see if each lane is active and if so, runs the code to do
+            // the initialization.  Note that we're taking advantage
+            // of the fact that the __new_varying*() functions are
+            // implemented to return NULL for program instances that aren't
+            // executing; more generally, we should be using the current
+            // execution mask for this...
+            for (int i = 0; i < g->target.vectorWidth; ++i) {
+                llvm::BasicBlock *bbInit = ctx->CreateBasicBlock("init_ptr");
+                llvm::BasicBlock *bbSkip = ctx->CreateBasicBlock("skip_init");
+                llvm::Value *p = ctx->ExtractInst(ptrValue, i);
+                llvm::Value *nullValue = g->target.is32Bit ? LLVMInt32(0) :
+                    LLVMInt64(0);
+                // Is the pointer for the current lane non-zero?
+                llvm::Value *nonNull = ctx->CmpInst(llvm::Instruction::ICmp,
+                                                    llvm::CmpInst::ICMP_NE,
+                                                    p, nullValue, "non_null");
+                ctx->BranchInst(bbInit, bbSkip, nonNull);
+
+                // Initialize the memory pointed to by the pointer for the
+                // current lane.
+                ctx->SetCurrentBasicBlock(bbInit);
+                LLVM_TYPE_CONST llvm::Type *ptrType =
+                    retType->GetAsUniformType()->LLVMType(g->ctx);
+                llvm::Value *ptr = ctx->IntToPtrInst(p, ptrType);
+                InitSymbol(ptr, allocType, initExpr, ctx, pos);
+                ctx->BranchInst(bbSkip);
+
+                ctx->SetCurrentBasicBlock(bbSkip);
+            }
+        }
+
+        return ptrValue;
+    }
+    else {
+        // For uniform news, we just need to cast the void * to be a
+        // pointer of the return type and to run the code for initializers,
+        // if present.
+        LLVM_TYPE_CONST llvm::Type *ptrType = retType->LLVMType(g->ctx);
+        ptrValue = ctx->BitCastInst(ptrValue, ptrType, "cast_new_ptr");
+
+        if (initExpr != NULL)
+            InitSymbol(ptrValue, allocType, initExpr, ctx, pos);
+
+        return ptrValue;
+    }
+}
+
+
+const Type *
+NewExpr::GetType() const {
+    if (allocType == NULL)
+        return NULL;
+
+    return isVarying ? PointerType::GetVarying(allocType) :
+        PointerType::GetUniform(allocType);
+}
+
+
+Expr *
+NewExpr::TypeCheck() {
+    // Here we only need to make sure that, if we have an expression giving
+    // the number of elements to allocate, it can be converted to an
+    // integer of the appropriate variability.
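+    // (For example, an illustrative note not in the original source:
+    // "uniform new int[n]" requires a uniform count "n", while a plain
+    // "new int[n]" on a 64-bit target converts "n" to a varying uint64
+    // count via the conversion below.)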
+    if (countExpr == NULL)
+        return this;
+
+    const Type *countType;
+    if ((countType = countExpr->GetType()) == NULL)
+        return NULL;
+
+    if (isVarying == false && countType->IsVaryingType()) {
+        Error(pos, "Illegal to provide \"varying\" allocation count with "
+              "\"uniform new\" expression.");
+        return NULL;
+    }
+
+    // Figure out the type that the allocation count should be
+    const Type *t = (g->target.is32Bit || g->opt.force32BitAddressing) ?
+        AtomicType::UniformUInt32 : AtomicType::UniformUInt64;
+    if (isVarying)
+        t = t->GetAsVaryingType();
+
+    countExpr = TypeConvertExpr(countExpr, t, "item count");
+    if (countExpr == NULL)
+        return NULL;
+
+    return this;
+}
+
+
+Expr *
+NewExpr::Optimize() {
+    return this;
+}
+
+
+void
+NewExpr::Print() const {
+    printf("new (%s)", allocType ? allocType->GetString().c_str() : "NULL");
+}
+
+
+int
+NewExpr::EstimateCost() const {
+    return COST_NEW;
+}
diff --git a/expr.h b/expr.h
index 48388475..70224a7f 100644
--- a/expr.h
+++ b/expr.h
@@ -388,6 +388,10 @@ public:
         with values given by the "values" parameter. */
     ConstExpr(ConstExpr *old, double *values);
 
+    /** Create ConstExpr with the same type and values as the given one,
+        but at the given position. */
+    ConstExpr(ConstExpr *old, SourcePos pos);
+
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     const Type *GetType() const;
     void Print() const;
@@ -680,11 +684,44 @@ public:
     const Type *GetType() const;
     Expr *TypeCheck();
     Expr *Optimize();
+    llvm::Constant *GetConstant(const Type *type) const;
     void Print() const;
     int EstimateCost() const;
 };
 
 
+/** An expression representing a dynamic memory allocation performed with
+    the "new" operator.
+*/
+class NewExpr : public Expr {
+public:
+    NewExpr(int typeQual, const Type *type, Expr *initializer, Expr *count,
+            SourcePos tqPos, SourcePos p);
+
+    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
+    const Type *GetType() const;
+    Expr *TypeCheck();
+    Expr *Optimize();
+    void Print() const;
+    int EstimateCost() const;
+
+    /** Type of object to allocate storage for. */
+    const Type *allocType;
+    /** Expression giving the number of elements to allocate, when the
+        "new Foo[expr]" form is used.  This may be NULL, in which case a
+        single element of the given type will be allocated. */
+    Expr *countExpr;
+    /** Optional initializer expression used to initialize the allocated
+        memory. */
+    Expr *initExpr;
+    /** Indicates whether this is a "varying new" or "uniform new"
+        (i.e. whether a separate allocation is performed per program
+        instance, or whether a single allocation is performed for the
+        entire gang of program instances). */
+    bool isVarying;
+};
+
+
 /** This function indicates whether it's legal to convert from fromType to
     toType.  If the optional errorMsgBase and source position parameters
     are provided, then an error message is issued if the type conversion
@@ -703,4 +740,20 @@ bool CanConvertTypes(const Type *fromType, const Type *toType,
 */
 Expr *TypeConvertExpr(Expr *expr, const Type *toType,
                       const char *errorMsgBase);
+/** Utility routine that emits code to initialize a symbol given an
+    initializer expression.
+
+    @param lvalue    Memory location of storage for the symbol's data
+    @param symType   Type of variable being initialized
+    @param initExpr  Expression for the initializer
+    @param ctx       FunctionEmitContext to use for generating instructions
+    @param pos       Source file position of the variable being initialized
+*/
+void
+InitSymbol(llvm::Value *lvalue, const Type *symType, Expr *initExpr,
+           FunctionEmitContext *ctx, SourcePos pos);
+
+bool PossiblyResolveFunctionOverloads(Expr *expr, const Type *type);
+
 #endif // ISPC_EXPR_H
diff --git a/ispc.cpp b/ispc.cpp
index 7fbc5bc6..6729da92 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -185,6 +185,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
         t->allOffMaskIsSafe = true;
         t->maskBitCount = 1;
     }
+    else if (!strcasecmp(isa, "generic-1")) {
+        t->isa = Target::GENERIC;
+        t->nativeVectorWidth = 1;
+        t->vectorWidth = 1;
+        t->maskingIsFree = false;
+        t->allOffMaskIsSafe = false;
+        t->maskBitCount = 32;
+    }
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
     else if (!strcasecmp(isa, "avx")) {
         t->isa = Target::AVX;
@@ -270,7 +278,7 @@ Target::SupportedTargetISAs() {
 #ifdef LLVM_3_1svn
         ", avx2, avx2-x2"
 #endif // LLVM_3_1svn
-        ", generic-4, generic-8, generic-16";
+        ", generic-4, generic-8, generic-16, generic-1";
 }
 
 
@@ -502,12 +510,15 @@ Globals::Globals() {
     debugPrint = false;
     disableWarnings = false;
     warningsAsErrors = false;
+    quiet = false;
     disableLineWrap = false;
     emitPerfWarnings = true;
     emitInstrumentation = false;
     generateDebuggingSymbols = false;
+    enableFuzzTest = false;
+    fuzzTestSeed = -1;
     mangleFunctionsWithTarget = false;
-
+
     ctx = new llvm::LLVMContext;
 
 #ifdef ISPC_IS_WINDOWS
diff --git a/ispc.h b/ispc.h
index 009470e2..59c9140f 100644
--- a/ispc.h
+++ b/ispc.h
@@ -388,6 +388,9 @@ struct Globals {
         possible performance pitfalls. */
     bool emitPerfWarnings;
 
+    /** Indicates whether all printed output should be suppressed. */
+    bool quiet;
+
     /** Indicates whether calls should be emitted in the program to an
         externally-defined program instrumentation function.  (See the
         "Instrumenting your ispc programs" section in the user's
@@ -402,6 +405,14 @@ struct Globals {
         vector width to them. */
     bool mangleFunctionsWithTarget;
 
+    /** If enabled, the lexer will randomly replace some tokens returned
+        with other tokens, in order to test error condition handling in the
+        compiler. */
+    bool enableFuzzTest;
+
+    /** Seed for random number generator used for fuzz testing. */
+    int fuzzTestSeed;
+
     /** Global LLVMContext object */
     llvm::LLVMContext *ctx;
 
@@ -412,12 +423,17 @@ struct Globals {
     /** Arguments to pass along to the C pre-processor, if it is run on
         the program before compilation. */
     std::vector cppArgs;
+
+    /** Additional user-provided directories to search when processing
+        #include directives in the preprocessor.
*/ + std::vector includePath; }; enum { COST_ASSIGN = 1, COST_COHERENT_BREAK_CONTINE = 4, COST_COMPLEX_ARITH_OP = 4, + COST_DELETE = 32, COST_DEREF = 4, COST_FUNCALL = 4, COST_FUNPTR_UNIFORM = 12, @@ -425,6 +441,7 @@ enum { COST_GATHER = 8, COST_GOTO = 4, COST_LOAD = 2, + COST_NEW = 32, COST_REGULAR_BREAK_CONTINUE = 2, COST_RETURN = 4, COST_SELECT = 4, diff --git a/ispc.vcxproj b/ispc.vcxproj index 38457518..6971ce9a 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -25,6 +25,7 @@ + @@ -211,6 +212,19 @@ Building gen-bitcode-avx2-x2.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll > gen-bitcode-generic-1.cpp + gen-bitcode-generic-1.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-1.ll | python bitcode2cpp.py builtins\target-generic-1.ll > gen-bitcode-generic-1.cpp + gen-bitcode-generic-1.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-1.cpp + Building gen-bitcode-generic-1.cpp + + Document diff --git a/lex.ll b/lex.ll index 1b6d382b..678e9179 100644 --- a/lex.ll +++ b/lex.ll @@ -50,20 +50,275 @@ static void lStringConst(YYSTYPE *, SourcePos *); static double lParseHexFloat(const char *ptr); #define YY_USER_ACTION \ - yylloc->first_line = yylloc->last_line; \ - yylloc->first_column = yylloc->last_column; \ - yylloc->last_column += yyleng; + yylloc.first_line = yylloc.last_line; \ + yylloc.first_column = yylloc.last_column; \ + yylloc.last_column += yyleng; #ifdef ISPC_IS_WINDOWS inline int isatty(int) { return 0; } #endif // ISPC_IS_WINDOWS +static int allTokens[] = { + TOKEN_ASSERT, TOKEN_BOOL, TOKEN_BREAK, TOKEN_CASE, TOKEN_CBREAK, + TOKEN_CCONTINUE, TOKEN_CDO, TOKEN_CFOR, TOKEN_CIF, TOKEN_CWHILE, + TOKEN_CONST, TOKEN_CONTINUE, TOKEN_CRETURN, TOKEN_DEFAULT, TOKEN_DO, + TOKEN_DELETE, TOKEN_DOUBLE, TOKEN_ELSE, TOKEN_ENUM, + TOKEN_EXPORT, TOKEN_EXTERN, TOKEN_FALSE, TOKEN_FLOAT, TOKEN_FOR, + TOKEN_FOREACH, TOKEN_FOREACH_TILED, TOKEN_GOTO, TOKEN_IF, TOKEN_INLINE, + TOKEN_INT, TOKEN_INT8, TOKEN_INT16, TOKEN_INT, TOKEN_INT64, TOKEN_LAUNCH, + TOKEN_NEW, TOKEN_NULL, TOKEN_PRINT, TOKEN_RETURN, TOKEN_SOA, TOKEN_SIGNED, + TOKEN_SIZEOF, TOKEN_STATIC, TOKEN_STRUCT, TOKEN_SWITCH, TOKEN_SYNC, + TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNSIGNED, + TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, + TOKEN_DOTDOTDOT, + TOKEN_FLOAT_CONSTANT, + TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, + TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, + TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, + TOKEN_GE_OP, TOKEN_EQ_OP, TOKEN_NE_OP, TOKEN_AND_OP, TOKEN_OR_OP, + TOKEN_MUL_ASSIGN, TOKEN_DIV_ASSIGN, TOKEN_MOD_ASSIGN, TOKEN_ADD_ASSIGN, + TOKEN_SUB_ASSIGN, TOKEN_LEFT_ASSIGN, TOKEN_RIGHT_ASSIGN, TOKEN_AND_ASSIGN, + TOKEN_XOR_ASSIGN, TOKEN_OR_ASSIGN, TOKEN_PTR_OP, + ';', '{', '}', ',', ':', '=', '(', ')', '[', ']', '.', '&', '!', '~', '-', + '+', '*', '/', '%', '<', '>', '^', '|', '?', +}; + +std::map tokenToName; +std::map tokenNameRemap; + +void ParserInit() { + tokenToName[TOKEN_ASSERT] = "assert"; + tokenToName[TOKEN_BOOL] = "bool"; + tokenToName[TOKEN_BREAK] = "break"; + tokenToName[TOKEN_CASE] = "case"; + tokenToName[TOKEN_CBREAK] = "cbreak"; + tokenToName[TOKEN_CCONTINUE] = "ccontinue"; + tokenToName[TOKEN_CDO] = "cdo"; + tokenToName[TOKEN_CFOR] = "cfor"; + tokenToName[TOKEN_CIF] = "cif"; + tokenToName[TOKEN_CWHILE] = "cwhile"; + 
tokenToName[TOKEN_CONST] = "const"; + tokenToName[TOKEN_CONTINUE] = "continue"; + tokenToName[TOKEN_CRETURN] = "creturn"; + tokenToName[TOKEN_DEFAULT] = "default"; + tokenToName[TOKEN_DO] = "do"; + tokenToName[TOKEN_DELETE] = "delete"; + tokenToName[TOKEN_DOUBLE] = "double"; + tokenToName[TOKEN_ELSE] = "else"; + tokenToName[TOKEN_ENUM] = "enum"; + tokenToName[TOKEN_EXPORT] = "export"; + tokenToName[TOKEN_EXTERN] = "extern"; + tokenToName[TOKEN_FALSE] = "false"; + tokenToName[TOKEN_FLOAT] = "float"; + tokenToName[TOKEN_FOR] = "for"; + tokenToName[TOKEN_FOREACH] = "foreach"; + tokenToName[TOKEN_FOREACH_TILED] = "foreach_tiled"; + tokenToName[TOKEN_GOTO] = "goto"; + tokenToName[TOKEN_IF] = "if"; + tokenToName[TOKEN_INLINE] = "inline"; + tokenToName[TOKEN_INT] = "int"; + tokenToName[TOKEN_INT8] = "int8"; + tokenToName[TOKEN_INT16] = "int16"; + tokenToName[TOKEN_INT] = "int"; + tokenToName[TOKEN_INT64] = "int64"; + tokenToName[TOKEN_LAUNCH] = "launch"; + tokenToName[TOKEN_NEW] = "new"; + tokenToName[TOKEN_NULL] = "NULL"; + tokenToName[TOKEN_PRINT] = "print"; + tokenToName[TOKEN_RETURN] = "return"; + tokenToName[TOKEN_SOA] = "soa"; + tokenToName[TOKEN_SIGNED] = "signed"; + tokenToName[TOKEN_SIZEOF] = "sizeof"; + tokenToName[TOKEN_STATIC] = "static"; + tokenToName[TOKEN_STRUCT] = "struct"; + tokenToName[TOKEN_SWITCH] = "switch"; + tokenToName[TOKEN_SYNC] = "sync"; + tokenToName[TOKEN_TASK] = "task"; + tokenToName[TOKEN_TRUE] = "true"; + tokenToName[TOKEN_TYPEDEF] = "typedef"; + tokenToName[TOKEN_UNIFORM] = "uniform"; + tokenToName[TOKEN_UNSIGNED] = "unsigned"; + tokenToName[TOKEN_VARYING] = "varying"; + tokenToName[TOKEN_VOID] = "void"; + tokenToName[TOKEN_WHILE] = "while"; + tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; + tokenToName[TOKEN_DOTDOTDOT] = "..."; + tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; + tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; + tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; + tokenToName[TOKEN_UINT64_CONSTANT] = "TOKEN_UINT64_CONSTANT"; + tokenToName[TOKEN_INC_OP] = "++"; + tokenToName[TOKEN_DEC_OP] = "--"; + tokenToName[TOKEN_LEFT_OP] = "<<"; + tokenToName[TOKEN_RIGHT_OP] = ">>"; + tokenToName[TOKEN_LE_OP] = "<="; + tokenToName[TOKEN_GE_OP] = ">="; + tokenToName[TOKEN_EQ_OP] = "=="; + tokenToName[TOKEN_NE_OP] = "!="; + tokenToName[TOKEN_AND_OP] = "&&"; + tokenToName[TOKEN_OR_OP] = "||"; + tokenToName[TOKEN_MUL_ASSIGN] = "*="; + tokenToName[TOKEN_DIV_ASSIGN] = "/="; + tokenToName[TOKEN_MOD_ASSIGN] = "%="; + tokenToName[TOKEN_ADD_ASSIGN] = "+="; + tokenToName[TOKEN_SUB_ASSIGN] = "-="; + tokenToName[TOKEN_LEFT_ASSIGN] = "<<="; + tokenToName[TOKEN_RIGHT_ASSIGN] = ">>="; + tokenToName[TOKEN_AND_ASSIGN] = "&="; + tokenToName[TOKEN_XOR_ASSIGN] = "^="; + tokenToName[TOKEN_OR_ASSIGN] = "|="; + tokenToName[TOKEN_PTR_OP] = "->"; + tokenToName[';'] = ";"; + tokenToName['{'] = "{"; + tokenToName['}'] = "}"; + tokenToName[','] = ","; + tokenToName[':'] = ":"; + tokenToName['='] = "="; + tokenToName['('] = "("; + tokenToName[')'] = ")"; + tokenToName['['] = "["; + tokenToName[']'] = "]"; + tokenToName['.'] = "."; + tokenToName['&'] = "&"; + tokenToName['!'] = "!"; + tokenToName['~'] = "~"; + tokenToName['-'] = "-"; + tokenToName['+'] = "+"; + tokenToName['*'] = "*"; + tokenToName['/'] = "/"; + tokenToName['%'] = "%"; + tokenToName['<'] = "<"; + tokenToName['>'] = ">"; + tokenToName['^'] = "^"; + tokenToName['|'] = "|"; + tokenToName['?'] = "?"; + tokenToName[';'] = ";"; + + 
tokenNameRemap["TOKEN_ASSERT"] = "\'assert\'"; + tokenNameRemap["TOKEN_BOOL"] = "\'bool\'"; + tokenNameRemap["TOKEN_BREAK"] = "\'break\'"; + tokenNameRemap["TOKEN_CASE"] = "\'case\'"; + tokenNameRemap["TOKEN_CBREAK"] = "\'cbreak\'"; + tokenNameRemap["TOKEN_CCONTINUE"] = "\'ccontinue\'"; + tokenNameRemap["TOKEN_CDO"] = "\'cdo\'"; + tokenNameRemap["TOKEN_CFOR"] = "\'cfor\'"; + tokenNameRemap["TOKEN_CIF"] = "\'cif\'"; + tokenNameRemap["TOKEN_CWHILE"] = "\'cwhile\'"; + tokenNameRemap["TOKEN_CONST"] = "\'const\'"; + tokenNameRemap["TOKEN_CONTINUE"] = "\'continue\'"; + tokenNameRemap["TOKEN_CRETURN"] = "\'creturn\'"; + tokenNameRemap["TOKEN_DEFAULT"] = "\'default\'"; + tokenNameRemap["TOKEN_DO"] = "\'do\'"; + tokenNameRemap["TOKEN_DELETE"] = "\'delete\'"; + tokenNameRemap["TOKEN_DOUBLE"] = "\'double\'"; + tokenNameRemap["TOKEN_ELSE"] = "\'else\'"; + tokenNameRemap["TOKEN_ENUM"] = "\'enum\'"; + tokenNameRemap["TOKEN_EXPORT"] = "\'export\'"; + tokenNameRemap["TOKEN_EXTERN"] = "\'extern\'"; + tokenNameRemap["TOKEN_FALSE"] = "\'false\'"; + tokenNameRemap["TOKEN_FLOAT"] = "\'float\'"; + tokenNameRemap["TOKEN_FOR"] = "\'for\'"; + tokenNameRemap["TOKEN_FOREACH"] = "\'foreach\'"; + tokenNameRemap["TOKEN_FOREACH_TILED"] = "\'foreach_tiled\'"; + tokenNameRemap["TOKEN_GOTO"] = "\'goto\'"; + tokenNameRemap["TOKEN_IDENTIFIER"] = "identifier"; + tokenNameRemap["TOKEN_IF"] = "\'if\'"; + tokenNameRemap["TOKEN_INLINE"] = "\'inline\'"; + tokenNameRemap["TOKEN_INT"] = "\'int\'"; + tokenNameRemap["TOKEN_INT8"] = "\'int8\'"; + tokenNameRemap["TOKEN_INT16"] = "\'int16\'"; + tokenNameRemap["TOKEN_INT"] = "\'int\'"; + tokenNameRemap["TOKEN_INT64"] = "\'int64\'"; + tokenNameRemap["TOKEN_LAUNCH"] = "\'launch\'"; + tokenNameRemap["TOKEN_NEW"] = "\'new\'"; + tokenNameRemap["TOKEN_NULL"] = "\'NULL\'"; + tokenNameRemap["TOKEN_PRINT"] = "\'print\'"; + tokenNameRemap["TOKEN_RETURN"] = "\'return\'"; + tokenNameRemap["TOKEN_SOA"] = "\'soa\'"; + tokenNameRemap["TOKEN_SIGNED"] = "\'signed\'"; + tokenNameRemap["TOKEN_SIZEOF"] = "\'sizeof\'"; + tokenNameRemap["TOKEN_STATIC"] = "\'static\'"; + tokenNameRemap["TOKEN_STRUCT"] = "\'struct\'"; + tokenNameRemap["TOKEN_SWITCH"] = "\'switch\'"; + tokenNameRemap["TOKEN_SYNC"] = "\'sync\'"; + tokenNameRemap["TOKEN_TASK"] = "\'task\'"; + tokenNameRemap["TOKEN_TRUE"] = "\'true\'"; + tokenNameRemap["TOKEN_TYPEDEF"] = "\'typedef\'"; + tokenNameRemap["TOKEN_UNIFORM"] = "\'uniform\'"; + tokenNameRemap["TOKEN_UNSIGNED"] = "\'unsigned\'"; + tokenNameRemap["TOKEN_VARYING"] = "\'varying\'"; + tokenNameRemap["TOKEN_VOID"] = "\'void\'"; + tokenNameRemap["TOKEN_WHILE"] = "\'while\'"; + tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; + tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; + tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; + tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; + tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; + tokenNameRemap["TOKEN_UINT64_CONSTANT"] = "unsigned int64 constant"; + tokenNameRemap["TOKEN_INC_OP"] = "\'++\'"; + tokenNameRemap["TOKEN_DEC_OP"] = "\'--\'"; + tokenNameRemap["TOKEN_LEFT_OP"] = "\'<<\'"; + tokenNameRemap["TOKEN_RIGHT_OP"] = "\'>>\'"; + tokenNameRemap["TOKEN_LE_OP"] = "\'<=\'"; + tokenNameRemap["TOKEN_GE_OP"] = "\'>=\'"; + tokenNameRemap["TOKEN_EQ_OP"] = "\'==\'"; + tokenNameRemap["TOKEN_NE_OP"] = "\'!=\'"; + tokenNameRemap["TOKEN_AND_OP"] = "\'&&\'"; + tokenNameRemap["TOKEN_OR_OP"] = "\'||\'"; + tokenNameRemap["TOKEN_MUL_ASSIGN"] = "\'*=\'"; + 
tokenNameRemap["TOKEN_DIV_ASSIGN"] = "\'/=\'"; + tokenNameRemap["TOKEN_MOD_ASSIGN"] = "\'%=\'"; + tokenNameRemap["TOKEN_ADD_ASSIGN"] = "\'+=\'"; + tokenNameRemap["TOKEN_SUB_ASSIGN"] = "\'-=\'"; + tokenNameRemap["TOKEN_LEFT_ASSIGN"] = "\'<<=\'"; + tokenNameRemap["TOKEN_RIGHT_ASSIGN"] = "\'>>=\'"; + tokenNameRemap["TOKEN_AND_ASSIGN"] = "\'&=\'"; + tokenNameRemap["TOKEN_XOR_ASSIGN"] = "\'^=\'"; + tokenNameRemap["TOKEN_OR_ASSIGN"] = "\'|=\'"; + tokenNameRemap["TOKEN_PTR_OP"] = "\'->\'"; + tokenNameRemap["$end"] = "end of file"; +} + + +inline int ispcRand() { +#ifdef ISPC_IS_WINDOWS + return rand(); +#else + return lrand48(); +#endif +} + +#define RT \ + if (g->enableFuzzTest) { \ + int r = ispcRand() % 40; \ + if (r == 0) { \ + Warning(yylloc, "Fuzz test dropping token"); \ + } \ + else if (r == 1) { \ + Assert (tokenToName.size() > 0); \ + int nt = sizeof(allTokens) / sizeof(allTokens[0]); \ + int tn = ispcRand() % nt; \ + yylval.stringVal = new std::string(yytext); /* just in case */\ + Warning(yylloc, "Fuzz test replaced token with \"%s\"", tokenToName[allTokens[tn]].c_str()); \ + return allTokens[tn]; \ + } \ + else if (r == 2) { \ + Symbol *sym = m->symbolTable->RandomSymbol(); \ + if (sym != NULL) { \ + yylval.stringVal = new std::string(sym->name); \ + Warning(yylloc, "Fuzz test replaced with identifier \"%s\".", sym->name.c_str()); \ + return TOKEN_IDENTIFIER; \ + } \ + } \ + /* TOKEN_TYPE_NAME */ \ + } else /* swallow semicolon */ + %} %option nounput %option noyywrap -%option bison-bridge -%option bison-locations %option nounistd WHITESPACE [ \t\r]+ @@ -75,73 +330,77 @@ IDENT [a-zA-Z_][a-zA-Z_0-9]* ZO_SWIZZLE ([01]+[w-z]+)+|([01]+[rgba]+)+|([01]+[uv]+)+ %% -"/*" { lCComment(yylloc); } -"//" { lCppComment(yylloc); } +"/*" { lCComment(&yylloc); } +"//" { lCppComment(&yylloc); } -__assert { return TOKEN_ASSERT; } -bool { return TOKEN_BOOL; } -break { return TOKEN_BREAK; } -case { return TOKEN_CASE; } -cbreak { return TOKEN_CBREAK; } -ccontinue { return TOKEN_CCONTINUE; } -cdo { return TOKEN_CDO; } -cfor { return TOKEN_CFOR; } -cif { return TOKEN_CIF; } -cwhile { return TOKEN_CWHILE; } -const { return TOKEN_CONST; } -continue { return TOKEN_CONTINUE; } -creturn { return TOKEN_CRETURN; } -default { return TOKEN_DEFAULT; } -do { return TOKEN_DO; } -double { return TOKEN_DOUBLE; } -else { return TOKEN_ELSE; } -enum { return TOKEN_ENUM; } -export { return TOKEN_EXPORT; } -extern { return TOKEN_EXTERN; } -false { return TOKEN_FALSE; } -float { return TOKEN_FLOAT; } -for { return TOKEN_FOR; } -foreach { return TOKEN_FOREACH; } -foreach_tiled { return TOKEN_FOREACH_TILED; } -goto { return TOKEN_GOTO; } -if { return TOKEN_IF; } -inline { return TOKEN_INLINE; } -int { return TOKEN_INT; } -int8 { return TOKEN_INT8; } -int16 { return TOKEN_INT16; } -int32 { return TOKEN_INT; } -int64 { return TOKEN_INT64; } -launch { return TOKEN_LAUNCH; } -NULL { return TOKEN_NULL; } -print { return TOKEN_PRINT; } -reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; " - "please use C++-style '&' syntax for references " - "instead."); } -return { return TOKEN_RETURN; } -soa { return TOKEN_SOA; } -signed { return TOKEN_SIGNED; } -sizeof { return TOKEN_SIZEOF; } -static { return TOKEN_STATIC; } -struct { return TOKEN_STRUCT; } -switch { return TOKEN_SWITCH; } -sync { return TOKEN_SYNC; } -task { return TOKEN_TASK; } -true { return TOKEN_TRUE; } -typedef { return TOKEN_TYPEDEF; } -uniform { return TOKEN_UNIFORM; } -unsigned { return TOKEN_UNSIGNED; } -varying { return TOKEN_VARYING; } -void { 
return TOKEN_VOID; } -while { return TOKEN_WHILE; } -\"C\" { return TOKEN_STRING_C_LITERAL; } -\.\.\. { return TOKEN_DOTDOTDOT; } +__assert { RT; return TOKEN_ASSERT; } +bool { RT; return TOKEN_BOOL; } +break { RT; return TOKEN_BREAK; } +case { RT; return TOKEN_CASE; } +cbreak { RT; return TOKEN_CBREAK; } +ccontinue { RT; return TOKEN_CCONTINUE; } +cdo { RT; return TOKEN_CDO; } +cfor { RT; return TOKEN_CFOR; } +cif { RT; return TOKEN_CIF; } +cwhile { RT; return TOKEN_CWHILE; } +const { RT; return TOKEN_CONST; } +continue { RT; return TOKEN_CONTINUE; } +creturn { RT; return TOKEN_CRETURN; } +default { RT; return TOKEN_DEFAULT; } +do { RT; return TOKEN_DO; } +delete { RT; return TOKEN_DELETE; } +delete\[\] { RT; return TOKEN_DELETE; } +double { RT; return TOKEN_DOUBLE; } +else { RT; return TOKEN_ELSE; } +enum { RT; return TOKEN_ENUM; } +export { RT; return TOKEN_EXPORT; } +extern { RT; return TOKEN_EXTERN; } +false { RT; return TOKEN_FALSE; } +float { RT; return TOKEN_FLOAT; } +for { RT; return TOKEN_FOR; } +foreach { RT; return TOKEN_FOREACH; } +foreach_tiled { RT; return TOKEN_FOREACH_TILED; } +goto { RT; return TOKEN_GOTO; } +if { RT; return TOKEN_IF; } +inline { RT; return TOKEN_INLINE; } +int { RT; return TOKEN_INT; } +int8 { RT; return TOKEN_INT8; } +int16 { RT; return TOKEN_INT16; } +int32 { RT; return TOKEN_INT; } +int64 { RT; return TOKEN_INT64; } +launch { RT; return TOKEN_LAUNCH; } +new { RT; return TOKEN_NEW; } +NULL { RT; return TOKEN_NULL; } +print { RT; return TOKEN_PRINT; } +reference { Error(yylloc, "\"reference\" qualifier is no longer supported; " + "please use C++-style '&' syntax for references " + "instead."); } +return { RT; return TOKEN_RETURN; } +soa { RT; return TOKEN_SOA; } +signed { RT; return TOKEN_SIGNED; } +sizeof { RT; return TOKEN_SIZEOF; } +static { RT; return TOKEN_STATIC; } +struct { RT; return TOKEN_STRUCT; } +switch { RT; return TOKEN_SWITCH; } +sync { RT; return TOKEN_SYNC; } +task { RT; return TOKEN_TASK; } +true { RT; return TOKEN_TRUE; } +typedef { RT; return TOKEN_TYPEDEF; } +uniform { RT; return TOKEN_UNIFORM; } +unsigned { RT; return TOKEN_UNSIGNED; } +varying { RT; return TOKEN_VARYING; } +void { RT; return TOKEN_VOID; } +while { RT; return TOKEN_WHILE; } +\"C\" { RT; return TOKEN_STRING_C_LITERAL; } +\.\.\. { RT; return TOKEN_DOTDOTDOT; } -L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; } +L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERAL; } {IDENT} { + RT; /* We have an identifier--is it a type name or an identifier? The symbol table will straighten us out... */ - yylval->stringVal = new std::string(yytext); + yylval.stringVal = new std::string(yytext); if (m->symbolTable->LookupType(yytext) != NULL) return TOKEN_TYPE_NAME; else @@ -149,18 +408,19 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; } {INT_NUMBER}+(u|U|l|L)*? { + RT; int ls = 0, us = 0; char *endPtr = NULL; if (yytext[0] == '0' && yytext[1] == 'b') - yylval->intVal = lParseBinary(yytext+2, *yylloc, &endPtr); + yylval.intVal = lParseBinary(yytext+2, yylloc, &endPtr); else { #if defined(ISPC_IS_WINDOWS) && !defined(__MINGW32__) - yylval->intVal = _strtoi64(yytext, &endPtr, 0); + yylval.intVal = _strtoui64(yytext, &endPtr, 0); #else // FIXME: should use strtouq and then issue an error if we can't // fit into 64 bits... 
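    // (Illustrative note, added for clarity: binary "0b..." constants are
    // parsed by lParseBinary above, while decimal/hex/octal constants go
    // through strtoull; the "u"/"l" suffixes and the k/M/G multiplier
    // suffixes are then examined in the code that follows.)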
- yylval->intVal = strtoull(yytext, &endPtr, 0); + yylval.intVal = strtoull(yytext, &endPtr, 0); #endif } @@ -178,11 +438,11 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; us++; } if (kilo) - yylval->intVal *= 1024; + yylval.intVal *= 1024; if (mega) - yylval->intVal *= 1024*1024; + yylval.intVal *= 1024*1024; if (giga) - yylval->intVal *= 1024*1024*1024; + yylval.intVal *= 1024*1024*1024; if (ls >= 2) return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT; @@ -190,7 +450,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT; // See if we can fit this into a 32-bit integer... - if ((yylval->intVal & 0xffffffff) == yylval->intVal) + if ((yylval.intVal & 0xffffffff) == yylval.intVal) return us ? TOKEN_UINT32_CONSTANT : TOKEN_INT32_CONSTANT; else return us ? TOKEN_UINT64_CONSTANT : TOKEN_INT64_CONSTANT; @@ -198,74 +458,76 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; {FLOAT_NUMBER} { - yylval->floatVal = (float)atof(yytext); + RT; + yylval.floatVal = (float)atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { - yylval->floatVal = (float)lParseHexFloat(yytext); + RT; + yylval.floatVal = (float)lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } -"++" { return TOKEN_INC_OP; } -"--" { return TOKEN_DEC_OP; } -"<<" { return TOKEN_LEFT_OP; } -">>" { return TOKEN_RIGHT_OP; } -"<=" { return TOKEN_LE_OP; } -">=" { return TOKEN_GE_OP; } -"==" { return TOKEN_EQ_OP; } -"!=" { return TOKEN_NE_OP; } -"&&" { return TOKEN_AND_OP; } -"||" { return TOKEN_OR_OP; } -"*=" { return TOKEN_MUL_ASSIGN; } -"/=" { return TOKEN_DIV_ASSIGN; } -"%=" { return TOKEN_MOD_ASSIGN; } -"+=" { return TOKEN_ADD_ASSIGN; } -"-=" { return TOKEN_SUB_ASSIGN; } -"<<=" { return TOKEN_LEFT_ASSIGN; } -">>=" { return TOKEN_RIGHT_ASSIGN; } -"&=" { return TOKEN_AND_ASSIGN; } -"^=" { return TOKEN_XOR_ASSIGN; } -"|=" { return TOKEN_OR_ASSIGN; } -"->" { return TOKEN_PTR_OP; } -";" { return ';'; } -("{"|"<%") { return '{'; } -("}"|"%>") { return '}'; } -"," { return ','; } -":" { return ':'; } -"=" { return '='; } -"(" { return '('; } -")" { return ')'; } -("["|"<:") { return '['; } -("]"|":>") { return ']'; } -"." { return '.'; } -"&" { return '&'; } -"!" { return '!'; } -"~" { return '~'; } -"-" { return '-'; } -"+" { return '+'; } -"*" { return '*'; } -"/" { return '/'; } -"%" { return '%'; } -"<" { return '<'; } -">" { return '>'; } -"^" { return '^'; } -"|" { return '|'; } -"?" 
{ return '?'; } +"++" { RT; return TOKEN_INC_OP; } +"--" { RT; return TOKEN_DEC_OP; } +"<<" { RT; return TOKEN_LEFT_OP; } +">>" { RT; return TOKEN_RIGHT_OP; } +"<=" { RT; return TOKEN_LE_OP; } +">=" { RT; return TOKEN_GE_OP; } +"==" { RT; return TOKEN_EQ_OP; } +"!=" { RT; return TOKEN_NE_OP; } +"&&" { RT; return TOKEN_AND_OP; } +"||" { RT; return TOKEN_OR_OP; } +"*=" { RT; return TOKEN_MUL_ASSIGN; } +"/=" { RT; return TOKEN_DIV_ASSIGN; } +"%=" { RT; return TOKEN_MOD_ASSIGN; } +"+=" { RT; return TOKEN_ADD_ASSIGN; } +"-=" { RT; return TOKEN_SUB_ASSIGN; } +"<<=" { RT; return TOKEN_LEFT_ASSIGN; } +">>=" { RT; return TOKEN_RIGHT_ASSIGN; } +"&=" { RT; return TOKEN_AND_ASSIGN; } +"^=" { RT; return TOKEN_XOR_ASSIGN; } +"|=" { RT; return TOKEN_OR_ASSIGN; } +"->" { RT; return TOKEN_PTR_OP; } +";" { RT; return ';'; } +("{"|"<%") { RT; return '{'; } +("}"|"%>") { RT; return '}'; } +"," { RT; return ','; } +":" { RT; return ':'; } +"=" { RT; return '='; } +"(" { RT; return '('; } +")" { RT; return ')'; } +("["|"<:") { RT; return '['; } +("]"|":>") { RT; return ']'; } +"." { RT; return '.'; } +"&" { RT; return '&'; } +"!" { RT; return '!'; } +"~" { RT; return '~'; } +"-" { RT; return '-'; } +"+" { RT; return '+'; } +"*" { RT; return '*'; } +"/" { RT; return '/'; } +"%" { RT; return '%'; } +"<" { RT; return '<'; } +">" { RT; return '>'; } +"^" { RT; return '^'; } +"|" { RT; return '|'; } +"?" { RT; return '?'; } {WHITESPACE} { } \n { - yylloc->last_line++; - yylloc->last_column = 1; + yylloc.last_line++; + yylloc.last_column = 1; } #(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* { - lHandleCppHash(yylloc); + lHandleCppHash(&yylloc); } . { - Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0])); + Error(yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0])); YY_USER_ACTION } @@ -306,8 +568,10 @@ lParseBinary(const char *ptr, SourcePos pos, char **endPtr) { static void lCComment(SourcePos *pos) { char c, prev = 0; - + while ((c = yyinput()) != 0) { + ++pos->last_column; + if (c == '\n') { pos->last_line++; pos->last_column = 1; diff --git a/llvmutil.cpp b/llvmutil.cpp index 808babbc..e5c4785e 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -597,6 +597,9 @@ LLVMFlattenInsertChain(llvm::InsertElementInst *ie, int vectorWidth, bool LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength, std::vector &seenPhis) { + if (vectorLength == 1) + return true; + if (llvm::isa(v)) return true; @@ -604,6 +607,12 @@ LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength, if (cv != NULL) return (cv->getSplatValue() != NULL); +#ifdef LLVM_3_1svn + llvm::ConstantDataVector *cdv = llvm::dyn_cast(v); + if (cdv != NULL) + return (cdv->getSplatValue() != NULL); +#endif + llvm::BinaryOperator *bop = llvm::dyn_cast(v); if (bop != NULL) return (LLVMVectorValuesAllEqual(bop->getOperand(0), vectorLength, @@ -669,6 +678,10 @@ LLVMVectorValuesAllEqual(llvm::Value *v, int vectorLength, return true; } + if (llvm::isa(v)) + // ? 
+        return false;
+
     Assert(!llvm::isa(v));
 
     if (llvm::isa(v) ||
         llvm::isa(v) ||
diff --git a/main.cpp b/main.cpp
index 7874aa81..7b8c66d5 100644
--- a/main.cpp
+++ b/main.cpp
@@ -41,6 +41,9 @@
 #include "type.h"
 #include 
 #include 
+#ifdef ISPC_IS_WINDOWS
+  #include 
+#endif // ISPC_IS_WINDOWS
 #include 
 #include 
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
@@ -90,7 +93,6 @@ usage(int ret) {
     printf("    [--cpu=]\t\t\tSelect target CPU type\n");
     printf("    ={%s}\n", Target::SupportedTargetCPUs());
     printf("    [-D]\t\t\t\t#define given value when running preprocessor\n");
-    printf("    [--debug]\t\t\t\tPrint information useful for debugging ispc\n");
     printf("    [--emit-asm]\t\t\tGenerate assembly language file as output\n");
 #ifndef LLVM_2_9
     printf("    [--emit-c++]\t\t\tEmit a C++ source file as output\n");
@@ -99,7 +101,9 @@ usage(int ret) {
     printf("    [--emit-obj]\t\t\tGenerate object file as output (default)\n");
     printf("    [-g]\t\t\t\tGenerate debugging information\n");
     printf("    [--help]\t\t\t\tPrint help\n");
+    printf("    [--help-dev]\t\t\tPrint help for developer options\n");
     printf("    [-h /--header-outfile=]\tOutput filename for header\n");
+    printf("    [-I ]\t\t\t\tAdd to #include file search path\n");
     printf("    [--instrument]\t\t\tEmit instrumentation to gather performance data\n");
     printf("    [--math-lib=