Pass log/exp/pow transcendentals through to targets that support them.
Currently, only the generic targets do.
This commit is contained in:
@@ -886,10 +886,12 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
symbolTable);
|
||||
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
|
||||
symbolTable);
|
||||
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload,
|
||||
module, symbolTable);
|
||||
|
||||
lDefineConstantInt("__have_native_half", (g->target.isa == Target::AVX2),
|
||||
lDefineConstantInt("__have_native_half", g->target.hasHalf, module,
|
||||
symbolTable);
|
||||
lDefineConstantInt("__have_native_transcendentals", g->target.hasTranscendentals,
|
||||
module, symbolTable);
|
||||
|
||||
if (includeStdlibISPC) {
|
||||
|
||||
@@ -1654,6 +1654,13 @@ declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <WIDTH x i64>,
|
||||
declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
|
||||
<WIDTH x i64>, <WIDTH x MASK>) nounwind
|
||||
|
||||
declare float @__log_uniform_float(float) nounwind readnone
|
||||
declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__exp_uniform_float(float) nounwind readnone
|
||||
declare <WIDTH x float> @__exp_varying_float(<WIDTH x float>) nounwind readnone
|
||||
declare float @__pow_uniform_float(float, float) nounwind readnone
|
||||
declare <WIDTH x float> @__pow_varying_float(<WIDTH x float>, <WIDTH x float>) nounwind readnone
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; vector ops
|
||||
|
||||
|
||||
@@ -586,6 +586,121 @@ ROTATE(__vec16_f, float, float)
|
||||
SHUFFLES(__vec16_f, float, float)
|
||||
LOAD_STORE(__vec16_f, float)
|
||||
|
||||
// Scalar exponential: forwards its argument straight to expf() and returns
// whatever expf() produces.
static FORCEINLINE float __exp_uniform_float(float v) {
    const float result = expf(v);
    return result;
}
|
||||
|
||||
// Element-wise exponential: applies expf() to each of the 16 elements of v.v
// and returns the results in the matching positions of a new vector.
static FORCEINLINE __vec16_f __exp_varying_float(__vec16_f v) {
    __vec16_f result;
    for (int lane = 0; lane < 16; lane++) {
        result.v[lane] = expf(v.v[lane]);
    }
    return result;
}
|
||||
|
||||
// Scalar natural logarithm: forwards its argument straight to logf() and
// returns whatever logf() produces.
static FORCEINLINE float __log_uniform_float(float v) {
    const float result = logf(v);
    return result;
}
|
||||
|
||||
// Element-wise natural logarithm: applies logf() to each of the 16 elements
// of v.v and returns the results in the matching positions of a new vector.
static FORCEINLINE __vec16_f __log_varying_float(__vec16_f v) {
    __vec16_f result;
    for (int lane = 0; lane < 16; lane++) {
        result.v[lane] = logf(v.v[lane]);
    }
    return result;
}
|
||||
|
||||
// Scalar power function: returns powf(a, b), i.e. a is the base and b is the
// exponent in the call to powf().
static FORCEINLINE float __pow_uniform_float(float a, float b) {
    const float result = powf(a, b);
    return result;
}
|
||||
|
||||
// Element-wise power function: for each of the 16 element positions, computes
// powf(a.v[i], b.v[i]) and stores it at the same position of the result.
static FORCEINLINE __vec16_f __pow_varying_float(__vec16_f a, __vec16_f b) {
    __vec16_f result;
    for (int lane = 0; lane < 16; lane++) {
        result.v[lane] = powf(a.v[lane], b.v[lane]);
    }
    return result;
}
|
||||
|
||||
// Reinterprets the storage of a float as an int by writing one union member
// and reading the other; no numeric conversion is performed.
static FORCEINLINE int __intbits(float v) {
    union {
        float asFloat;
        int asInt;
    } pun;

    pun.asFloat = v;
    return pun.asInt;
}
|
||||
|
||||
// Reinterprets the storage of an int as a float by writing one union member
// and reading the other; no numeric conversion is performed.
static FORCEINLINE float __floatbits(int v) {
    union {
        float asFloat;
        int asInt;
    } pun;

    pun.asInt = v;
    return pun.asFloat;
}
|
||||
|
||||
// Converts one half-precision value, passed as its raw 16-bit pattern in h,
// to a single-precision float.
//
// The low 15 bits (exponent and mantissa) are moved into float position and
// the exponent is re-biased; Inf/NaN and zero/denormal inputs get an extra
// adjustment, and the sign bit is OR-ed in last.
//
// All bit assembly is done in uint32_t so that no shift ever moves a 1 into
// the sign bit of a signed integer (the previous "(int32_t)(h & 0x8000) << 16"
// did exactly that).  The only signed step left is the hand-off to
// __floatbits(), which takes an int.
static FORCEINLINE float __half_to_float_uniform(int16_t h) {
    static const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift

    // Zero-extended copy of the 16 input bits.
    const uint32_t hbits = (uint16_t)h;

    uint32_t o = (hbits & 0x7fffu) << 13;    // exponent/mantissa bits
    const uint32_t exp = shifted_exp & o;     // just the exponent
    o += (uint32_t)(127 - 15) << 23;          // exponent adjust

    // handle exponent special cases
    if (exp == shifted_exp) {                 // Inf/NaN?
        o += (uint32_t)(128 - 16) << 23;      // extra exp adjust
    }
    else if (exp == 0) {                      // Zero/Denormal?
        o += 1u << 23;                        // extra exp adjust
        // renormalize: subtract the float whose bit pattern is 113 << 23
        o = (uint32_t)__intbits(__floatbits((int)o) - __floatbits(113 << 23));
    }

    o |= (hbits & 0x8000u) << 16;             // sign bit
    return __floatbits((int)o);
}
|
||||
|
||||
|
||||
// Element-wise half-to-float conversion: runs __half_to_float_uniform() on
// each of the 16 elements of v.v and stores the floats at the same positions.
static FORCEINLINE __vec16_f __half_to_float_varying(__vec16_i16 v) {
    __vec16_f result;
    for (int lane = 0; lane < 16; lane++) {
        result.v[lane] = __half_to_float_uniform(v.v[lane]);
    }
    return result;
}
|
||||
|
||||
|
||||
// Converts a single-precision float to a half-precision value, returned as
// its raw 16-bit pattern.
//
// The sign is split off first.  Magnitudes above the float infinity pattern
// (NaNs) produce 0x7e00 and infinity produces 0x7c00.  Every other magnitude
// is masked with round_mask, scaled by the float whose bits are "magic",
// offset by round_mask, clamped to f16infty, and shifted down by 13 bits.
//
// All bit manipulation is done in uint32_t.  The previous version shifted a
// negative int32_t right ("sign >> 16") and mixed int32_t with uint32_t
// operands, both of which depend on implementation-defined behavior.  The
// magnitude has its sign bit cleared, so the unsigned comparisons below order
// values exactly as the signed ones did.
static FORCEINLINE int16_t __float_to_half_uniform(float f) {
    const uint32_t sign_mask = 0x80000000u;
    const uint32_t round_mask = ~0xfffu;
    const uint32_t f32infty = 255u << 23;
    const uint32_t f16infty = 31u << 23;
    const int magic = 15 << 23;

    uint32_t fbits = (uint32_t)__intbits(f);
    const uint32_t sign = fbits & sign_mask;
    fbits ^= sign;                            // magnitude only from here on

    // Inf/NaN result; overwritten below for every finite input.
    uint32_t o = (fbits > f32infty) ? 0x7e00u : 0x7c00u;

    // (De)normalized number or zero
    // Computed unconditionally, as before; the value is simply not used in
    // the Inf/NaN case.
    uint32_t scaled =
        (uint32_t)__intbits(__floatbits((int)(fbits & round_mask)) * __floatbits(magic));
    scaled -= round_mask;                     // wraps modulo 2^32, i.e. adds 0x1000
    if (scaled > f16infty) {
        scaled = f16infty;                    // Clamp to signed infinity if overflowed
    }

    if (fbits < f32infty) {
        o = scaled >> 13;                     // Take the bits!
    }

    return (int16_t)(o | (sign >> 16));
}
|
||||
|
||||
|
||||
// Element-wise float-to-half conversion: runs __float_to_half_uniform() on
// each of the 16 elements of v.v and stores the 16-bit results at the same
// positions.
static FORCEINLINE __vec16_i16 __float_to_half_varying(__vec16_f v) {
    __vec16_i16 result;
    for (int lane = 0; lane < 16; lane++) {
        result.v[lane] = __float_to_half_uniform(v.v[lane]);
    }
    return result;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// double
|
||||
|
||||
|
||||
9
ispc.cpp
9
ispc.cpp
@@ -197,6 +197,9 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->arch = arch;
|
||||
}
|
||||
|
||||
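// Default for most targets: no native half conversions and no native
// transcendentals; the targets that have them override these flags below.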
|
||||
t->hasHalf = t->hasTranscendentals = false;
|
||||
|
||||
if (!strcasecmp(isa, "sse2")) {
|
||||
t->isa = Target::SSE2;
|
||||
t->nativeVectorWidth = 4;
|
||||
@@ -256,6 +259,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->maskingIsFree = true;
|
||||
t->allOffMaskIsSafe = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-32")) {
|
||||
t->isa = Target::GENERIC;
|
||||
@@ -264,6 +269,8 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->maskingIsFree = true;
|
||||
t->allOffMaskIsSafe = true;
|
||||
t->maskBitCount = 1;
|
||||
t->hasHalf = true;
|
||||
t->hasTranscendentals = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "generic-1")) {
|
||||
t->isa = Target::GENERIC;
|
||||
@@ -300,6 +307,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
t->hasHalf = true;
|
||||
}
|
||||
else if (!strcasecmp(isa, "avx2-x2")) {
|
||||
t->isa = Target::AVX2;
|
||||
@@ -309,6 +317,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
|
||||
t->maskingIsFree = false;
|
||||
t->allOffMaskIsSafe = false;
|
||||
t->maskBitCount = 32;
|
||||
t->hasHalf = true;
|
||||
}
|
||||
#endif // !LLVM_3_0
|
||||
else {
|
||||
|
||||
8
ispc.h
8
ispc.h
@@ -249,6 +249,14 @@ struct Target {
|
||||
is 32 on SSE/AVX, since that matches the HW better, but it's 1 for
|
||||
the generic target. */
|
||||
int maskBitCount;
|
||||
|
||||
/** Indicates whether the target has native support for float/half
|
||||
conversions. */
|
||||
bool hasHalf;
|
||||
|
||||
/** Indicates whether the target has support for transcendentals (beyond
|
||||
sqrt, which we assume that all of them handle). */
|
||||
bool hasTranscendentals;
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -1333,6 +1333,11 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
|
||||
else
|
||||
opts.addMacroDef("ISPC_POINTER_SIZE=64");
|
||||
|
||||
if (g->target.hasHalf)
|
||||
opts.addMacroDef("ISPC_TARGET_HAS_HALF");
|
||||
if (g->target.hasTranscendentals)
|
||||
opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS");
|
||||
|
||||
opts.addMacroDef("ISPC_MAJOR_VERSION=1");
|
||||
opts.addMacroDef("ISPC_MINOR_VERSION=2");
|
||||
|
||||
|
||||
28
stdlib.ispc
28
stdlib.ispc
@@ -2915,7 +2915,10 @@ static inline uniform float atan2(uniform float y, uniform float x) {
|
||||
|
||||
__declspec(safe)
|
||||
static inline float exp(float x_full) {
|
||||
if (__math_lib == __math_lib_svml) {
|
||||
if (__have_native_transcendentals) {
|
||||
return __exp_varying_float(x_full);
|
||||
}
|
||||
else if (__math_lib == __math_lib_svml) {
|
||||
return __svml_exp(x_full);
|
||||
}
|
||||
else if (__math_lib == __math_lib_system) {
|
||||
@@ -2994,7 +2997,10 @@ static inline float exp(float x_full) {
|
||||
|
||||
__declspec(safe)
|
||||
static inline uniform float exp(uniform float x_full) {
|
||||
if (__math_lib == __math_lib_system ||
|
||||
if (__have_native_transcendentals) {
|
||||
return __exp_uniform_float(x_full);
|
||||
}
|
||||
else if (__math_lib == __math_lib_system ||
|
||||
__math_lib == __math_lib_svml) {
|
||||
return __stdlib_expf(x_full);
|
||||
}
|
||||
@@ -3116,7 +3122,10 @@ static inline void __range_reduce_log(uniform float input, uniform float * unifo
|
||||
|
||||
__declspec(safe)
|
||||
static inline float log(float x_full) {
|
||||
if (__math_lib == __math_lib_svml) {
|
||||
if (__have_native_transcendentals) {
|
||||
return __log_varying_float(x_full);
|
||||
}
|
||||
else if (__math_lib == __math_lib_svml) {
|
||||
return __svml_log(x_full);
|
||||
}
|
||||
else if (__math_lib == __math_lib_system) {
|
||||
@@ -3204,7 +3213,10 @@ static inline float log(float x_full) {
|
||||
|
||||
__declspec(safe)
|
||||
static inline uniform float log(uniform float x_full) {
|
||||
if (__math_lib == __math_lib_system ||
|
||||
if (__have_native_transcendentals) {
|
||||
return __log_uniform_float(x_full);
|
||||
}
|
||||
else if (__math_lib == __math_lib_system ||
|
||||
__math_lib == __math_lib_svml) {
|
||||
return __stdlib_logf(x_full);
|
||||
}
|
||||
@@ -3285,7 +3297,10 @@ static inline uniform float log(uniform float x_full) {
|
||||
|
||||
__declspec(safe)
|
||||
static inline float pow(float a, float b) {
|
||||
if (__math_lib == __math_lib_svml) {
|
||||
if (__have_native_transcendentals) {
|
||||
return __pow_varying_float(a, b);
|
||||
}
|
||||
else if (__math_lib == __math_lib_svml) {
|
||||
return __svml_pow(a, b);
|
||||
}
|
||||
else if (__math_lib == __math_lib_system) {
|
||||
@@ -3304,6 +3319,9 @@ static inline float pow(float a, float b) {
|
||||
|
||||
__declspec(safe)
|
||||
static inline uniform float pow(uniform float a, uniform float b) {
|
||||
if (__have_native_transcendentals) {
|
||||
return __pow_uniform_float(a, b);
|
||||
}
|
||||
if (__math_lib == __math_lib_system ||
|
||||
__math_lib == __math_lib_svml) {
|
||||
return __stdlib_powf(a, b);
|
||||
|
||||
Reference in New Issue
Block a user