The packed_{load,store}_active functions now take a pointer to the
location at which to start loading or storing, rather than an array
base and a uniform index.
Variants of the prefetch functions that take varying pointers are now
available.
There are now variants of the various atomic functions that take
varying pointers (issue #112).
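For illustration only, a kernel using the new pointer-based variants might look
like the sketch below; the kernel name and the arrays it touches are hypothetical,
and only the standard library calls come from this file.

    // Hypothetical usage sketch (names invented for illustration): each
    // program instance prefetches and atomically updates its own address
    // through a varying pointer.
    export void example_update(uniform float vals[], uniform int counters[],
                               uniform int count) {
        for (uniform int base = 0; base < count; base += programCount) {
            int i = base + programIndex;
            if (i < count) {
                prefetch_l1(&vals[i]);                   // varying-pointer prefetch
                atomic_add_global(&counters[i & 7], 1);  // varying-pointer atomic
            }
        }
    }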
// -*- mode: c++ -*-
/*
  Copyright (c) 2010-2011, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.


   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file stdlib.ispc
    @brief Portion of the ispc standard library implementation that's in
           ispc code
*/

///////////////////////////////////////////////////////////////////////////
// Low level primitives

static inline float floatbits(unsigned int a) {
    return __floatbits_varying_int32(a);
}

static inline uniform float floatbits(uniform unsigned int a) {
    return __floatbits_uniform_int32(a);
}

static inline float floatbits(int a) {
    return __floatbits_varying_int32(a);
}

static inline uniform float floatbits(uniform int a) {
    return __floatbits_uniform_int32(a);
}

static inline double doublebits(unsigned int64 a) {
    return __doublebits_varying_int64(a);
}

static inline uniform double doublebits(uniform unsigned int64 a) {
    return __doublebits_uniform_int64(a);
}

static inline unsigned int intbits(float a) {
    return __intbits_varying_float(a);
}

static inline uniform unsigned int intbits(uniform float a) {
    return __intbits_uniform_float(a);
}

static inline unsigned int64 intbits(double d) {
    return __intbits_varying_double(d);
}

static inline uniform unsigned int64 intbits(uniform double d) {
    return __intbits_uniform_double(d);
}

static inline float broadcast(float v, uniform int i) {
    return __broadcast_float(v, i);
}

static inline int8 broadcast(int8 v, uniform int i) {
    return __broadcast_int8(v, i);
}

static inline int16 broadcast(int16 v, uniform int i) {
    return __broadcast_int16(v, i);
}

static inline int32 broadcast(int32 v, uniform int i) {
    return __broadcast_int32(v, i);
}

static inline double broadcast(double v, uniform int i) {
    return __broadcast_double(v, i);
}

static inline int64 broadcast(int64 v, uniform int i) {
    return __broadcast_int64(v, i);
}

static inline float rotate(float v, uniform int i) {
    return __rotate_float(v, i);
}

static inline int8 rotate(int8 v, uniform int i) {
    return __rotate_int8(v, i);
}

static inline int16 rotate(int16 v, uniform int i) {
    return __rotate_int16(v, i);
}

static inline int32 rotate(int32 v, uniform int i) {
    return __rotate_int32(v, i);
}

static inline double rotate(double v, uniform int i) {
    return __rotate_double(v, i);
}

static inline int64 rotate(int64 v, uniform int i) {
    return __rotate_int64(v, i);
}

static inline float shuffle(float v, int i) {
    return __shuffle_float(v, i);
}

static inline int8 shuffle(int8 v, int i) {
    return __shuffle_int8(v, i);
}

static inline int16 shuffle(int16 v, int i) {
    return __shuffle_int16(v, i);
}

static inline int32 shuffle(int32 v, int i) {
    return __shuffle_int32(v, i);
}

static inline double shuffle(double v, int i) {
    return __shuffle_double(v, i);
}

static inline int64 shuffle(int64 v, int i) {
    return __shuffle_int64(v, i);
}

static inline float shuffle(float v0, float v1, int i) {
    return __shuffle2_float(v0, v1, i);
}

static inline int8 shuffle(int8 v0, int8 v1, int i) {
    return __shuffle2_int8(v0, v1, i);
}

static inline int16 shuffle(int16 v0, int16 v1, int i) {
    return __shuffle2_int16(v0, v1, i);
}

static inline int32 shuffle(int32 v0, int32 v1, int i) {
    return __shuffle2_int32(v0, v1, i);
}

static inline double shuffle(double v0, double v1, int i) {
    return __shuffle2_double(v0, v1, i);
}

static inline int64 shuffle(int64 v0, int64 v1, int i) {
    return __shuffle2_int64(v0, v1, i);
}

// x[i]
static inline uniform float extract(float x, uniform int i) {
    return floatbits(__extract_int32((int)intbits(x), i));
}

static inline uniform int8 extract(int8 x, uniform int i) {
    return __extract_int8(x, i);
}

static inline uniform unsigned int8 extract(unsigned int8 x, uniform int i) {
    return __extract_int8(x, (unsigned int)i);
}

static inline uniform int16 extract(int16 x, uniform int i) {
    return __extract_int16(x, i);
}

static inline uniform unsigned int16 extract(unsigned int16 x, uniform int i) {
    return __extract_int16(x, (unsigned int)i);
}

static inline uniform int32 extract(int32 x, uniform int i) {
    return __extract_int32(x, i);
}

static inline uniform unsigned int32 extract(unsigned int32 x, uniform int i) {
    return __extract_int32(x, (unsigned int)i);
}

static inline uniform double extract(double x, uniform int i) {
    return doublebits(__extract_int64((int64)intbits(x), i));
}

static inline uniform int64 extract(int64 x, uniform int i) {
    return __extract_int64(x, i);
}

static inline uniform unsigned int64 extract(unsigned int64 x, uniform int i) {
    return __extract_int64(x, (unsigned int)i);
}

// x[i] = v
static inline float insert(float x, uniform int i, uniform float v) {
    return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
}

static inline int8 insert(int8 x, uniform int i, uniform int8 v) {
    return __insert_int8(x, i, v);
}

static inline unsigned int8 insert(unsigned int8 x, uniform int i,
                                   uniform unsigned int8 v) {
    return __insert_int8(x, (unsigned int)i, v);
}

static inline int16 insert(int16 x, uniform int i, uniform int16 v) {
    return __insert_int16(x, i, v);
}

static inline unsigned int16 insert(unsigned int16 x, uniform int i,
                                    uniform unsigned int16 v) {
    return __insert_int16(x, (unsigned int)i, v);
}

static inline int32 insert(int32 x, uniform int i, uniform int32 v) {
    return __insert_int32(x, i, v);
}

static inline unsigned int32 insert(unsigned int32 x, uniform int i,
                                    uniform unsigned int32 v) {
    return __insert_int32(x, (unsigned int)i, v);
}

static inline double insert(double x, uniform int i, uniform double v) {
    return doublebits(__insert_int64((int64)intbits(x), i, (int64)intbits(v)));
}

static inline int64 insert(int64 x, uniform int i, uniform int64 v) {
    return __insert_int64(x, i, v);
}

static inline unsigned int64 insert(unsigned int64 x, uniform int i,
                                    uniform unsigned int64 v) {
    return __insert_int64(x, (unsigned int)i, v);
}

static inline uniform int32 sign_extend(uniform bool v) {
    return __sext_uniform_bool(v);
}

static inline int32 sign_extend(bool v) {
    return __sext_varying_bool(v);
}

static inline uniform bool any(bool v) {
    // We only care about whether "any" is true for the active program instances,
    // so we have to mask v with the current program mask.
    return __movmsk(__sext_varying_bool(v) & __mask) != 0;
}

static inline uniform bool all(bool v) {
    // As with any(), we need to explicitly mask v with the current program mask
    // so we're only looking at the current lanes
    int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
    return __movmsk(match) == (1 << programCount) - 1;
}

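// Illustrative note (not part of the library): any() and all() respect the
// current execution mask, so, for example,
//
//     if (any(v < 0)) { /* at least one running program instance has v < 0 */ }
//
// only considers the program instances that are currently active.
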
static inline uniform int32 popcnt(uniform int32 v) {
    return __popcnt_int32(v);
}

static inline uniform int popcnt(uniform int64 v) {
    return (int32)__popcnt_int64(v);
}

static inline int popcnt(int v) {
    int r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, popcnt(extract(v, i)));
    return (r & __mask);
}

static inline int popcnt(int64 v) {
    int r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, popcnt(extract(v, i)));
    return (r & __mask);
}

static inline uniform int popcnt(bool v) {
    // As with any() and all(), only count across the active lanes
    return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask));
}

static inline uniform int lanemask() {
    return __movmsk(__mask);
}

///////////////////////////////////////////////////////////////////////////
// AOS/SOA conversion

static inline void
aos_to_soa3(uniform float a[], uniform int offset, float * uniform v0,
            float * uniform v1, float * uniform v2) {
    __aos_to_soa3_float(&a[0], offset, v0, v1, v2);
}

static inline void
soa_to_aos3(float v0, float v1, float v2, uniform float a[],
            uniform int offset) {
    __soa_to_aos3_float(v0, v1, v2, &a[0], offset);
}

static inline void
aos_to_soa4(uniform float a[], uniform int offset, float * uniform v0,
            float * uniform v1, float * uniform v2, float * uniform v3) {
    __aos_to_soa4_float(&a[0], offset, v0, v1, v2, v3);
}

static inline void
soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[],
            uniform int offset) {
    __soa_to_aos4_float(v0, v1, v2, v3, &a[0], offset);
}

static inline void
aos_to_soa3(uniform int32 a[], uniform int offset, int32 * uniform v0,
            int32 * uniform v1, int32 * uniform v2) {
    __aos_to_soa3_int32(&a[0], offset, v0, v1, v2);
}

static inline void
soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[],
            uniform int offset) {
    __soa_to_aos3_int32(v0, v1, v2, &a[0], offset);
}

static inline void
aos_to_soa4(uniform int32 a[], uniform int offset, int32 * uniform v0,
            int32 * uniform v1, int32 * uniform v2, int32 * uniform v3) {
    __aos_to_soa4_int32(&a[0], offset, v0, v1, v2, v3);
}

static inline void
soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[],
            uniform int offset) {
    __soa_to_aos4_int32(v0, v1, v2, v3, &a[0], offset);
}

///////////////////////////////////////////////////////////////////////////
// Prefetching

static inline void prefetch_l1(const void * uniform ptr) {
    __prefetch_read_uniform_1((uniform int8 * uniform)ptr);
}

static inline void prefetch_l2(const void * uniform ptr) {
    __prefetch_read_uniform_2((uniform int8 * uniform)ptr);
}

static inline void prefetch_l3(const void * uniform ptr) {
    __prefetch_read_uniform_3((uniform int8 * uniform)ptr);
}

static inline void prefetch_nt(const void * uniform ptr) {
    __prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
}

static inline void prefetch_l1(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;

    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_l1(p);
    }
}

static inline void prefetch_l2(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;

    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_l2(p);
    }
}

static inline void prefetch_l3(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;

    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_l3(p);
    }
}

static inline void prefetch_nt(const void * varying ptr) {
    const void * uniform ptrArray[programCount];
    ptrArray[programIndex] = ptr;

    uniform int mask = lanemask();
    for (uniform int i = 0; i < programCount; ++i) {
        if ((mask & (1 << i)) == 0)
            continue;
        const void * uniform p = ptrArray[i];
        prefetch_nt(p);
    }
}

///////////////////////////////////////////////////////////////////////////
// Horizontal ops / reductions

static inline uniform float reduce_add(float x) {
    // zero the lanes where the mask is off
    return __reduce_add_float(__mask ? x : 0.);
}

static inline uniform float reduce_min(float v) {
    // For the lanes where the mask is off, replace the given value with
    // infinity, so that it doesn't affect the result.
    int iflt_max = 0x7f800000; // infinity
    // Must use __floatbits_varying_int32, not floatbits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_min_float(__mask ? v : __floatbits_varying_int32(iflt_max));
}

static inline uniform float reduce_max(float v) {
    // For the lanes where the mask is off, replace the given value with
    // negative infinity, so that it doesn't affect the result.
    const int iflt_neg_max = 0xff800000; // -infinity
    // Must use __floatbits_varying_int32, not floatbits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_float(__mask ? v : __floatbits_varying_int32(iflt_neg_max));
}

static inline uniform int reduce_add(int x) {
    // Zero out the values for lanes that aren't running
    return __reduce_add_int32(x & __mask);
}

static inline uniform int reduce_min(int v) {
    // Set values for non-running lanes to the maximum integer value so
    // they don't affect the result.
    int int_max = 0x7fffffff;
    return __reduce_min_int32(__mask ? v : int_max);
}

static inline uniform int reduce_max(int v) {
    // Set values for non-running lanes to the minimum integer value so
    // they don't affect the result.
    int int_min = 0x80000000;
    return __reduce_max_int32(__mask ? v : int_min);
}

static inline uniform unsigned int reduce_add(unsigned int x) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_add_uint32(x & __mask);
}

static inline uniform unsigned int reduce_min(unsigned int v) {
    // Set values for non-running lanes to the maximum unsigned integer
    // value so they don't affect the result.
    unsigned int uint_max = 0xffffffff;
    return __reduce_min_uint32(__mask ? v : uint_max);
}

static inline uniform unsigned int reduce_max(unsigned int v) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_max_uint32(__mask ? v : 0);
}

static inline uniform double reduce_add(double x) {
    // zero the lanes where the mask is off
    return __reduce_add_double(__mask ? x : 0.);
}

static inline uniform double reduce_min(double v) {
    int64 iflt_max = 0x7ff0000000000000; // infinity
    // Must use __doublebits_varying_int64, not doublebits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max));
}

static inline uniform double reduce_max(double v) {
    const int64 iflt_neg_max = 0xfff0000000000000; // -infinity
    // Must use __doublebits_varying_int64, not doublebits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
}

static inline uniform int64 reduce_add(int64 x) {
    // Zero out the values for lanes that aren't running
    return __reduce_add_int64(x & (int64)(__mask));
}

static inline uniform int64 reduce_min(int64 v) {
    // Set values for non-running lanes to the maximum integer value so
    // they don't affect the result.
    int64 int_max = 0x7fffffffffffffff;
    return __reduce_min_int64(__mask ? v : int_max);
}

static inline uniform int64 reduce_max(int64 v) {
    // Set values for non-running lanes to the minimum integer value so
    // they don't affect the result.
    int64 int_min = 0x8000000000000000;
    return __reduce_max_int64(__mask ? v : int_min);
}

static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_add_int64(x & (int64)(__mask));
}

static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
    // Set values for non-running lanes to the maximum unsigned integer
    // value so they don't affect the result.
    unsigned int64 uint_max = 0xffffffffffffffff;
    return __reduce_min_uint64(__mask ? v : uint_max);
}

static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_max_uint64(__mask ? v : 0);
}

#define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE) \
static inline uniform bool reduce_equal(TYPE v) { \
    uniform TYPE unusedValue; \
    return __reduce_equal_##FUNCTYPE(v, &unusedValue, (MASKTYPE)__mask); \
} \
static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { \
    return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \
}

REDUCE_EQUAL(int32, int32, int32)
REDUCE_EQUAL(unsigned int32, int32, unsigned int32)
REDUCE_EQUAL(float, float, int32)
REDUCE_EQUAL(int64, int64, int32)
REDUCE_EQUAL(unsigned int64, int64, unsigned int32)
REDUCE_EQUAL(double, double, int32)

static int32 exclusive_scan_add(int32 v) {
    return __exclusive_scan_add_i32(v, (int32)__mask);
}

static unsigned int32 exclusive_scan_add(unsigned int32 v) {
    return __exclusive_scan_add_i32(v, __mask);
}

static float exclusive_scan_add(float v) {
    return __exclusive_scan_add_float(v, __mask);
}

static int64 exclusive_scan_add(int64 v) {
    return __exclusive_scan_add_i64(v, (int32)__mask);
}

static unsigned int64 exclusive_scan_add(unsigned int64 v) {
    return __exclusive_scan_add_i64(v, __mask);
}

static double exclusive_scan_add(double v) {
    return __exclusive_scan_add_double(v, __mask);
}

static int32 exclusive_scan_and(int32 v) {
    return __exclusive_scan_and_i32(v, (int32)__mask);
}

static unsigned int32 exclusive_scan_and(unsigned int32 v) {
    return __exclusive_scan_and_i32(v, __mask);
}

static int64 exclusive_scan_and(int64 v) {
    return __exclusive_scan_and_i64(v, (int32)__mask);
}

static unsigned int64 exclusive_scan_and(unsigned int64 v) {
    return __exclusive_scan_and_i64(v, __mask);
}

static int32 exclusive_scan_or(int32 v) {
    return __exclusive_scan_or_i32(v, (int32)__mask);
}

static unsigned int32 exclusive_scan_or(unsigned int32 v) {
    return __exclusive_scan_or_i32(v, __mask);
}

static int64 exclusive_scan_or(int64 v) {
    return __exclusive_scan_or_i64(v, (int32)__mask);
}

static unsigned int64 exclusive_scan_or(unsigned int64 v) {
    return __exclusive_scan_or_i64(v, __mask);
}

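// Illustrative sketch (not part of the library): exclusive_scan_add() can be
// used to give each running program instance a distinct output slot; "out",
// "base", and "value" below are hypothetical.
//
//     int offset = exclusive_scan_add(1);   // 0, 1, 2, ... over the active lanes
//     out[base + offset] = value;           // each active instance gets its own slot
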
///////////////////////////////////////////////////////////////////////////
// packed load, store

static inline uniform int
packed_load_active(uniform unsigned int * uniform a,
                   unsigned int * uniform vals) {
    return __packed_load_active(a, vals, (unsigned int32)__mask);
}

static inline uniform int
packed_store_active(uniform unsigned int * uniform a,
                    unsigned int vals) {
    return __packed_store_active(a, vals, (unsigned int32)__mask);
}

static inline uniform int
packed_load_active(uniform int * uniform a, int * uniform vals) {
    return __packed_load_active(a, vals, (int32)__mask);
}

static inline uniform int
packed_store_active(uniform int * uniform a, int vals) {
    return __packed_store_active(a, vals, (int32)__mask);
}

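// Illustrative sketch (not part of the library): packed_store_active() takes
// a pointer to the location at which to start storing, which makes stream
// compaction straightforward; "outArray", "outCount", and "value" below are
// hypothetical.
//
//     uniform int num = packed_store_active(&outArray[outCount], value);
//     outCount += num;   // advance by the number of values just written
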
///////////////////////////////////////////////////////////////////////////
// System information

static inline int num_cores() {
    return __num_cores();
}

///////////////////////////////////////////////////////////////////////////
// Atomics and memory barriers

static inline void memory_barrier() {
    __memory_barrier();
}

#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier(); \
    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
                                                            (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
                                                              (MASKTYPE)__mask); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
} \

#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value); \
    TA ret; \
    if (lanemask() != 0) { \
        memory_barrier(); \
        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, \
                                                     (MASKTYPE)__mask); \
        memory_barrier(); \
    } \
    return ret; \
} \
static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) { \
    memory_barrier(); \
    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, \
                                                            (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
                                       TA value) { \
    uniform TA * uniform ptrArray[programCount]; \
    ptrArray[programIndex] = ptr; \
    memory_barrier(); \
    TA ret; \
    uniform int mask = lanemask(); \
    for (uniform int i = 0; i < programCount; ++i) { \
        if ((mask & (1 << i)) == 0) \
            continue; \
        uniform TA * uniform p = ptrArray[i]; \
        uniform TA v = extract(value, i); \
        uniform TA r = __atomic_##OPB##_uniform_##TB##_global(p, v, \
                                                              (MASKTYPE)__mask); \
        ret = insert(ret, i, r); \
    } \
    memory_barrier(); \
    return ret; \
}

DEFINE_ATOMIC_OP(int32,int32,add,add,int32)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32)
DEFINE_ATOMIC_OP(int32,int32,and,and,int32)
DEFINE_ATOMIC_OP(int32,int32,or,or,int32)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32)
DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32)

DEFINE_ATOMIC_OP(float,float,swap,swap,int32)

DEFINE_ATOMIC_OP(int64,int64,add,add,int32)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32)
DEFINE_ATOMIC_OP(int64,int64,and,and,int32)
DEFINE_ATOMIC_OP(int64,int64,or,or,int32)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32)
DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32)

// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32)
DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32)

DEFINE_ATOMIC_OP(double,double,swap,swap,int32)

#undef DEFINE_ATOMIC_OP

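// Illustrative sketch (not part of the library): the varying-pointer variants
// defined above let each program instance atomically update a different
// location (see issue #112); "buckets" and "index" below are hypothetical.
//
//     int old = atomic_add_global(&buckets[index], 1);
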
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
static inline TA atomic_compare_exchange_global( \
        uniform TA * uniform ptr, TA oldval, TA newval) { \
    memory_barrier(); \
    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, \
                                                     (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
} \
static inline uniform TA atomic_compare_exchange_global( \
        uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
    memory_barrier(); \
    uniform TA ret = \
        __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, \
                                                        (MASKTYPE)__mask); \
    memory_barrier(); \
    return ret; \
}

ATOMIC_DECL_CMPXCHG(int32, int32, int32)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32)
ATOMIC_DECL_CMPXCHG(float, float, int32)
ATOMIC_DECL_CMPXCHG(int64, int64, int32)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32)
ATOMIC_DECL_CMPXCHG(double, double, int32)

#undef ATOMIC_DECL_CMPXCHG

///////////////////////////////////////////////////////////////////////////
|
|
// Floating-Point Math
|
|
|
|
static inline float abs(float a) {
|
|
// Floating-point hack: zeroing the high bit clears the sign
|
|
unsigned int i = intbits(a);
|
|
i &= 0x7fffffff;
|
|
return floatbits(i);
|
|
}
|
|
|
|
static inline uniform float abs(uniform float a) {
|
|
uniform unsigned int i = intbits(a);
|
|
i &= 0x7fffffff;
|
|
return floatbits(i);
|
|
}
|
|
|
|
static inline double abs(double a) {
|
|
// zeroing the high bit clears the sign
|
|
unsigned int64 i = intbits(a);
|
|
i &= 0x7fffffffffffffff;
|
|
return doublebits(i);
|
|
}
|
|
|
|
static inline uniform double abs(uniform double a) {
|
|
uniform unsigned int64 i = intbits(a);
|
|
i &= 0x7fffffffffffffff;
|
|
return doublebits(i);
|
|
}
|
|
|
|
static inline unsigned int signbits(float x) {
|
|
unsigned int i = intbits(x);
|
|
return (i & 0x80000000);
|
|
}
|
|
|
|
static inline uniform unsigned int signbits(uniform float x) {
|
|
uniform unsigned int i = intbits(x);
|
|
return (i & 0x80000000);
|
|
}
|
|
|
|
static inline unsigned int64 signbits(double x) {
|
|
unsigned int64 i = intbits(x);
|
|
return (i & 0x8000000000000000);
|
|
}
|
|
|
|
static inline uniform unsigned int64 signbits(uniform double x) {
|
|
uniform unsigned int64 i = intbits(x);
|
|
return (i & 0x8000000000000000);
|
|
}
|
|
|
|
static inline float round(float x) {
|
|
return __round_varying_float(x);
|
|
}
|
|
|
|
static inline uniform float round(uniform float x) {
|
|
return __round_uniform_float(x);
|
|
}
|
|
|
|
static inline double round(double x) {
|
|
return __round_varying_double(x);
|
|
}
|
|
|
|
static inline uniform double round(uniform double x) {
|
|
return __round_uniform_double(x);
|
|
}
|
|
|
|
static inline float floor(float x) {
|
|
return __floor_varying_float(x);
|
|
}
|
|
|
|
static inline uniform float floor(uniform float x) {
|
|
return __floor_uniform_float(x);
|
|
}
|
|
|
|
static inline double floor(double x) {
|
|
return __floor_varying_double(x);
|
|
}
|
|
|
|
static inline uniform double floor(uniform double x) {
|
|
return __floor_uniform_double(x);
|
|
}
|
|
|
|
static inline float ceil(float x) {
|
|
return __ceil_varying_float(x);
|
|
}
|
|
|
|
static inline uniform float ceil(uniform float x) {
|
|
return __ceil_uniform_float(x);
|
|
}
|
|
|
|
static inline double ceil(double x) {
|
|
return __ceil_varying_double(x);
|
|
}
|
|
|
|
static inline uniform double ceil(uniform double x) {
|
|
return __ceil_uniform_double(x);
|
|
}
|
|
|
|
static inline float rcp(float v) {
|
|
return __rcp_varying_float(v);
|
|
}
|
|
|
|
static inline uniform float rcp(uniform float v) {
|
|
return __rcp_uniform_float(v);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// min/max
|
|
|
|
// float
|
|
|
|
static inline float min(float a, float b) {
|
|
return __min_varying_float(a, b);
|
|
}
|
|
|
|
static inline uniform float min(uniform float a, uniform float b) {
|
|
return __min_uniform_float(a, b);
|
|
}
|
|
|
|
static inline float max(float a, float b) {
|
|
return __max_varying_float(a, b);
|
|
}
|
|
|
|
static inline uniform float max(uniform float a, uniform float b) {
|
|
return __max_uniform_float(a, b);
|
|
}
|
|
|
|
|
|
// double
|
|
|
|
static inline double min(double a, double b) {
|
|
return __min_varying_double(a, b);
|
|
}
|
|
|
|
static inline uniform double min(uniform double a, uniform double b) {
|
|
return __min_uniform_double(a, b);
|
|
}
|
|
|
|
static inline double max(double a, double b) {
|
|
return __max_varying_double(a, b);
|
|
}
|
|
|
|
static inline uniform double max(uniform double a, uniform double b) {
|
|
return __max_uniform_double(a, b);
|
|
}
|
|
|
|
// int8
|
|
|
|
static inline uniform unsigned int8 min(uniform unsigned int8 a,
|
|
uniform unsigned int8 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline uniform unsigned int8 max(uniform unsigned int8 a,
|
|
uniform unsigned int8 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
static inline uniform int8 min(uniform int8 a, uniform int8 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline uniform int8 max(uniform int8 a, uniform int8 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
static inline int8 min(int8 a, int8 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline int8 max(int8 a, int8 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
// int16
|
|
|
|
static inline uniform unsigned int16 min(uniform unsigned int16 a,
|
|
uniform unsigned int16 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline uniform unsigned int16 max(uniform unsigned int16 a,
|
|
uniform unsigned int16 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
static inline uniform int16 min(uniform int16 a, uniform int16 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline uniform int16 max(uniform int16 a, uniform int16 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
static inline int16 min(int16 a, int16 b) {
|
|
return (a < b) ? a : b;
|
|
}
|
|
|
|
static inline int16 max(int16 a, int16 b) {
|
|
return (a > b) ? a : b;
|
|
}
|
|
|
|
// int32
|
|
|
|
static inline unsigned int min(unsigned int a, unsigned int b) {
|
|
return __min_varying_uint32(a, b);
|
|
}
|
|
|
|
static inline uniform unsigned int min(uniform unsigned int a, uniform unsigned int b) {
|
|
return __min_uniform_uint32(a, b);
|
|
}
|
|
|
|
static inline unsigned int max(unsigned int a, unsigned int b) {
|
|
return __max_varying_uint32(a, b);
|
|
}
|
|
|
|
static inline uniform unsigned int max(uniform unsigned int a, uniform unsigned int b) {
|
|
return __max_uniform_uint32(a, b);
|
|
}
|
|
|
|
static inline int min(int a, int b) {
|
|
return __min_varying_int32(a, b);
|
|
}
|
|
|
|
static inline uniform int min(uniform int a, uniform int b) {
|
|
return __min_uniform_int32(a, b);
|
|
}
|
|
|
|
static inline int max(int a, int b) {
|
|
return __max_varying_int32(a, b);
|
|
}
|
|
|
|
static inline uniform int max(uniform int a, uniform int b) {
|
|
return __max_uniform_int32(a, b);
|
|
}
|
|
|
|
// int64
|
|
|
|
static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
|
|
return __min_varying_uint64(a, b);
|
|
}
|
|
|
|
static inline uniform unsigned int64 min(uniform unsigned int64 a, uniform unsigned int64 b) {
|
|
return __min_uniform_uint64(a, b);
|
|
}
|
|
|
|
static inline unsigned int64 max(unsigned int64 a, unsigned int64 b) {
|
|
return __max_varying_uint64(a, b);
|
|
}
|
|
|
|
static inline uniform unsigned int64 max(uniform unsigned int64 a, uniform unsigned int64 b) {
|
|
return __max_uniform_uint64(a, b);
|
|
}
|
|
|
|
static inline int64 min(int64 a, int64 b) {
|
|
return __min_varying_int64(a, b);
|
|
}
|
|
|
|
static inline uniform int64 min(uniform int64 a, uniform int64 b) {
|
|
return __min_uniform_int64(a, b);
|
|
}
|
|
|
|
static inline int64 max(int64 a, int64 b) {
|
|
return __max_varying_int64(a, b);
|
|
}
|
|
|
|
static inline uniform int64 max(uniform int64 a, uniform int64 b) {
|
|
return __max_uniform_int64(a, b);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// clamps
|
|
|
|
// float
|
|
|
|
static inline float clamp(float v, float low, float high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform float clamp(uniform float v, uniform float low, uniform float high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
// int8
|
|
|
|
static inline unsigned int8 clamp(unsigned int8 v, unsigned int8 low,
|
|
unsigned int8 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform unsigned int8 clamp(uniform unsigned int8 v,
|
|
uniform unsigned int8 low,
|
|
uniform unsigned int8 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline int8 clamp(int8 v, int8 low, int8 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform int8 clamp(uniform int8 v, uniform int8 low,
|
|
uniform int8 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
// int16
|
|
|
|
static inline unsigned int16 clamp(unsigned int16 v, unsigned int16 low,
|
|
unsigned int16 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform unsigned int16 clamp(uniform unsigned int16 v,
|
|
uniform unsigned int16 low,
|
|
uniform unsigned int16 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline int16 clamp(int16 v, int16 low, int16 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform int16 clamp(uniform int16 v, uniform int16 low,
|
|
uniform int16 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
// int32
|
|
|
|
static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigned int low,
|
|
uniform unsigned int high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline int clamp(int v, int low, int high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform int clamp(uniform int v, uniform int low, uniform int high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
// int64
|
|
|
|
static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low,
|
|
unsigned int64 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform unsigned int64 clamp(uniform unsigned int64 v,
|
|
uniform unsigned int64 low,
|
|
uniform unsigned int64 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline int64 clamp(int64 v, int64 low, int64 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
|
|
uniform int64 high) {
|
|
return min(max(v, low), high);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// Transcendentals (float precision)
|
|
|
|
static inline float sqrt(float v) {
|
|
return __sqrt_varying_float(v);
|
|
}
|
|
|
|
static inline uniform float sqrt(uniform float v) {
|
|
return __sqrt_uniform_float(v);
|
|
}
|
|
|
|
static inline float rsqrt(float v) {
|
|
return __rsqrt_varying_float(v);
|
|
}
|
|
|
|
static inline uniform float rsqrt(uniform float v) {
|
|
return __rsqrt_uniform_float(v);
|
|
}
|
|
|
|
static inline float ldexp(float x, int n) {
|
|
unsigned int ex = 0x7F800000u;
|
|
unsigned int ix = intbits(x);
|
|
ex &= ix; // extract old exponent;
|
|
ix = ix & ~0x7F800000u; // clear exponent
|
|
n = (n << 23) + ex;
|
|
ix |= n; // insert new exponent
|
|
return floatbits(ix);
|
|
}
|
|
|
|
static inline uniform float ldexp(uniform float x, uniform int n) {
|
|
uniform unsigned int ex = 0x7F800000u;
|
|
uniform unsigned int ix = intbits(x);
|
|
ex &= ix; // extract old exponent;
|
|
ix = ix & ~0x7F800000u; // clear exponent
|
|
n = (n << 23) + ex;
|
|
ix |= n; // insert new exponent
|
|
return floatbits(ix);
|
|
}
|
|
|
|
static inline float frexp(float x, int * uniform pw2) {
|
|
unsigned int ex = 0x7F800000u; // exponent mask
|
|
unsigned int ix = intbits(x);
|
|
ex &= ix;
|
|
ix &= ~0x7F800000u; // clear exponent
|
|
*pw2 = (int)(ex >> 23) - 126; // compute exponent
|
|
ix |= 0x3F000000u; // insert exponent +1 in x
|
|
return floatbits(ix);
|
|
}
|
|
|
|
static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
|
|
uniform unsigned int ex = 0x7F800000u; // exponent mask
|
|
uniform unsigned int ix = intbits(x);
|
|
ex &= ix;
|
|
ix &= ~0x7F800000u; // clear exponent
|
|
*pw2 = (uniform int)(ex >> 23) - 126; // compute exponent
|
|
ix |= 0x3F000000u; // insert exponent +1 in x
|
|
return floatbits(ix);
|
|
}
|
|
|
|
// Most of the transcendental implementations in ispc code here come from
|
|
// Solomon Boulos's "syrah": https://github.com/boulos/syrah/
|
|
|
|
static inline float sin(float x_full) {
|
|
if (__math_lib == __math_lib_svml) {
|
|
return __svml_sin(x_full);
|
|
}
|
|
else if (__math_lib == __math_lib_system) {
|
|
float ret;
|
|
uniform int mask = lanemask();
|
|
for (uniform int i = 0; i < programCount; ++i) {
|
|
if ((mask & (1 << i)) == 0)
|
|
continue;
|
|
uniform float r = __stdlib_sinf(extract(x_full, i));
|
|
ret = insert(ret, i, r);
|
|
}
|
|
return ret;
|
|
}
|
|
else if (__math_lib == __math_lib_ispc ||
|
|
__math_lib == __math_lib_ispc_fast) {
|
|
static const float pi_over_two_vec = 1.57079637050628662109375;
|
|
static const float two_over_pi_vec = 0.636619746685028076171875;
|
|
float scaled = x_full * two_over_pi_vec;
|
|
float k_real = floor(scaled);
|
|
int k = (int)k_real;
|
|
|
|
// Reduced range version of x
|
|
float x = x_full - k_real * pi_over_two_vec;
|
|
int k_mod4 = k & 3;
|
|
bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
|
|
bool flip_sign = (k_mod4 > 1);
|
|
|
|
// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
|
|
// 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
|
|
static const float sin_c2 = -0.16666667163372039794921875;
|
|
static const float sin_c4 = 8.333347737789154052734375e-3;
|
|
static const float sin_c6 = -1.9842604524455964565277099609375e-4;
|
|
static const float sin_c8 = 2.760012648650445044040679931640625e-6;
|
|
static const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
|
|
|
|
static const float cos_c2 = -0.5;
|
|
static const float cos_c4 = 4.166664183139801025390625e-2;
|
|
static const float cos_c6 = -1.388833043165504932403564453125e-3;
|
|
static const float cos_c8 = 2.47562347794882953166961669921875e-5;
|
|
static const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
|
|
|
|
float outside = sin_usecos ? 1 : x;
|
|
float c2 = sin_usecos ? cos_c2 : sin_c2;
|
|
float c4 = sin_usecos ? cos_c4 : sin_c4;
|
|
float c6 = sin_usecos ? cos_c6 : sin_c6;
|
|
float c8 = sin_usecos ? cos_c8 : sin_c8;
|
|
float c10 = sin_usecos ? cos_c10 : sin_c10;
|
|
|
|
float x2 = x * x;
|
|
float formula = x2 * c10 + c8;
|
|
formula = x2 * formula + c6;
|
|
formula = x2 * formula + c4;
|
|
formula = x2 * formula + c2;
|
|
formula = x2 * formula + 1;
|
|
formula *= outside;
|
|
|
|
formula = flip_sign ? -formula : formula;
|
|
return formula;
|
|
}
|
|
}
|
|
|
|
|
|
static inline uniform float sin(uniform float x_full) {
|
|
if (__math_lib == __math_lib_system ||
|
|
__math_lib == __math_lib_svml) {
|
|
return __stdlib_sinf(x_full);
|
|
}
|
|
else if (__math_lib == __math_lib_ispc ||
|
|
__math_lib == __math_lib_ispc_fast) {
|
|
static const uniform float pi_over_two_vec = 1.57079637050628662109375;
|
|
static const uniform float two_over_pi_vec = 0.636619746685028076171875;
|
|
uniform float scaled = x_full * two_over_pi_vec;
|
|
uniform float k_real = floor(scaled);
|
|
uniform int k = (int)k_real;
|
|
|
|
// Reduced range version of x
|
|
uniform float x = x_full - k_real * pi_over_two_vec;
|
|
uniform int k_mod4 = k & 3;
|
|
uniform bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
|
|
uniform bool flip_sign = (k_mod4 > 1);
|
|
|
|
// These coefficients are from sollya with fpminimax(sin(x)/x, [|0, 2,
|
|
// 4, 6, 8, 10|], [|single...|], [0;Pi/2]);
|
|
static const uniform float sin_c2 = -0.16666667163372039794921875;
|
|
static const uniform float sin_c4 = 8.333347737789154052734375e-3;
|
|
static const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
|
|
static const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
|
|
static const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
|
|
|
|
static const uniform float cos_c2 = -0.5;
|
|
static const uniform float cos_c4 = 4.166664183139801025390625e-2;
|
|
static const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
|
|
static const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
|
|
static const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
|
|
|
|
uniform float outside, c2, c4, c6, c8, c10;
|
|
if (sin_usecos) {
|
|
outside = 1.;
|
|
c2 = cos_c2;
|
|
c4 = cos_c4;
|
|
c6 = cos_c6;
|
|
c8 = cos_c8;
|
|
c10 = cos_c10;
|
|
}
|
|
else {
|
|
outside = x;
|
|
c2 = sin_c2;
|
|
c4 = sin_c4;
|
|
c6 = sin_c6;
|
|
c8 = sin_c8;
|
|
c10 = sin_c10;
|
|
}
|
|
|
|
uniform float x2 = x * x;
|
|
uniform float formula = x2 * c10 + c8;
|
|
formula = x2 * formula + c6;
|
|
formula = x2 * formula + c4;
|
|
formula = x2 * formula + c2;
|
|
formula = x2 * formula + 1.;
|
|
formula *= outside;
|
|
|
|
formula = flip_sign ? -formula : formula;
|
|
return formula;
|
|
}
|
|
}
|
|
|
|
|
|
static inline float cos(float x_full) {
|
|
if (__math_lib == __math_lib_svml) {
|
|
return __svml_cos(x_full);
|
|
}
|
|
else if (__math_lib == __math_lib_system) {
|
|
float ret;
|
|
uniform int mask = lanemask();
|
|
for (uniform int i = 0; i < programCount; ++i) {
|
|
if ((mask & (1 << i)) == 0)
|
|
continue;
|
|
uniform float r = __stdlib_cosf(extract(x_full, i));
|
|
ret = insert(ret, i, r);
|
|
}
|
|
return ret;
|
|
}
|
|
else if (__math_lib == __math_lib_ispc ||
|
|
__math_lib == __math_lib_ispc_fast) {
|
|
static const float pi_over_two_vec = 1.57079637050628662109375;
|
|
static const float two_over_pi_vec = 0.636619746685028076171875;
|
|
float scaled = x_full * two_over_pi_vec;
|
|
float k_real = floor(scaled);
|
|
int k = (int)k_real;
|
|
|
|
// Reduced range version of x
|
|
float x = x_full - k_real * pi_over_two_vec;
|
|
|
|
int k_mod4 = k & 3;
|
|
bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
|
|
bool flip_sign = (k_mod4 == 1 || k_mod4 == 2);
|
|
|
|
const float sin_c2 = -0.16666667163372039794921875;
|
|
const float sin_c4 = 8.333347737789154052734375e-3;
|
|
const float sin_c6 = -1.9842604524455964565277099609375e-4;
|
|
const float sin_c8 = 2.760012648650445044040679931640625e-6;
|
|
const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
|
|
|
|
const float cos_c2 = -0.5;
|
|
const float cos_c4 = 4.166664183139801025390625e-2;
|
|
const float cos_c6 = -1.388833043165504932403564453125e-3;
|
|
const float cos_c8 = 2.47562347794882953166961669921875e-5;
|
|
const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
|
|
|
|
float outside = cos_usecos ? 1. : x;
|
|
float c2 = cos_usecos ? cos_c2 : sin_c2;
|
|
float c4 = cos_usecos ? cos_c4 : sin_c4;
|
|
float c6 = cos_usecos ? cos_c6 : sin_c6;
|
|
float c8 = cos_usecos ? cos_c8 : sin_c8;
|
|
float c10 = cos_usecos ? cos_c10 : sin_c10;
|
|
|
|
float x2 = x * x;
|
|
float formula = x2 * c10 + c8;
|
|
formula = x2 * formula + c6;
|
|
formula = x2 * formula + c4;
|
|
formula = x2 * formula + c2;
|
|
formula = x2 * formula + 1.;
|
|
formula *= outside;
|
|
|
|
formula = flip_sign ? -formula : formula;
|
|
return formula;
|
|
}
|
|
}
|
|
|
|
|
|
static inline uniform float cos(uniform float x_full) {
|
|
if (__math_lib == __math_lib_system ||
|
|
__math_lib == __math_lib_svml) {
|
|
return __stdlib_cosf(x_full);
|
|
}
|
|
else if (__math_lib == __math_lib_ispc ||
|
|
__math_lib == __math_lib_ispc_fast) {
|
|
static const uniform float pi_over_two_vec = 1.57079637050628662109375;
|
|
static const uniform float two_over_pi_vec = 0.636619746685028076171875;
|
|
uniform float scaled = x_full * two_over_pi_vec;
|
|
uniform float k_real = floor(scaled);
|
|
uniform int k = (int)k_real;
|
|
|
|
// Reduced range version of x
|
|
uniform float x = x_full - k_real * pi_over_two_vec;
|
|
|
|
uniform int k_mod4 = k & 3;
|
|
uniform bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
|
|
uniform bool flip_sign = (k_mod4 == 1 || k_mod4 == 2);
|
|
|
|
const uniform float sin_c2 = -0.16666667163372039794921875;
|
|
const uniform float sin_c4 = 8.333347737789154052734375e-3;
|
|
const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
|
|
const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
|
|
const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
|
|
|
|
const uniform float cos_c2 = -0.5;
|
|
const uniform float cos_c4 = 4.166664183139801025390625e-2;
|
|
const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
|
|
const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
|
|
const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
|
|
|
|
uniform float outside, c2, c4, c6, c8, c10;
|
|
if (cos_usecos) {
|
|
outside = 1.;
|
|
c2 = cos_c2;
|
|
c4 = cos_c4;
|
|
c6 = cos_c6;
|
|
c8 = cos_c8;
|
|
c10 = cos_c10;
|
|
}
|
|
else {
|
|
outside = x;
|
|
c2 = sin_c2;
|
|
c4 = sin_c4;
|
|
c6 = sin_c6;
|
|
c8 = sin_c8;
|
|
c10 = sin_c10;
|
|
}
|
|
|
|
uniform float x2 = x * x;
|
|
uniform float formula = x2 * c10 + c8;
|
|
formula = x2 * formula + c6;
|
|
formula = x2 * formula + c4;
|
|
formula = x2 * formula + c2;
|
|
formula = x2 * formula + 1.;
|
|
formula *= outside;
|
|
|
|
formula = flip_sign ? -formula : formula;
|
|
return formula;
|
|
}
|
|
}
|
|
|
|
|
|
static inline void sincos(float x_full, float * uniform sin_result,
|
|
float * uniform cos_result) {
|
|
if (__math_lib == __math_lib_svml) {
|
|
__svml_sincos(x_full, sin_result, cos_result);
|
|
}
|
|
else if (__math_lib == __math_lib_system) {
|
|
uniform int mask = lanemask();
|
|
for (uniform int i = 0; i < programCount; ++i) {
|
|
if ((mask & (1 << i)) == 0)
|
|
continue;
|
|
uniform float s, c;
|
|
__stdlib_sincosf(extract(x_full, i), &s, &c);
|
|
*sin_result = insert(*sin_result, i, s);
|
|
*cos_result = insert(*cos_result, i, c);
|
|
}
|
|
}
|
|
else if (__math_lib == __math_lib_ispc ||
|
|
__math_lib == __math_lib_ispc_fast) {
|
|
const float pi_over_two_vec = 1.57079637050628662109375;
|
|
const float two_over_pi_vec = 0.636619746685028076171875;
|
|
float scaled = x_full * two_over_pi_vec;
|
|
float k_real = floor(scaled);
|
|
int k = (int)k_real;
|
|
|
|
// Reduced range version of x
|
|
float x = x_full - k_real * pi_over_two_vec;
|
|
int k_mod4 = k & 3;
|
|
bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
|
|
bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
|
|
bool sin_flipsign = (k_mod4 > 1);
|
|
bool cos_flipsign = (k_mod4 == 1 || k_mod4 == 2);
|
|
|
|
const float one_vec = 1.;
|
|
const float sin_c2 = -0.16666667163372039794921875;
|
|
const float sin_c4 = 8.333347737789154052734375e-3;
|
|
const float sin_c6 = -1.9842604524455964565277099609375e-4;
|
|
const float sin_c8 = 2.760012648650445044040679931640625e-6;
|
|
const float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
|
|
|
|
const float cos_c2 = -0.5;
|
|
const float cos_c4 = 4.166664183139801025390625e-2;
|
|
const float cos_c6 = -1.388833043165504932403564453125e-3;
|
|
const float cos_c8 = 2.47562347794882953166961669921875e-5;
|
|
const float cos_c10 = -2.59630184018533327616751194000244140625e-7;
|
|
|
|
float x2 = x * x;
|
|
|
|
float sin_formula = x2 * sin_c10 + sin_c8;
|
|
float cos_formula = x2 * cos_c10 + cos_c8;
|
|
sin_formula = x2 * sin_formula + sin_c6;
|
|
cos_formula = x2 * cos_formula + cos_c6;
|
|
|
|
sin_formula = x2 * sin_formula + sin_c4;
|
|
cos_formula = x2 * cos_formula + cos_c4;
|
|
|
|
sin_formula = x2 * sin_formula + sin_c2;
|
|
cos_formula = x2 * cos_formula + cos_c2;
|
|
|
|
sin_formula = x2 * sin_formula + one_vec;
|
|
cos_formula = x2 * cos_formula + one_vec;
|
|
|
|
sin_formula *= x;
|
|
|
|
*sin_result = sin_usecos ? cos_formula : sin_formula;
|
|
*cos_result = cos_usecos ? cos_formula : sin_formula;
|
|
|
|
*sin_result = sin_flipsign ? -*sin_result : *sin_result;
|
|
*cos_result = cos_flipsign ? -*cos_result : *cos_result;
|
|
}
|
|
}
|
|
|
|
|
|
static inline void sincos(uniform float x_full, uniform float * uniform sin_result,
|
|
uniform float * uniform cos_result) {
|
|
if (__math_lib == __math_lib_system ||
|
|
__math_lib == __math_lib_svml) {
|
|
__stdlib_sincosf(x_full, sin_result, cos_result);
|
|
}
|
|
else if (__math_lib == __math_lib_ispc ||
|
|
__math_lib == __math_lib_ispc_fast) {
|
|
const uniform float pi_over_two_vec = 1.57079637050628662109375;
|
|
const uniform float two_over_pi_vec = 0.636619746685028076171875;
|
|
uniform float scaled = x_full * two_over_pi_vec;
|
|
uniform float k_real = floor(scaled);
|
|
uniform int k = (uniform int)k_real;
|
|
|
|
// Reduced range version of x
|
|
uniform float x = x_full - k_real * pi_over_two_vec;
|
|
uniform int k_mod4 = k & 3;
|
|
uniform bool cos_usecos = (k_mod4 == 0 || k_mod4 == 2);
|
|
uniform bool sin_usecos = (k_mod4 == 1 || k_mod4 == 3);
|
|
uniform bool sin_flipsign = (k_mod4 > 1);
|
|
uniform bool cos_flipsign = (k_mod4 == 1 || k_mod4 == 2);
|
|
|
|
const uniform float one_vec = 1.;
|
|
const uniform float sin_c2 = -0.16666667163372039794921875;
|
|
const uniform float sin_c4 = 8.333347737789154052734375e-3;
|
|
const uniform float sin_c6 = -1.9842604524455964565277099609375e-4;
|
|
const uniform float sin_c8 = 2.760012648650445044040679931640625e-6;
|
|
const uniform float sin_c10 = -2.50293279435709337121807038784027099609375e-8;
|
|
|
|
const uniform float cos_c2 = -0.5;
|
|
const uniform float cos_c4 = 4.166664183139801025390625e-2;
|
|
const uniform float cos_c6 = -1.388833043165504932403564453125e-3;
|
|
const uniform float cos_c8 = 2.47562347794882953166961669921875e-5;
|
|
const uniform float cos_c10 = -2.59630184018533327616751194000244140625e-7;
|
|
|
|
uniform float x2 = x * x;
|
|
|
|
uniform float sin_formula = x2 * sin_c10 + sin_c8;
|
|
uniform float cos_formula = x2 * cos_c10 + cos_c8;
|
|
sin_formula = x2 * sin_formula + sin_c6;
|
|
cos_formula = x2 * cos_formula + cos_c6;
|
|
|
|
sin_formula = x2 * sin_formula + sin_c4;
|
|
cos_formula = x2 * cos_formula + cos_c4;
|
|
|
|
sin_formula = x2 * sin_formula + sin_c2;
|
|
cos_formula = x2 * cos_formula + cos_c2;
|
|
|
|
sin_formula = x2 * sin_formula + one_vec;
|
|
cos_formula = x2 * cos_formula + one_vec;
|
|
|
|
sin_formula *= x;
|
|
|
|
*sin_result = sin_usecos ? cos_formula : sin_formula;
|
|
*cos_result = cos_usecos ? cos_formula : sin_formula;
|
|
|
|
*sin_result = sin_flipsign ? -*sin_result : *sin_result;
|
|
*cos_result = cos_flipsign ? -*cos_result : *cos_result;
|
|
}
|
|
}

static inline float tan(float x_full) {
    if (__math_lib == __math_lib_svml) {
        return __svml_tan(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float r = __stdlib_tanf(extract(x_full, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        const float pi_over_four_vec = 0.785398185253143310546875;
        const float four_over_pi_vec = 1.27323949337005615234375;

        bool x_lt_0 = x_full < 0.;
        float y = x_lt_0 ? -x_full : x_full;
        float scaled = y * four_over_pi_vec;

        float k_real = floor(scaled);
        int k = (int)k_real;

        float x = y - k_real * pi_over_four_vec;

        // if k & 1, x -= Pi/4
        bool need_offset = (k & 1) != 0;
        x = need_offset ? x - pi_over_four_vec : x;

        // if (k & 3) is 0 or 3, evaluate the tan polynomial on the reduced x;
        // otherwise evaluate -cot on the reduced x.
        int k_mod4 = k & 3;
        bool use_cotan = (k_mod4 == 1) || (k_mod4 == 2);

        const float one_vec = 1.0;

        const float tan_c2 = 0.33333075046539306640625;
        const float tan_c4 = 0.13339905440807342529296875;
        const float tan_c6 = 5.3348250687122344970703125e-2;
        const float tan_c8 = 2.46033705770969390869140625e-2;
        const float tan_c10 = 2.892402000725269317626953125e-3;
        const float tan_c12 = 9.500005282461643218994140625e-3;

        const float cot_c2 = -0.3333333432674407958984375;
        const float cot_c4 = -2.222204394638538360595703125e-2;
        const float cot_c6 = -2.11752182804048061370849609375e-3;
        const float cot_c8 = -2.0846328698098659515380859375e-4;
        const float cot_c10 = -2.548247357481159269809722900390625e-5;
        const float cot_c12 = -3.5257363606433500535786151885986328125e-7;

        float x2 = x * x;
        float z;
        cif (use_cotan) {
            float cot_val = x2 * cot_c12 + cot_c10;
            cot_val = x2 * cot_val + cot_c8;
            cot_val = x2 * cot_val + cot_c6;
            cot_val = x2 * cot_val + cot_c4;
            cot_val = x2 * cot_val + cot_c2;
            cot_val = x2 * cot_val + one_vec;
            // The polynomial approximates x * cot(x); dividing by -x gives
            // the -cot(x) value needed here.
            cot_val /= -x;
            z = cot_val;
        } else {
            float tan_val = x2 * tan_c12 + tan_c10;
            tan_val = x2 * tan_val + tan_c8;
            tan_val = x2 * tan_val + tan_c6;
            tan_val = x2 * tan_val + tan_c4;
            tan_val = x2 * tan_val + tan_c2;
            tan_val = x2 * tan_val + one_vec;
            // The polynomial approximates tan(x)/x
            tan_val *= x;
            z = tan_val;
        }
        return x_lt_0 ? -z : z;
    }
}

static inline uniform float tan(uniform float x_full) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_tanf(x_full);
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        const uniform float pi_over_four_vec = 0.785398185253143310546875;
        const uniform float four_over_pi_vec = 1.27323949337005615234375;

        uniform bool x_lt_0 = x_full < 0.;
        uniform float y = x_lt_0 ? -x_full : x_full;
        uniform float scaled = y * four_over_pi_vec;

        uniform float k_real = floor(scaled);
        uniform int k = (int)k_real;

        uniform float x = y - k_real * pi_over_four_vec;

        // if k & 1, x -= Pi/4
        uniform bool need_offset = (k & 1) != 0;
        x = need_offset ? x - pi_over_four_vec : x;

        // if (k & 3) is 0 or 3, evaluate the tan polynomial on the reduced x;
        // otherwise evaluate -cot on the reduced x.
        uniform int k_mod4 = k & 3;
        uniform bool use_cotan = (k_mod4 == 1) || (k_mod4 == 2);

        const uniform float one_vec = 1.0;

        const uniform float tan_c2 = 0.33333075046539306640625;
        const uniform float tan_c4 = 0.13339905440807342529296875;
        const uniform float tan_c6 = 5.3348250687122344970703125e-2;
        const uniform float tan_c8 = 2.46033705770969390869140625e-2;
        const uniform float tan_c10 = 2.892402000725269317626953125e-3;
        const uniform float tan_c12 = 9.500005282461643218994140625e-3;

        const uniform float cot_c2 = -0.3333333432674407958984375;
        const uniform float cot_c4 = -2.222204394638538360595703125e-2;
        const uniform float cot_c6 = -2.11752182804048061370849609375e-3;
        const uniform float cot_c8 = -2.0846328698098659515380859375e-4;
        const uniform float cot_c10 = -2.548247357481159269809722900390625e-5;
        const uniform float cot_c12 = -3.5257363606433500535786151885986328125e-7;

        uniform float x2 = x * x;
        uniform float z;
        if (use_cotan) {
            uniform float cot_val = x2 * cot_c12 + cot_c10;
            cot_val = x2 * cot_val + cot_c8;
            cot_val = x2 * cot_val + cot_c6;
            cot_val = x2 * cot_val + cot_c4;
            cot_val = x2 * cot_val + cot_c2;
            cot_val = x2 * cot_val + one_vec;
            // The polynomial approximates x * cot(x); dividing by -x gives
            // the -cot(x) value needed here.
            cot_val /= -x;
            z = cot_val;
        } else {
            uniform float tan_val = x2 * tan_c12 + tan_c10;
            tan_val = x2 * tan_val + tan_c8;
            tan_val = x2 * tan_val + tan_c6;
            tan_val = x2 * tan_val + tan_c4;
            tan_val = x2 * tan_val + tan_c2;
            tan_val = x2 * tan_val + one_vec;
            // The polynomial approximates tan(x)/x
            tan_val *= x;
            z = tan_val;
        }
        return x_lt_0 ? -z : z;
    }
}
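
// Worked example of the octant reduction above (illustrative, approximate
// values): for x_full = 2.0, scaled = 2.0 * 4/Pi ~= 2.55, so k = 2 and the
// reduced x = 2.0 - 2 * Pi/4 ~= 0.429.  (k & 1) == 0, so no extra Pi/4 offset
// is applied, and (k & 3) == 2 selects the cotangent branch, giving
// z = -cot(0.429) ~= -2.19, which matches tan(2.0).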

static inline float atan(float x_full) {
    if (__math_lib == __math_lib_svml) {
        return __svml_atan(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float r = __stdlib_atanf(extract(x_full, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        const float pi_over_two_vec = 1.57079637050628662109375;
        // atan(-x) = -atan(x) (so flip from negative to positive first)
        // if x > 1 -> atan(x) = Pi/2 - atan(1/x)
        bool x_neg = x_full < 0;
        float x_flipped = x_neg ? -x_full : x_full;

        bool x_gt_1 = x_flipped > 1.;
        float x = x_gt_1 ? 1./x_flipped : x_flipped;

        // These coefficients approximate atan(x)/x
        const float atan_c0 = 0.99999988079071044921875;
        const float atan_c2 = -0.3333191573619842529296875;
        const float atan_c4 = 0.199689209461212158203125;
        const float atan_c6 = -0.14015688002109527587890625;
        const float atan_c8 = 9.905083477497100830078125e-2;
        const float atan_c10 = -5.93664981424808502197265625e-2;
        const float atan_c12 = 2.417283318936824798583984375e-2;
        const float atan_c14 = -4.6721356920897960662841796875e-3;

        float x2 = x * x;
        float result = x2 * atan_c14 + atan_c12;
        result = x2 * result + atan_c10;
        result = x2 * result + atan_c8;
        result = x2 * result + atan_c6;
        result = x2 * result + atan_c4;
        result = x2 * result + atan_c2;
        result = x2 * result + atan_c0;
        result *= x;

        result = x_gt_1 ? pi_over_two_vec - result : result;
        result = x_neg ? -result : result;
        return result;
    }
}

static inline uniform float atan(uniform float x_full) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_atanf(x_full);
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        const uniform float pi_over_two_vec = 1.57079637050628662109375;
        // atan(-x) = -atan(x) (so flip from negative to positive first)
        // if x > 1 -> atan(x) = Pi/2 - atan(1/x)
        uniform bool x_neg = x_full < 0;
        uniform float x_flipped = x_neg ? -x_full : x_full;

        uniform bool x_gt_1 = x_flipped > 1.;
        uniform float x = x_gt_1 ? 1./x_flipped : x_flipped;

        // These coefficients approximate atan(x)/x
        const uniform float atan_c0 = 0.99999988079071044921875;
        const uniform float atan_c2 = -0.3333191573619842529296875;
        const uniform float atan_c4 = 0.199689209461212158203125;
        const uniform float atan_c6 = -0.14015688002109527587890625;
        const uniform float atan_c8 = 9.905083477497100830078125e-2;
        const uniform float atan_c10 = -5.93664981424808502197265625e-2;
        const uniform float atan_c12 = 2.417283318936824798583984375e-2;
        const uniform float atan_c14 = -4.6721356920897960662841796875e-3;

        uniform float x2 = x * x;
        uniform float result = x2 * atan_c14 + atan_c12;
        result = x2 * result + atan_c10;
        result = x2 * result + atan_c8;
        result = x2 * result + atan_c6;
        result = x2 * result + atan_c4;
        result = x2 * result + atan_c2;
        result = x2 * result + atan_c0;
        result *= x;

        result = x_gt_1 ? pi_over_two_vec - result : result;
        result = x_neg ? -result : result;
        return result;
    }
}
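
// Worked example of the identity used above (illustrative): arguments are
// folded into [0, 1], where the polynomial is accurate, via
// atan(x) = Pi/2 - atan(1/x) for x > 1.  For x_full = 3.0:
//     atan(1.0/3.0) ~= 0.3217   =>   atan(3.0) ~= 1.5708 - 0.3217 = 1.2490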

static inline float atan2(float y, float x) {
    if (__math_lib == __math_lib_svml) {
        return __svml_atan2(y, x);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        const float pi_vec = 3.1415926536;
        const float pi_over_two_vec = 1.5707963267;
        // atan2(y, x) =
        //
        // atan2(y > 0, x = +-0) -> Pi/2
        // atan2(y < 0, x = +-0) -> -Pi/2
        // atan2(y = +-0, x < +0) -> +-Pi
        // atan2(y = +-0, x >= +0) -> +-0
        //
        // atan2(y >= 0, x < 0) -> Pi + atan(y/x)
        // atan2(y < 0, x < 0) -> -Pi + atan(y/x)
        // atan2(y, x > 0) -> atan(y/x)
        //
        // and then a bunch of code for dealing with infinities.
        float y_over_x = y/x;
        float atan_arg = atan(y_over_x);
        bool x_lt_0 = x < 0;
        bool y_lt_0 = y < 0;
        float offset = x_lt_0 ? (y_lt_0 ? -pi_vec : pi_vec) : 0;
        return offset + atan_arg;
    }
}

static inline uniform float atan2(uniform float y, uniform float x) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_atan2f(y, x);
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        const uniform float pi_vec = 3.1415927410125732421875;
        const uniform float pi_over_two_vec = 1.57079637050628662109375;

        uniform float y_over_x = y/x;
        uniform float atan_arg = atan(y_over_x);
        uniform bool x_lt_0 = x < 0;
        uniform bool y_lt_0 = y < 0;
        uniform float offset = x_lt_0 ? (y_lt_0 ? -pi_vec : pi_vec) : 0;
        return offset + atan_arg;
    }
}
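
// Quadrant handling above in a nutshell (illustrative, approximate values):
// atan(y/x) alone only covers quadrants I and IV, so +-Pi is added whenever
// x < 0.  For example atan2(1., -1.): atan(-1.) = -Pi/4, and since x < 0 with
// y >= 0 the +Pi offset gives 3*Pi/4 ~= 2.356, as expected.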

static inline float exp(float x_full) {
    if (__math_lib == __math_lib_svml) {
        return __svml_exp(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float r = __stdlib_expf(extract(x_full, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        float z = floor(1.44269504088896341f * x_full + 0.5f);
        int n;
        x_full -= z * 0.693359375f;
        x_full -= z * -2.12194440e-4f;
        n = (int)z;

        z = x_full * x_full;
        z = (((((1.9875691500E-4f * x_full + 1.3981999507E-3f) * x_full +
                8.3334519073E-3f) * x_full + 4.1665795894E-2f) * x_full +
              1.6666665459E-1f) * x_full + 5.0000001201E-1f) * z + x_full + 1.f;
        x_full = ldexp(z, n);
        return x_full;
    }
    else if (__math_lib == __math_lib_ispc) {
        const float ln2_part1 = 0.6931457519;
        const float ln2_part2 = 1.4286067653e-6;
        const float one_over_ln2 = 1.44269502162933349609375;

        float scaled = x_full * one_over_ln2;
        float k_real = floor(scaled);
        int k = (int)k_real;

        // Reduced range version of x
        float x = x_full - k_real * ln2_part1;
        x -= k_real * ln2_part2;

        // These coefficients are for e^x in [0, ln(2)]
        const float one = 1.;
        const float c2 = 0.4999999105930328369140625;
        const float c3 = 0.166668415069580078125;
        const float c4 = 4.16539050638675689697265625e-2;
        const float c5 = 8.378830738365650177001953125e-3;
        const float c6 = 1.304379315115511417388916015625e-3;
        const float c7 = 2.7555381529964506626129150390625e-4;

        float result = x * c7 + c6;
        result = x * result + c5;
        result = x * result + c4;
        result = x * result + c3;
        result = x * result + c2;
        result = x * result + one;
        result = x * result + one;

        // Compute 2^k (should differ for float and double, but I'll avoid
        // it for now and just do floats)
        const int fpbias = 127;
        int biased_n = k + fpbias;
        bool overflow = k > fpbias;
        // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
        // we've got underflow. -127 * ln(2) -> -88.02. So the most
        // negative float input that doesn't result in zero is like -88.
        bool underflow = (biased_n <= 0);
        const int InfBits = 0x7f800000;
        biased_n <<= 23;
        // Reinterpret this thing as float
        float two_to_the_n = floatbits(biased_n);
        // Handle both doubles and floats (hopefully eliding the copy for float)
        float elemtype_2n = two_to_the_n;
        result *= elemtype_2n;
        result = overflow ? floatbits(InfBits) : result;
        result = underflow ? 0. : result;
        return result;
    }
}

static inline uniform float exp(uniform float x_full) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_expf(x_full);
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        uniform float z = floor(1.44269504088896341f * x_full + 0.5f);
        uniform int n;
        x_full -= z * 0.693359375f;
        x_full -= z * -2.12194440e-4f;
        n = (int)z;

        z = x_full * x_full;
        z = (((((1.9875691500E-4f * x_full + 1.3981999507E-3f) * x_full +
                8.3334519073E-3f) * x_full + 4.1665795894E-2f) * x_full +
              1.6666665459E-1f) * x_full + 5.0000001201E-1f) * z + x_full + 1.f;
        x_full = ldexp(z, n);
        return x_full;
    }
    else if (__math_lib == __math_lib_ispc) {
        const uniform float ln2_part1 = 0.6931457519;
        const uniform float ln2_part2 = 1.4286067653e-6;
        const uniform float one_over_ln2 = 1.44269502162933349609375;

        uniform float scaled = x_full * one_over_ln2;
        uniform float k_real = floor(scaled);
        uniform int k = (uniform int)k_real;

        // Reduced range version of x
        uniform float x = x_full - k_real * ln2_part1;
        x -= k_real * ln2_part2;

        // These coefficients are for e^x in [0, ln(2)]
        const uniform float one = 1.;
        const uniform float c2 = 0.4999999105930328369140625;
        const uniform float c3 = 0.166668415069580078125;
        const uniform float c4 = 4.16539050638675689697265625e-2;
        const uniform float c5 = 8.378830738365650177001953125e-3;
        const uniform float c6 = 1.304379315115511417388916015625e-3;
        const uniform float c7 = 2.7555381529964506626129150390625e-4;

        uniform float result = x * c7 + c6;
        result = x * result + c5;
        result = x * result + c4;
        result = x * result + c3;
        result = x * result + c2;
        result = x * result + one;
        result = x * result + one;

        // Compute 2^k (should differ for float and double, but I'll avoid
        // it for now and just do floats)
        const uniform int fpbias = 127;
        uniform int biased_n = k + fpbias;
        uniform bool overflow = k > fpbias;
        // Minimum exponent is -126, so if k is <= -127 (k + 127 <= 0)
        // we've got underflow. -127 * ln(2) -> -88.02. So the most
        // negative float input that doesn't result in zero is like -88.
        uniform bool underflow = (biased_n <= 0);
        const uniform int InfBits = 0x7f800000;
        biased_n <<= 23;
        // Reinterpret this thing as a float
        uniform float two_to_the_n = floatbits(biased_n);
        // Handle both doubles and floats (hopefully eliding the copy for float)
        uniform float elemtype_2n = two_to_the_n;
        result *= elemtype_2n;
        result = overflow ? floatbits(InfBits) : result;
        result = underflow ? 0. : result;
        return result;
    }
}
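
// How the 2^k step above works (illustrative): for in-range k, writing the
// biased exponent directly into the exponent field of an IEEE single gives
// exactly 2^k, e.g. for k = 3:
//     floatbits((3 + 127) << 23)  ==  floatbits(0x41000000)  ==  8.0
// The overflow/underflow selects catch the k values for which this bit trick
// would fall outside the representable exponent range.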

// Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
// * log(2) + log(y) where y is the reduced range (usually in [1/2,
// 1)).
static inline void __range_reduce_log(float input, float * uniform reduced,
                                      int * uniform exponent) {
    int int_version = intbits(input);
    // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
    // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
    //                    0x7  0xF  0x8  0x0  0x0  0x0  0x0  0x0
    // non-exponent     = 1000 0000 0111 1111 1111 1111 1111 1111
    //                  = 0x8  0x0  0x7  0xF  0xF  0xF  0xF  0xF

    //const int exponent_mask(0x7F800000)
    static const int nonexponent_mask = 0x807FFFFF;

    // We want the reduced version to have an exponent of -1 which is -1 + 127 after biasing or 126
    static const int exponent_neg1 = (126 << 23);
    // NOTE(boulos): We don't need to mask anything out since we know
    // the sign bit has to be 0. If it's 1, we need to return infinity/nan
    // anyway (log(x), x = +-0 -> infinity, x < 0 -> NaN).
    int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128]

    int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
    *exponent = offset_exponent - 127; // get the real value

    // Blend the offset_exponent with the original input (do this in
    // int for now, until I decide if float can have & and |)
    int blended = (int_version & nonexponent_mask) | (exponent_neg1);
    *reduced = floatbits(blended);
}

static inline void __range_reduce_log(uniform float input, uniform float * uniform reduced,
                                      uniform int * uniform exponent) {
    uniform int int_version = intbits(input);
    static const uniform int nonexponent_mask = 0x807FFFFF;

    static const uniform int exponent_neg1 = (126 << 23);
    uniform int biased_exponent = int_version >> 23;
    uniform int offset_exponent = biased_exponent + 1;
    *exponent = offset_exponent - 127; // get the real value

    uniform int blended = (int_version & nonexponent_mask) | (exponent_neg1);
    *reduced = floatbits(blended);
}
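
// Worked example (illustrative): for input = 6.0 the stored exponent field is
// 129, so *exponent becomes 129 + 1 - 127 = 3, and splicing in exponent_neg1
// rescales the mantissa so that *reduced = 0.75.  That is, 6.0 = 2^3 * 0.75,
// and log() below reassembles the result as 3 * ln(2) + log(0.75).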

static inline float log(float x_full) {
    if (__math_lib == __math_lib_svml) {
        return __svml_log(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float r = __stdlib_logf(extract(x_full, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        int e;
        x_full = frexp(x_full, &e);

        int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
        e += x_smaller_SQRTHF;
        int ix_add = intbits(x_full);
        ix_add &= x_smaller_SQRTHF;
        x_full += floatbits(ix_add) - 1.f;

        float z = x_full * x_full;
        float y =
            ((((((((7.0376836292E-2f * x_full
                    + -1.1514610310E-1f) * x_full
                   + 1.1676998740E-1f) * x_full
                  + -1.2420140846E-1f) * x_full
                 + 1.4249322787E-1f) * x_full
                + -1.6668057665E-1f) * x_full
               + 2.0000714765E-1f) * x_full
              + -2.4999993993E-1f) * x_full
             + 3.3333331174E-1f) * x_full * z;

        float fe = (float)e;
        y += fe * -2.12194440e-4;
        y -= 0.5f * z;
        z = x_full + y;
        return z + 0.693359375 * fe;
    }
    else if (__math_lib == __math_lib_ispc) {
        float reduced;
        int exponent;

        const int NaN_bits = 0x7fc00000;
        const int Neg_Inf_bits = 0xFF800000;
        const float NaN = floatbits(NaN_bits);
        const float neg_inf = floatbits(Neg_Inf_bits);
        bool use_nan = x_full < 0.;
        bool use_inf = x_full == 0.;
        bool exceptional = use_nan || use_inf;
        const float one = 1.0;

        float patched = exceptional ? one : x_full;
        __range_reduce_log(patched, &reduced, &exponent);

        const float ln2 = 0.693147182464599609375;

        float x1 = one - reduced;
        const float c1 = 0.50000095367431640625;
        const float c2 = 0.33326041698455810546875;
        const float c3 = 0.2519190013408660888671875;
        const float c4 = 0.17541764676570892333984375;
        const float c5 = 0.3424419462680816650390625;
        const float c6 = -0.599632322788238525390625;
        const float c7 = +1.98442304134368896484375;
        const float c8 = -2.4899270534515380859375;
        const float c9 = +1.7491014003753662109375;

        float result = x1 * c9 + c8;
        result = x1 * result + c7;
        result = x1 * result + c6;
        result = x1 * result + c5;
        result = x1 * result + c4;
        result = x1 * result + c3;
        result = x1 * result + c2;
        result = x1 * result + c1;
        result = x1 * result + one;

        // Equation was for -(ln(red)/(1-red))
        result *= -x1;
        result += (float)(exponent) * ln2;

        return exceptional ? (use_nan ? NaN : neg_inf) : result;
    }
}

static inline uniform float log(uniform float x_full) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_logf(x_full);
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        uniform int e;
        x_full = frexp(x_full, &e);

        uniform int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
        e += x_smaller_SQRTHF;
        uniform int ix_add = intbits(x_full);
        ix_add &= x_smaller_SQRTHF;
        x_full += floatbits(ix_add) - 1.f;

        uniform float z = x_full * x_full;
        uniform float y =
            ((((((((7.0376836292E-2f * x_full
                    + -1.1514610310E-1f) * x_full
                   + 1.1676998740E-1f) * x_full
                  + -1.2420140846E-1f) * x_full
                 + 1.4249322787E-1f) * x_full
                + -1.6668057665E-1f) * x_full
               + 2.0000714765E-1f) * x_full
              + -2.4999993993E-1f) * x_full
             + 3.3333331174E-1f) * x_full * z;

        uniform float fe = (uniform float)e;
        y += fe * -2.12194440e-4;
        y -= 0.5f * z;
        z = x_full + y;
        return z + 0.693359375 * fe;
    }
    else if (__math_lib == __math_lib_ispc) {
        uniform float reduced;
        uniform int exponent;

        const uniform int NaN_bits = 0x7fc00000;
        const uniform int Neg_Inf_bits = 0xFF800000;
        const uniform float NaN = floatbits(NaN_bits);
        const uniform float neg_inf = floatbits(Neg_Inf_bits);
        uniform bool use_nan = x_full < 0.;
        uniform bool use_inf = x_full == 0.;
        uniform bool exceptional = use_nan || use_inf;
        const uniform float one = 1.0;

        uniform float patched = exceptional ? one : x_full;
        __range_reduce_log(patched, &reduced, &exponent);

        const uniform float ln2 = 0.693147182464599609375;

        uniform float x1 = one - reduced;
        const uniform float c1 = 0.50000095367431640625;
        const uniform float c2 = 0.33326041698455810546875;
        const uniform float c3 = 0.2519190013408660888671875;
        const uniform float c4 = 0.17541764676570892333984375;
        const uniform float c5 = 0.3424419462680816650390625;
        const uniform float c6 = -0.599632322788238525390625;
        const uniform float c7 = +1.98442304134368896484375;
        const uniform float c8 = -2.4899270534515380859375;
        const uniform float c9 = +1.7491014003753662109375;

        uniform float result = x1 * c9 + c8;
        result = x1 * result + c7;
        result = x1 * result + c6;
        result = x1 * result + c5;
        result = x1 * result + c4;
        result = x1 * result + c3;
        result = x1 * result + c2;
        result = x1 * result + c1;
        result = x1 * result + one;

        // Equation was for -(ln(red)/(1-red))
        result *= -x1;
        result += (uniform float)(exponent) * ln2;

        return exceptional ? (use_nan ? NaN : neg_inf) : result;
    }
}

static inline float pow(float a, float b) {
    if (__math_lib == __math_lib_svml) {
        return __svml_pow(a, b);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        return exp(b * log(a));
    }
}

static inline uniform float pow(uniform float a, uniform float b) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        return __stdlib_powf(a, b);
    }
    else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
        return exp(b * log(a));
    }
}
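
// Note on the exp(b * log(a)) formulation above (an observation, not a change
// in behavior): it only covers a > 0.  For a negative base, log(a) is not
// defined (the __math_lib_ispc path returns NaN), so e.g. pow(-2., 3.) does
// not produce -8; callers that need negative bases with integer exponents
// have to handle the sign themselves.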

///////////////////////////////////////////////////////////////////////////
// Transcendentals (double precision)

static inline double sqrt(double v) {
    return __sqrt_varying_double(v);
}

static inline uniform double sqrt(uniform double v) {
    return __sqrt_uniform_double(v);
}

static inline double ldexp(double x, int n) {
    unsigned int64 ex = 0x7ff0000000000000;
    unsigned int64 ix = intbits(x);
    ex &= ix;
    ix = ix & ~0x7ff0000000000000;  // clear exponent
    int64 n64 = ((int64)n << 52) + ex;
    ix |= n64;                      // insert new exponent
    return doublebits(ix);
}

static inline uniform double ldexp(uniform double x, uniform int n) {
    uniform unsigned int64 ex = 0x7ff0000000000000;
    uniform unsigned int64 ix = intbits(x);
    ex &= ix;
    ix = ix & ~0x7ff0000000000000;  // clear exponent
    uniform int64 n64 = ((int64)n << 52) + ex;
    ix |= n64;                      // insert new exponent
    return doublebits(ix);
}

static inline double frexp(double x, int * uniform pw2) {
    unsigned int64 ex = 0x7ff0000000000000;  // exponent mask
    unsigned int64 ix = intbits(x);
    ex &= ix;
    ix &= ~0x7ff0000000000000;      // clear exponent
    *pw2 = (int)(ex >> 52) - 1022;  // compute exponent
    ix |= 0x3fe0000000000000;       // insert exponent +1 in x
    return doublebits(ix);
}

static inline uniform double frexp(uniform double x, uniform int * uniform pw2) {
    uniform unsigned int64 ex = 0x7ff0000000000000;  // exponent mask
    uniform unsigned int64 ix = intbits(x);
    ex &= ix;
    ix &= ~0x7ff0000000000000;      // clear exponent
    *pw2 = (int)(ex >> 52) - 1022;  // compute exponent
    ix |= 0x3fe0000000000000;       // insert exponent +1 in x
    return doublebits(ix);
}
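
// Bit-level view of the two helpers above (illustrative): a double keeps its
// exponent in bits 52..62 with a bias of 1023, and frexp() normalizes to
// [0.5, 1), hence the 1022 constant.  For example:
//     uniform int e;
//     uniform double m = frexp(8.0d, &e);   // m = 0.5, e = 4  (8 = 0.5 * 2^4)
//     uniform double x = ldexp(0.5d, 4);    // x = 8.0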

static inline double sin(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return sin((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_sin(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
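
// The pattern used above (and in the other varying double routines below) is
// a scalarization loop: read the mask of active lanes, call the scalar
// __stdlib_* routine once per active program instance, and insert each result
// back into the varying return value.  In outline:
//     uniform int mask = lanemask();
//     for (uniform int i = 0; i < programCount; ++i) {
//         if ((mask & (1 << i)) == 0)
//             continue;                      // skip inactive lanes
//         ret = insert(ret, i, scalar_fn(extract(x, i)));
//     }
// where scalar_fn stands for whichever scalar entry point is being wrapped.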

static inline uniform double sin(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return sin((float)x);
    else
        return __stdlib_sin(x);
}

static inline double cos(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return cos((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_cos(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double cos(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return cos((float)x);
    else
        return __stdlib_cos(x);
}

static inline void sincos(double x, double * uniform sin_result,
                          double * uniform cos_result) {
    if (__math_lib == __math_lib_ispc_fast) {
        float sr, cr;
        sincos((float)x, &sr, &cr);
        *sin_result = sr;
        *cos_result = cr;
    }
    else {
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            uniform double sr, cr;
            if ((mask & (1 << i)) == 0)
                continue;
            __stdlib_sincos(extract(x, i), &sr, &cr);
            *sin_result = insert(*sin_result, i, sr);
            *cos_result = insert(*cos_result, i, cr);
        }
    }
}

static inline void sincos(uniform double x, uniform double * uniform sin_result,
                          uniform double * uniform cos_result) {
    if (__math_lib == __math_lib_ispc_fast) {
        uniform float sr, cr;
        sincos((uniform float)x, &sr, &cr);
        *sin_result = sr;
        *cos_result = cr;
    }
    else
        __stdlib_sincos(x, sin_result, cos_result);
}

static inline double tan(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return tan((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_tan(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double tan(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return tan((float)x);
    else
        return __stdlib_tan(x);
}

static inline double atan(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_atan(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double atan(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan((float)x);
    else
        return __stdlib_atan(x);
}

static inline double atan2(double y, double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan2((float)y, (float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double atan2(uniform double y, uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan2((float)y, (float)x);
    else
        return __stdlib_atan2(y, x);
}

static inline double exp(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return exp((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_exp(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double exp(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return exp((float)x);
    else
        return __stdlib_exp(x);
}

static inline double log(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return log((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_log(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double log(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return log((float)x);
    else
        return __stdlib_log(x);
}

static inline double pow(double a, double b) {
    if (__math_lib == __math_lib_ispc_fast)
        return pow((float)a, (float)b);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}

static inline uniform double pow(uniform double a, uniform double b) {
    if (__math_lib == __math_lib_ispc_fast)
        return pow((float)a, (float)b);
    else
        return __stdlib_pow(a, b);
}

///////////////////////////////////////////////////////////////////////////
// half-precision floats

static inline uniform float half_to_float(uniform unsigned int16 h) {
    if ((h & 0x7FFFu) == 0)
        // Signed zero
        return floatbits(((unsigned int32) h) << 16);
    else {
        // Though these are int16 quantities, we get much better code
        // with them stored as int32s...
        uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
        if (he == 0) {
            // Denormal will convert to normalized
            uniform int e = -1;
            // The following loop figures out how much extra to adjust the exponent
            // Shift until leading bit overflows into exponent bit
            do {
                e++;
                hm <<= 1;
            } while ((hm & 0x0400u) == 0);

            // Sign bit
            uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
            // Exponent: unbias the halfp, then bias the single
            uniform int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
            // Exponent
            uniform unsigned int32 xe = (unsigned int32) (xes << 23);
            // Mantissa
            uniform unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
            return floatbits(xs | xe | xm);
        }
        else {
            if (he == 0x7C00u) {
                // Inf or NaN (all the exponent bits are set)
                if (hm == 0)
                    // Zero mantissa -> signed inf
                    return floatbits((((unsigned int32) hs) << 16) |
                                     ((unsigned int32) 0x7F800000u));
                else
                    // NaN
                    return floatbits(0xFFC00000u);
            }
            else {
                // Normalized number
                // sign
                uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
                // Exponent: unbias the halfp, then bias the single
                uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
                // Exponent
                uniform unsigned int32 xe = (unsigned int32) (xes << 23);
                // Mantissa
                uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
                return floatbits(xs | xe | xm);
            }
        }
    }
}

static inline float half_to_float(unsigned int16 h) {
    if ((h & 0x7FFFu) == 0)
        // Signed zero
        return floatbits(((unsigned int32) h) << 16);
    else {
        // Though these are int16 quantities, we get much better code
        // with them stored as int32s...
        unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
        unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
        unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits
        cif (he == 0) {
            // Denormal will convert to normalized
            int e = -1;
            // The following loop figures out how much extra to adjust the exponent
            // Shift until leading bit overflows into exponent bit
            do {
                e++;
                hm <<= 1;
            } while ((hm & 0x0400u) == 0);

            // Sign bit
            unsigned int32 xs = ((unsigned int32) hs) << 16;
            // Exponent: unbias the halfp, then bias the single
            int32 xes = ((int32)(he >> 10)) - 15 + 127 - e;
            // Exponent
            unsigned int32 xe = (unsigned int32) (xes << 23);
            // Mantissa
            unsigned int32 xm = ((unsigned int32) (hm & 0x03FFu)) << 13;
            return floatbits(xs | xe | xm);
        }
        else {
            if (he == 0x7C00u) {
                // Inf or NaN (all the exponent bits are set)
                if (hm == 0)
                    // Zero mantissa -> signed inf
                    return floatbits((((unsigned int32) hs) << 16) |
                                     ((unsigned int32) 0x7F800000u));
                else
                    // NaN
                    return floatbits(0xFFC00000u);
            }
            else {
                // Normalized number
                // sign
                unsigned int32 xs = ((unsigned int32) hs) << 16;
                // Exponent: unbias the halfp, then bias the single
                int32 xes = ((int32) (he >> 10)) - 15 + 127;
                // Exponent
                unsigned int32 xe = (unsigned int32) (xes << 23);
                // Mantissa
                unsigned int32 xm = ((unsigned int32) hm) << 13;
                return floatbits(xs | xe | xm);
            }
        }
    }
}
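
// Format recap for the conversions here (illustrative): a half is 1 sign /
// 5 exponent (bias 15) / 10 mantissa bits, while a single is 1 / 8 (bias 127)
// / 23, so a normalized half converts by shifting the sign up 16 bits,
// rebiasing the exponent by -15 + 127, and shifting the mantissa up 13 bits.
// For example the half 0x3C00 (1.0) becomes 0x3F800000 (1.0f).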

static inline uniform int16 float_to_half(uniform float f) {
    uniform int32 x = intbits(f);
    // Store the return value in an int32 until the very end; this ends up
    // generating better code...
    uniform int32 ret;
    if ((x & 0x7FFFFFFFu) == 0)
        // Signed zero
        ret = (x >> 16);
    else {
        uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
        uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
        if (xe == 0) {
            // Denormal will underflow, return a signed zero
            ret = (xs >> 16);
        }
        else {
            if (xe == 0x7F800000u) {
                // Inf or NaN (all the exponent bits are set)
                if (xm == 0)
                    // Zero mantissa -> signed infinity
                    ret = ((xs >> 16) | 0x7C00u);
                else
                    // NaN, only 1st mantissa bit set
                    ret = 0xFE00u;
            }
            else {
                // Normalized number
                uniform unsigned int32 hs = (xs >> 16);  // Sign bit
                uniform unsigned int32 hm;
                // Exponent unbias the single, then bias the halfp
                uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
                if (hes >= 0x1F)
                    // Overflow: return signed infinity
                    ret = ((xs >> 16) | 0x7C00u);
                else if (hes <= 0) {
                    // Underflow
                    if ((14 - hes) > 24) {
                        // Mantissa shifted all the way off & no rounding possibility
                        hm = 0u;  // Set mantissa to zero
                    }
                    else {
                        xm |= 0x00800000u;        // Add the hidden leading bit
                        hm = (xm >> (14 - hes));  // Mantissa
                        if ((xm >> (13 - hes)) & 0x00000001u)  // Check for rounding
                            // Round, might overflow into exp bit, but this is OK
                            hm += 1u;
                    }
                    ret = (hs | hm);
                }
                else {
                    uniform unsigned int32 he = (hes << 10);  // Exponent
                    hm = (xm >> 13);                          // Mantissa
                    if (xm & 0x00001000u)  // Check for rounding
                        // Round, might overflow to inf, this is OK
                        ret = (hs | he | hm) + 1u;
                    else
                        ret = (hs | he | hm);
                }
            }
        }
    }
    return (int16)ret;
}

static inline int16 float_to_half(float f) {
    int32 x = intbits(f);
    // Store the return value in an int32 until the very end; this ends up
    // generating better code...
    int32 ret;
    if ((x & 0x7FFFFFFFu) == 0)
        // Signed zero
        ret = (x >> 16);
    else {
        unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
        unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
        unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits
        if (xe == 0) {
            // Denormal will underflow, return a signed zero
            ret = (xs >> 16);
        }
        else {
            cif (xe == 0x7F800000u) {
                // Inf or NaN (all the exponent bits are set)
                if (xm == 0)
                    // Zero mantissa -> signed infinity
                    ret = ((xs >> 16) | 0x7C00u);
                else
                    // NaN, only 1st mantissa bit set
                    ret = 0xFE00u;
            }
            else {
                // Normalized number
                unsigned int32 hs = (xs >> 16);  // Sign bit
                unsigned int32 hm;
                // Exponent unbias the single, then bias the halfp
                int32 hes = ((int)(xe >> 23)) - 127 + 15;
                if (hes >= 0x1F)
                    // Overflow: return signed infinity
                    ret = ((xs >> 16) | 0x7C00u);
                else if (hes <= 0) {
                    // Underflow
                    if ((14 - hes) > 24) {
                        // Mantissa shifted all the way off & no rounding possibility
                        hm = 0u;  // Set mantissa to zero
                    }
                    else {
                        xm |= 0x00800000u;        // Add the hidden leading bit
                        hm = (xm >> (14 - hes));  // Mantissa
                        if ((xm >> (13 - hes)) & 0x00000001u)  // Check for rounding
                            // Round, might overflow into exp bit, but this is OK
                            hm += 1u;
                    }
                    ret = (hs | hm);
                }
                else {
                    unsigned int32 he = (hes << 10);  // Exponent
                    hm = (xm >> 13);                  // Mantissa
                    if (xm & 0x00001000u)  // Check for rounding
                        // Round, might overflow to inf, this is OK
                        ret = (hs | he | hm) + 1u;
                    else
                        ret = (hs | he | hm);
                }
            }
        }
    }
    return (int16)ret;
}
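
// Round-trip sketch (illustrative): values exactly representable in half
// survive the conversion unchanged, e.g.
//     uniform float f = 0.5;
//     uniform int16 h = float_to_half(f);                          // h = 0x3800
//     uniform float g = half_to_float((uniform unsigned int16)h);  // g = 0.5
// while other values are rounded to a nearby representable half.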

static inline uniform float half_to_float_fast(uniform unsigned int16 h) {
    uniform unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
    uniform unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
    uniform unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

    // sign
    uniform unsigned int32 xs = ((unsigned int32) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
    uniform int32 xes = ((int32) (he >> 10)) - 15 + 127;
    // Exponent
    uniform unsigned int32 xe = (unsigned int32) (xes << 23);
    // Mantissa
    uniform unsigned int32 xm = ((unsigned int32) hm) << 13;
    return floatbits(xs | xe | xm);
}

static inline float half_to_float_fast(unsigned int16 h) {
    unsigned int32 hs = h & (int32)0x8000u;  // Pick off sign bit
    unsigned int32 he = h & (int32)0x7C00u;  // Pick off exponent bits
    unsigned int32 hm = h & (int32)0x03FFu;  // Pick off mantissa bits

    // sign
    unsigned int32 xs = ((unsigned int32) hs) << 16;
    // Exponent: unbias the halfp, then bias the single
    int32 xes = ((int32) (he >> 10)) - 15 + 127;
    // Exponent
    unsigned int32 xe = (unsigned int32) (xes << 23);
    // Mantissa
    unsigned int32 xm = ((unsigned int32) hm) << 13;
    return floatbits(xs | xe | xm);
}

static inline uniform int16 float_to_half_fast(uniform float f) {
    uniform int32 x = intbits(f);
    uniform unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
    uniform unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
    uniform unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits

    uniform unsigned int32 hs = (xs >> 16);  // Sign bit
    // Exponent unbias the single, then bias the halfp
    uniform int32 hes = ((int)(xe >> 23)) - 127 + 15;
    uniform unsigned int32 he = (hes << 10);  // Exponent
    uniform int32 hm = (xm >> 13);            // Mantissa
    uniform int32 ret = (hs | he | hm);

    if (xm & 0x00001000u)  // Check for rounding
        // Round, might overflow to inf, this is OK
        ret += 1u;

    return (int16)ret;
}

static inline int16 float_to_half_fast(float f) {
    int32 x = intbits(f);
    unsigned int32 xs = x & 0x80000000u;  // Pick off sign bit
    unsigned int32 xe = x & 0x7F800000u;  // Pick off exponent bits
    unsigned int32 xm = x & 0x007FFFFFu;  // Pick off mantissa bits

    unsigned int32 hs = (xs >> 16);  // Sign bit
    // Exponent unbias the single, then bias the halfp
    int32 hes = ((int)(xe >> 23)) - 127 + 15;
    unsigned int32 he = (hes << 10);  // Exponent
    int32 hm = (xm >> 13);            // Mantissa
    int32 ret = (hs | he | hm);

    if (xm & 0x00001000u)  // Check for rounding
        // Round, might overflow to inf, this is OK
        ret += 1u;

    return (int16)ret;
}
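
// The _fast variants above skip the zero, denormal, infinity, NaN, and
// overflow/underflow handling of the full conversions, so they are only
// meant for values known to be normalized and in range for both formats;
// out-of-range inputs will silently produce wrong bit patterns.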

///////////////////////////////////////////////////////////////////////////
// RNG stuff

struct RNGState {
    unsigned int z1, z2, z3, z4;
};

static inline unsigned int random(RNGState * uniform state)
{
    unsigned int b;

    // FIXME: state->z1, etc..
    b = (((*state).z1 << 6) ^ (*state).z1) >> 13;
    (*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
    b = (((*state).z2 << 2) ^ (*state).z2) >> 27;
    (*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
    b = (((*state).z3 << 13) ^ (*state).z3) >> 21;
    (*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
    b = (((*state).z4 << 3) ^ (*state).z4) >> 12;
    (*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
    return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
}

static inline float frandom(RNGState * uniform state)
{
    unsigned int irand = random(state);
    irand &= (1<<23)-1;
    return floatbits(0x3F800000 | irand)-1.0f;
}
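
// How frandom() maps bits to [0, 1) (illustrative): the 23 low bits of the
// integer sample become the mantissa of a float with the exponent field of
// 1.0, i.e. a value in [1.0, 2.0), and subtracting 1.0 shifts that to
// [0.0, 1.0).  For example irand == 0 gives 0.0 and irand == 0x7FFFFF gives
// ~0.99999988.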

static inline uniform unsigned int __seed4(RNGState * uniform state,
                                           uniform int start,
                                           uniform unsigned int seed) {
    uniform unsigned int c1 = 0xf0f0f0f0;
    uniform unsigned int c2 = 0x0f0f0f0f;

    (*state).z1 = insert((*state).z1, start + 0, seed);
    (*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
    (*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
    (*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);

    seed += 131;
    (*state).z2 = insert((*state).z2, start + 0, seed);
    (*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
    (*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
    (*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);

    seed ^= extract((*state).z2, 2);
    (*state).z3 = insert((*state).z3, start + 0, seed);
    (*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
    (*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
    (*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);

    seed <<= 4;
    seed += 3;
    seed ^= extract((*state).z1, 3);
    (*state).z4 = insert((*state).z4, start + 0, seed);
    (*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
    (*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
    (*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);

    return seed;
}

static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) {
    seed = __seed4(state, 0, seed);
    if (programCount == 8)
        __seed4(state, 4, seed ^ 0xbeeff00d);
    if (programCount == 16) {
        __seed4(state, 4, seed ^ 0xbeeff00d);
        __seed4(state, 8, ((seed & 0xffff) << 16) | (seed >> 16));
        __seed4(state, 12, (((seed & 0xff) << 24) | ((seed & 0xff00) << 8) |
                            ((seed & 0xff0000) >> 8) | (seed & 0xff000000) >> 24));
    }
}
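
// Minimal usage sketch (illustrative; follows the signatures above): seed the
// state once so each program instance gets distinct lane values, then draw
// per-lane samples:
//     uniform RNGState rng;
//     seed_rng(&rng, 1234);
//     unsigned int bits = random(&rng);   // per-lane 32-bit values
//     float u = frandom(&rng);            // per-lane values in [0, 1)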

static inline void fastmath() {
    __fastmath();
}