Merge pull request #641 from jbrodman/stdlibshift
Add a "shift" operator to the stdlib.
This commit is contained in:
@@ -536,6 +536,12 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__set_system_isa",
|
"__set_system_isa",
|
||||||
"__sext_uniform_bool",
|
"__sext_uniform_bool",
|
||||||
"__sext_varying_bool",
|
"__sext_varying_bool",
|
||||||
|
"__shift_double",
|
||||||
|
"__shift_float",
|
||||||
|
"__shift_i16",
|
||||||
|
"__shift_i32",
|
||||||
|
"__shift_i64",
|
||||||
|
"__shift_i8",
|
||||||
"__shuffle2_double",
|
"__shuffle2_double",
|
||||||
"__shuffle2_float",
|
"__shuffle2_float",
|
||||||
"__shuffle2_i16",
|
"__shuffle2_i16",
|
||||||
|
|||||||
@@ -80,6 +80,13 @@ declare <WIDTH x i32> @__rotate_i32(<WIDTH x i32>, i32) nounwind readnone
|
|||||||
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
|
declare <WIDTH x double> @__rotate_double(<WIDTH x double>, i32) nounwind readnone
|
||||||
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
|
declare <WIDTH x i64> @__rotate_i64(<WIDTH x i64>, i32) nounwind readnone
|
||||||
|
|
||||||
|
declare <WIDTH x i8> @__shift_i8(<WIDTH x i8>, i32) nounwind readnone
|
||||||
|
declare <WIDTH x i16> @__shift_i16(<WIDTH x i16>, i32) nounwind readnone
|
||||||
|
declare <WIDTH x float> @__shift_float(<WIDTH x float>, i32) nounwind readnone
|
||||||
|
declare <WIDTH x i32> @__shift_i32(<WIDTH x i32>, i32) nounwind readnone
|
||||||
|
declare <WIDTH x double> @__shift_double(<WIDTH x double>, i32) nounwind readnone
|
||||||
|
declare <WIDTH x i64> @__shift_i64(<WIDTH x i64>, i32) nounwind readnone
|
||||||
|
|
||||||
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
|
declare <WIDTH x i8> @__shuffle_i8(<WIDTH x i8>, <WIDTH x i32>) nounwind readnone
|
||||||
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
|
declare <WIDTH x i8> @__shuffle2_i8(<WIDTH x i8>, <WIDTH x i8>,
|
||||||
<WIDTH x i32>) nounwind readnone
|
<WIDTH x i32>) nounwind readnone
|
||||||
|
|||||||
@@ -797,6 +797,24 @@ not_const:
|
|||||||
ret <WIDTH x $1> %result
|
ret <WIDTH x $1> %result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <WIDTH x $1> @__shift_$1(<WIDTH x $1>, i32) nounwind readnone alwaysinline {
|
||||||
|
%ptr = alloca <WIDTH x $1>, i32 3
|
||||||
|
%ptr0 = getelementptr <WIDTH x $1> * %ptr, i32 0
|
||||||
|
store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr0
|
||||||
|
%ptr1 = getelementptr <WIDTH x $1> * %ptr, i32 1
|
||||||
|
store <WIDTH x $1> %0, <WIDTH x $1> * %ptr1
|
||||||
|
%ptr2 = getelementptr <WIDTH x $1> * %ptr, i32 2
|
||||||
|
store <WIDTH x $1> zeroinitializer, <WIDTH x $1> * %ptr2
|
||||||
|
|
||||||
|
%offset = add i32 %1, WIDTH
|
||||||
|
%ptr_as_elt_array = bitcast <WIDTH x $1> * %ptr to [eval(3*WIDTH) x $1] *
|
||||||
|
%load_ptr = getelementptr [eval(3*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset
|
||||||
|
%load_ptr_vec = bitcast $1 * %load_ptr to <WIDTH x $1> *
|
||||||
|
%result = load <WIDTH x $1> * %load_ptr_vec, align $2
|
||||||
|
ret <WIDTH x $1> %result
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
define <WIDTH x $1> @__shuffle_$1(<WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
|
define <WIDTH x $1> @__shuffle_$1(<WIDTH x $1>, <WIDTH x i32>) nounwind readnone alwaysinline {
|
||||||
forloop(i, 0, eval(WIDTH-1), `
|
forloop(i, 0, eval(WIDTH-1), `
|
||||||
%index_`'i = extractelement <WIDTH x i32> %1, i32 i')
|
%index_`'i = extractelement <WIDTH x i32> %1, i32 i')
|
||||||
|
|||||||
@@ -3719,6 +3719,22 @@ the size of the gang (it is masked to ensure valid offsets).
|
|||||||
double rotate(double value, uniform int offset)
|
double rotate(double value, uniform int offset)
|
||||||
|
|
||||||
|
|
||||||
|
The ``shift()`` function allows each program instance to find the value of
|
||||||
|
the given value that their neighbor ``offset`` steps away has. This is similar
|
||||||
|
to ``rotate()`` with the exception that values are not circularly shifted.
|
||||||
|
Instead, zeroes are shifted in where appropriate.
|
||||||
|
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
int8 shift(int8 value, uniform int offset)
|
||||||
|
int16 shift(int16 value, uniform int offset)
|
||||||
|
int32 shift(int32 value, uniform int offset)
|
||||||
|
int64 shift(int64 value, uniform int offset)
|
||||||
|
float shift(float value, uniform int offset)
|
||||||
|
double shift(double value, uniform int offset)
|
||||||
|
|
||||||
|
|
||||||
Finally, the ``shuffle()`` functions allow two variants of fully general
|
Finally, the ``shuffle()`` functions allow two variants of fully general
|
||||||
shuffling of values among the program instances. For the first version,
|
shuffling of values among the program instances. For the first version,
|
||||||
each program instance's value of permutation gives the program instance
|
each program instance's value of permutation gives the program instance
|
||||||
@@ -3751,7 +3767,7 @@ the last element of ``value1``, etc.)
|
|||||||
double shuffle(double value0, double value1, int permutation)
|
double shuffle(double value0, double value1, int permutation)
|
||||||
|
|
||||||
Finally, there are primitive operations that extract and set values in the
|
Finally, there are primitive operations that extract and set values in the
|
||||||
SIMD lanes. You can implement all of the broadcast, rotate, and shuffle
|
SIMD lanes. You can implement all of the broadcast, rotate, shift, and shuffle
|
||||||
operations described above in this section from these routines, though in
|
operations described above in this section from these routines, though in
|
||||||
general, not as efficiently. These routines are useful for implementing
|
general, not as efficiently. These routines are useful for implementing
|
||||||
other reductions and cross-lane communication that isn't included in the
|
other reductions and cross-lane communication that isn't included in the
|
||||||
|
|||||||
@@ -311,6 +311,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
|
|||||||
return ret; \
|
return ret; \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
|
#define SHIFT(VTYPE, NAME, STYPE) \
|
||||||
|
static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \
|
||||||
|
VTYPE ret; \
|
||||||
|
for (int i = 0; i < 16; ++i) { \
|
||||||
|
int modIndex = i+index; \
|
||||||
|
STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v.v[modIndex] : 0; \
|
||||||
|
ret.v[i] = val; \
|
||||||
|
} \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
|
||||||
#define SHUFFLES(VTYPE, NAME, STYPE) \
|
#define SHUFFLES(VTYPE, NAME, STYPE) \
|
||||||
static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
|
static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
|
||||||
VTYPE ret; \
|
VTYPE ret; \
|
||||||
@@ -492,6 +503,7 @@ SETZERO(__vec16_i8, i8)
|
|||||||
UNDEF(__vec16_i8, i8)
|
UNDEF(__vec16_i8, i8)
|
||||||
BROADCAST(__vec16_i8, i8, int8_t)
|
BROADCAST(__vec16_i8, i8, int8_t)
|
||||||
ROTATE(__vec16_i8, i8, int8_t)
|
ROTATE(__vec16_i8, i8, int8_t)
|
||||||
|
SHIFT(__vec16_i8, i8, int8_t)
|
||||||
SHUFFLES(__vec16_i8, i8, int8_t)
|
SHUFFLES(__vec16_i8, i8, int8_t)
|
||||||
LOAD_STORE(__vec16_i8, int8_t)
|
LOAD_STORE(__vec16_i8, int8_t)
|
||||||
|
|
||||||
@@ -537,6 +549,7 @@ SETZERO(__vec16_i16, i16)
|
|||||||
UNDEF(__vec16_i16, i16)
|
UNDEF(__vec16_i16, i16)
|
||||||
BROADCAST(__vec16_i16, i16, int16_t)
|
BROADCAST(__vec16_i16, i16, int16_t)
|
||||||
ROTATE(__vec16_i16, i16, int16_t)
|
ROTATE(__vec16_i16, i16, int16_t)
|
||||||
|
SHIFT(__vec16_i16, i16, int16_t)
|
||||||
SHUFFLES(__vec16_i16, i16, int16_t)
|
SHUFFLES(__vec16_i16, i16, int16_t)
|
||||||
LOAD_STORE(__vec16_i16, int16_t)
|
LOAD_STORE(__vec16_i16, int16_t)
|
||||||
|
|
||||||
@@ -582,6 +595,7 @@ SETZERO(__vec16_i32, i32)
|
|||||||
UNDEF(__vec16_i32, i32)
|
UNDEF(__vec16_i32, i32)
|
||||||
BROADCAST(__vec16_i32, i32, int32_t)
|
BROADCAST(__vec16_i32, i32, int32_t)
|
||||||
ROTATE(__vec16_i32, i32, int32_t)
|
ROTATE(__vec16_i32, i32, int32_t)
|
||||||
|
SHIFT(__vec16_i32, i32, int32_t)
|
||||||
SHUFFLES(__vec16_i32, i32, int32_t)
|
SHUFFLES(__vec16_i32, i32, int32_t)
|
||||||
LOAD_STORE(__vec16_i32, int32_t)
|
LOAD_STORE(__vec16_i32, int32_t)
|
||||||
|
|
||||||
@@ -627,6 +641,7 @@ SETZERO(__vec16_i64, i64)
|
|||||||
UNDEF(__vec16_i64, i64)
|
UNDEF(__vec16_i64, i64)
|
||||||
BROADCAST(__vec16_i64, i64, int64_t)
|
BROADCAST(__vec16_i64, i64, int64_t)
|
||||||
ROTATE(__vec16_i64, i64, int64_t)
|
ROTATE(__vec16_i64, i64, int64_t)
|
||||||
|
SHIFT(__vec16_i64, i64, int64_t)
|
||||||
SHUFFLES(__vec16_i64, i64, int64_t)
|
SHUFFLES(__vec16_i64, i64, int64_t)
|
||||||
LOAD_STORE(__vec16_i64, int64_t)
|
LOAD_STORE(__vec16_i64, int64_t)
|
||||||
|
|
||||||
@@ -672,6 +687,7 @@ SETZERO(__vec16_f, float)
|
|||||||
UNDEF(__vec16_f, float)
|
UNDEF(__vec16_f, float)
|
||||||
BROADCAST(__vec16_f, float, float)
|
BROADCAST(__vec16_f, float, float)
|
||||||
ROTATE(__vec16_f, float, float)
|
ROTATE(__vec16_f, float, float)
|
||||||
|
SHIFT(__vec16_f, float, float)
|
||||||
SHUFFLES(__vec16_f, float, float)
|
SHUFFLES(__vec16_f, float, float)
|
||||||
LOAD_STORE(__vec16_f, float)
|
LOAD_STORE(__vec16_f, float)
|
||||||
|
|
||||||
@@ -832,6 +848,7 @@ SETZERO(__vec16_d, double)
|
|||||||
UNDEF(__vec16_d, double)
|
UNDEF(__vec16_d, double)
|
||||||
BROADCAST(__vec16_d, double, double)
|
BROADCAST(__vec16_d, double, double)
|
||||||
ROTATE(__vec16_d, double, double)
|
ROTATE(__vec16_d, double, double)
|
||||||
|
SHIFT(__vec16_d, double, double)
|
||||||
SHUFFLES(__vec16_d, double, double)
|
SHUFFLES(__vec16_d, double, double)
|
||||||
LOAD_STORE(__vec16_d, double)
|
LOAD_STORE(__vec16_d, double)
|
||||||
|
|
||||||
|
|||||||
@@ -451,6 +451,17 @@ static FORCEINLINE VTYPE __rotate_##NAME(VTYPE v, int index) { \
|
|||||||
return ret; \
|
return ret; \
|
||||||
} \
|
} \
|
||||||
|
|
||||||
|
#define SHIFT(VTYPE, NAME, STYPE) \
|
||||||
|
static FORCEINLINE VTYPE __shift_##NAME(VTYPE v, int index) { \
|
||||||
|
VTYPE ret; \
|
||||||
|
for (int i = 0; i < 16; ++i) { \
|
||||||
|
int modIndex = i+index; \
|
||||||
|
STYPE val = ((modIndex >= 0) && (modIndex < 16)) ? v[modIndex] : 0; \
|
||||||
|
ret[i] = val; \
|
||||||
|
} \
|
||||||
|
return ret; \
|
||||||
|
} \
|
||||||
|
|
||||||
/* knc::macro::used */
|
/* knc::macro::used */
|
||||||
#define SHUFFLES(VTYPE, NAME, STYPE) \
|
#define SHUFFLES(VTYPE, NAME, STYPE) \
|
||||||
static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
|
static FORCEINLINE VTYPE __shuffle_##NAME(VTYPE v, __vec16_i32 index) { \
|
||||||
@@ -566,6 +577,7 @@ SETZERO(__vec16_i8, i8)
|
|||||||
UNDEF(__vec16_i8, i8)
|
UNDEF(__vec16_i8, i8)
|
||||||
BROADCAST(__vec16_i8, i8, int8_t)
|
BROADCAST(__vec16_i8, i8, int8_t)
|
||||||
ROTATE(__vec16_i8, i8, int8_t)
|
ROTATE(__vec16_i8, i8, int8_t)
|
||||||
|
SHIFT(__vec16_i8, i8, int8_t)
|
||||||
SHUFFLES(__vec16_i8, i8, int8_t)
|
SHUFFLES(__vec16_i8, i8, int8_t)
|
||||||
LOAD_STORE(__vec16_i8, int8_t)
|
LOAD_STORE(__vec16_i8, int8_t)
|
||||||
|
|
||||||
@@ -612,6 +624,7 @@ SETZERO(__vec16_i16, i16)
|
|||||||
UNDEF(__vec16_i16, i16)
|
UNDEF(__vec16_i16, i16)
|
||||||
BROADCAST(__vec16_i16, i16, int16_t)
|
BROADCAST(__vec16_i16, i16, int16_t)
|
||||||
ROTATE(__vec16_i16, i16, int16_t)
|
ROTATE(__vec16_i16, i16, int16_t)
|
||||||
|
SHIFT(__vec16_i16, i16, int16_t)
|
||||||
SHUFFLES(__vec16_i16, i16, int16_t)
|
SHUFFLES(__vec16_i16, i16, int16_t)
|
||||||
LOAD_STORE(__vec16_i16, int16_t)
|
LOAD_STORE(__vec16_i16, int16_t)
|
||||||
|
|
||||||
@@ -688,6 +701,8 @@ static FORCEINLINE __vec16_i32 __rotate_i32(__vec16_i32 v, int index)
|
|||||||
return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v);
|
return _mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SHIFT(__vec16_i32, i32, int32_t)
|
||||||
|
|
||||||
static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index)
|
static FORCEINLINE __vec16_i32 __shuffle_i32 (__vec16_i32 v, __vec16_i32 index)
|
||||||
{
|
{
|
||||||
return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v);
|
return _mm512_mask_permutevar_epi32(v, 0xFFFF, __and(index, __smear_i32<__vec16_i32>(0xF)), v);
|
||||||
@@ -942,6 +957,8 @@ static FORCEINLINE __vec16_i64 __rotate_i64(const __vec16_i64 _v, const int inde
|
|||||||
const __vec16_i32 ret_lo = __rotate_i32(v_lo, index);
|
const __vec16_i32 ret_lo = __rotate_i32(v_lo, index);
|
||||||
return CASTI2L(ret_hi, ret_lo);
|
return CASTI2L(ret_hi, ret_lo);
|
||||||
}
|
}
|
||||||
|
SHIFT(__vec16_i64, i64, int64_t)
|
||||||
|
|
||||||
static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index)
|
static FORCEINLINE __vec16_i64 __shuffle_double(__vec16_i64 _v, const __vec16_i32 index)
|
||||||
{
|
{
|
||||||
CASTL2I(_v, v_hi, v_lo);
|
CASTL2I(_v, v_hi, v_lo);
|
||||||
@@ -1063,6 +1080,7 @@ static FORCEINLINE __vec16_f __rotate_float(__vec16_f _v, int index)
|
|||||||
const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF));
|
const __vec16_i32 shuffle = _mm512_and_epi32(_mm512_add_epi32(__ispc_stride1, idx), __smear_i32<__vec16_i32>(0xF));
|
||||||
return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v));
|
return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(v, 0xFFFF, shuffle, v));
|
||||||
}
|
}
|
||||||
|
SHIFT(__vec16_f, float, float)
|
||||||
static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index)
|
static FORCEINLINE __vec16_f __shuffle_float(__vec16_f v, __vec16_i32 index)
|
||||||
{
|
{
|
||||||
return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v)));
|
return _mm512_castsi512_ps(_mm512_mask_permutevar_epi32(_mm512_castps_si512(v), 0xffff, index, _mm512_castps_si512(v)));
|
||||||
@@ -1333,6 +1351,7 @@ static FORCEINLINE __vec16_d __rotate_double(const __vec16_d _v, const int index
|
|||||||
const __vec16_f ret_lo = __rotate_float(v_lo, index);
|
const __vec16_f ret_lo = __rotate_float(v_lo, index);
|
||||||
return CASTF2D(ret_hi, ret_lo);
|
return CASTF2D(ret_hi, ret_lo);
|
||||||
}
|
}
|
||||||
|
SHIFT(__vec16_d, double, double)
|
||||||
static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index)
|
static FORCEINLINE __vec16_d __shuffle_double(__vec16_d _v, const __vec16_i32 index)
|
||||||
{
|
{
|
||||||
CASTD2F(_v, v_hi, v_lo);
|
CASTD2F(_v, v_hi, v_lo);
|
||||||
|
|||||||
@@ -108,22 +108,21 @@ struct __vec4_i64 {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct __vec4_i32 {
|
struct __vec4_i32 {
|
||||||
__vec4_i32() { }
|
FORCEINLINE __vec4_i32() { }
|
||||||
FORCEINLINE __vec4_i32(__m128i vv) : v(vv) { }
|
FORCEINLINE __vec4_i32(__m128i vv) : v(vv) { }
|
||||||
FORCEINLINE __vec4_i32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
|
FORCEINLINE __vec4_i32(int32_t a, int32_t b, int32_t c, int32_t d) {
|
||||||
v = _mm_set_epi32(d, c, b, a);
|
v = _mm_set_epi32(d, c, b, a);
|
||||||
}
|
}
|
||||||
FORCEINLINE __vec4_i32(uint32_t *p) {
|
FORCEINLINE __vec4_i32(int32_t *p) {
|
||||||
v = _mm_loadu_si128((__m128i *)p);
|
v = _mm_loadu_si128((__m128i *)p);
|
||||||
}
|
}
|
||||||
|
FORCEINLINE __vec4_i32(const __vec4_i32 &other) : v(other.v) {}
|
||||||
|
FORCEINLINE __vec4_i32& operator =(const __vec4_i32 &o) { v=o.v; return *this; }
|
||||||
FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); }
|
FORCEINLINE operator __m128() const { return _mm_castsi128_ps(v); }
|
||||||
|
|
||||||
__m128i v;
|
__m128i v;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline int32_t __extract_element(__vec4_i32 v, int index);
|
|
||||||
|
|
||||||
struct __vec4_i16 {
|
struct __vec4_i16 {
|
||||||
__vec4_i16() { }
|
__vec4_i16() { }
|
||||||
FORCEINLINE __vec4_i16(__m128i vv) : v(vv) { }
|
FORCEINLINE __vec4_i16(__m128i vv) : v(vv) { }
|
||||||
@@ -215,6 +214,64 @@ INSERT_EXTRACT(__vec1_i64, int64_t)
|
|||||||
INSERT_EXTRACT(__vec1_f, float)
|
INSERT_EXTRACT(__vec1_f, float)
|
||||||
INSERT_EXTRACT(__vec1_d, double)
|
INSERT_EXTRACT(__vec1_d, double)
|
||||||
|
|
||||||
|
static FORCEINLINE bool __extract_element(const __vec4_i1 &v, int index) {
|
||||||
|
return ((int32_t *)&v)[index] ? true : false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
|
||||||
|
((int32_t *)v)[index] = val ? -1 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE int8_t __extract_element(const __vec4_i8 &v, int index) {
|
||||||
|
return ((int8_t *)&v)[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
|
||||||
|
((int8_t *)v)[index] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE int16_t __extract_element(const __vec4_i16 &v, int index) {
|
||||||
|
return ((int16_t *)&v)[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) {
|
||||||
|
((int16_t *)v)[index] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE int32_t __extract_element(const __vec4_i32 &v, int index) {
|
||||||
|
return ((int32_t *)&v)[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) {
|
||||||
|
((int32_t *)v)[index] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE int64_t __extract_element(const __vec4_i64 &v, int index) {
|
||||||
|
return ((int64_t *)&v)[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) {
|
||||||
|
((int64_t *)v)[index] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE float __extract_element(const __vec4_f &v, int index) {
|
||||||
|
return ((float *)&v)[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) {
|
||||||
|
((float *)v)[index] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE double __extract_element(const __vec4_d &v, int index) {
|
||||||
|
return ((double *)&v)[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) {
|
||||||
|
((double *)v)[index] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define CAST_BITS_SCALAR(TO, FROM) \
|
#define CAST_BITS_SCALAR(TO, FROM) \
|
||||||
static FORCEINLINE TO __cast_bits(TO, FROM v) { \
|
static FORCEINLINE TO __cast_bits(TO, FROM v) { \
|
||||||
union { \
|
union { \
|
||||||
@@ -313,13 +370,6 @@ static FORCEINLINE __vec4_i1 __select(__vec4_i1 mask, __vec4_i1 a, __vec4_i1 b)
|
|||||||
return _mm_blendv_ps(b.v, a.v, mask.v);
|
return _mm_blendv_ps(b.v, a.v, mask.v);
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE bool __extract_element(__vec4_i1 v, int index) {
|
|
||||||
return ((int32_t *)&v)[index] ? true : false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_i1 *v, int index, bool val) {
|
|
||||||
((int32_t *)v)[index] = val ? -1 : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <int ALIGN> static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) {
|
template <int ALIGN> static FORCEINLINE __vec4_i1 __load(const __vec4_i1 *v) {
|
||||||
// FIXME: handle align of 16...
|
// FIXME: handle align of 16...
|
||||||
@@ -564,13 +614,6 @@ static FORCEINLINE __vec4_i8 __select(__vec4_i1 mask, __vec4_i8 a, __vec4_i8 b)
|
|||||||
_mm_extract_epi8(b.v, 3));
|
_mm_extract_epi8(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE int8_t __extract_element(__vec4_i8 v, int index) {
|
|
||||||
return ((int8_t *)&v)[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_i8 *v, int index, int8_t val) {
|
|
||||||
((int8_t *)v)[index] = val;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class RetVecType> __vec4_i8 __smear_i8(int8_t v);
|
template <class RetVecType> __vec4_i8 __smear_i8(int8_t v);
|
||||||
template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) {
|
template <> FORCEINLINE __vec4_i8 __smear_i8<__vec4_i8>(int8_t v) {
|
||||||
@@ -598,6 +641,20 @@ static FORCEINLINE __vec4_i8 __rotate_i8(__vec4_i8 v, int delta) {
|
|||||||
__extract_element(v, (delta+3) & 0x3));
|
__extract_element(v, (delta+3) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE __vec4_i8 __shift_i8(__vec4_i8 v, int delta) {
|
||||||
|
int8_t v1, v2, v3, v4;
|
||||||
|
int d1, d2, d3, d4;
|
||||||
|
d1 = delta+0;
|
||||||
|
d2 = delta+1;
|
||||||
|
d3 = delta+2;
|
||||||
|
d4 = delta+3;
|
||||||
|
v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
|
||||||
|
v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
|
||||||
|
v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
|
||||||
|
v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
|
||||||
|
return __vec4_i8(v1, v2, v3, v4);
|
||||||
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) {
|
static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) {
|
||||||
return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3),
|
return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3),
|
||||||
__extract_element(v, __extract_element(index, 1) & 0x3),
|
__extract_element(v, __extract_element(index, 1) & 0x3),
|
||||||
@@ -836,13 +893,6 @@ static FORCEINLINE __vec4_i16 __select(__vec4_i1 mask, __vec4_i16 a, __vec4_i16
|
|||||||
_mm_extract_epi16(b.v, 3));
|
_mm_extract_epi16(b.v, 3));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE int16_t __extract_element(__vec4_i16 v, int index) {
|
|
||||||
return ((int16_t *)&v)[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_i16 *v, int index, int16_t val) {
|
|
||||||
((int16_t *)v)[index] = val;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class RetVecType> __vec4_i16 __smear_i16(int16_t v);
|
template <class RetVecType> __vec4_i16 __smear_i16(int16_t v);
|
||||||
template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) {
|
template <> FORCEINLINE __vec4_i16 __smear_i16<__vec4_i16>(int16_t v) {
|
||||||
@@ -870,6 +920,20 @@ static FORCEINLINE __vec4_i16 __rotate_i16(__vec4_i16 v, int delta) {
|
|||||||
__extract_element(v, (delta+3) & 0x3));
|
__extract_element(v, (delta+3) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE __vec4_i16 __shift_i16(__vec4_i16 v, int delta) {
|
||||||
|
int16_t v1, v2, v3, v4;
|
||||||
|
int d1, d2, d3, d4;
|
||||||
|
d1 = delta+0;
|
||||||
|
d2 = delta+1;
|
||||||
|
d3 = delta+2;
|
||||||
|
d4 = delta+3;
|
||||||
|
v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
|
||||||
|
v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
|
||||||
|
v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
|
||||||
|
v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
|
||||||
|
return __vec4_i16(v1, v2, v3, v4);
|
||||||
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) {
|
static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) {
|
||||||
return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3),
|
return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3),
|
||||||
__extract_element(v, __extract_element(index, 1) & 0x3),
|
__extract_element(v, __extract_element(index, 1) & 0x3),
|
||||||
@@ -1109,13 +1173,6 @@ template <> FORCEINLINE __vec4_i32 __undef_i32<__vec4_i32>() {
|
|||||||
return __vec4_i32();
|
return __vec4_i32();
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE int32_t __extract_element(__vec4_i32 v, int index) {
|
|
||||||
return ((int32_t *)&v)[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_i32 *v, int index, int32_t val) {
|
|
||||||
((int32_t *)v)[index] = val;
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i32 __broadcast_i32(__vec4_i32 v, int index) {
|
static FORCEINLINE __vec4_i32 __broadcast_i32(__vec4_i32 v, int index) {
|
||||||
return _mm_set1_epi32(__extract_element(v, index));
|
return _mm_set1_epi32(__extract_element(v, index));
|
||||||
@@ -1128,6 +1185,21 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) {
|
|||||||
__extract_element(v, (delta+3) & 0x3));
|
__extract_element(v, (delta+3) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
static FORCEINLINE __vec4_i32 __shift_i32(const __vec4_i32 &v, int delta) {
|
||||||
|
int32_t v1, v2, v3, v4;
|
||||||
|
int32_t d1, d2, d3, d4;
|
||||||
|
d1 = delta+0;
|
||||||
|
d2 = delta+1;
|
||||||
|
d3 = delta+2;
|
||||||
|
d4 = delta+3;
|
||||||
|
v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
|
||||||
|
v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
|
||||||
|
v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
|
||||||
|
v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
|
||||||
|
return __vec4_i32(v1, v2, v3, v4);
|
||||||
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) {
|
static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) {
|
||||||
return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3),
|
return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3),
|
||||||
__extract_element(v, __extract_element(index, 1) & 0x3),
|
__extract_element(v, __extract_element(index, 1) & 0x3),
|
||||||
@@ -1383,13 +1455,6 @@ template <> FORCEINLINE __vec4_i64 __undef_i64<__vec4_i64>() {
|
|||||||
return __vec4_i64();
|
return __vec4_i64();
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE int64_t __extract_element(__vec4_i64 v, int index) {
|
|
||||||
return ((int64_t *)&v)[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_i64 *v, int index, int64_t val) {
|
|
||||||
((int64_t *)v)[index] = val;
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i64 __broadcast_i64(__vec4_i64 v, int index) {
|
static FORCEINLINE __vec4_i64 __broadcast_i64(__vec4_i64 v, int index) {
|
||||||
uint64_t val = __extract_element(v, index);
|
uint64_t val = __extract_element(v, index);
|
||||||
@@ -1403,6 +1468,20 @@ static FORCEINLINE __vec4_i64 __rotate_i64(__vec4_i64 v, int delta) {
|
|||||||
__extract_element(v, (delta+3) & 0x3));
|
__extract_element(v, (delta+3) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE __vec4_i64 __shift_i64(__vec4_i64 v, int delta) {
|
||||||
|
int64_t v1, v2, v3, v4;
|
||||||
|
int d1, d2, d3, d4;
|
||||||
|
d1 = delta+0;
|
||||||
|
d2 = delta+1;
|
||||||
|
d3 = delta+2;
|
||||||
|
d4 = delta+3;
|
||||||
|
v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
|
||||||
|
v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
|
||||||
|
v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
|
||||||
|
v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
|
||||||
|
return __vec4_i64(v1, v2, v3, v4);
|
||||||
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) {
|
static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) {
|
||||||
return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3),
|
return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3),
|
||||||
__extract_element(v, __extract_element(index, 1) & 0x3),
|
__extract_element(v, __extract_element(index, 1) & 0x3),
|
||||||
@@ -1504,13 +1583,6 @@ template <> FORCEINLINE __vec4_f __undef_float<__vec4_f>() {
|
|||||||
return __vec4_f();
|
return __vec4_f();
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE float __extract_element(__vec4_f v, int index) {
|
|
||||||
return ((float *)&v)[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_f *v, int index, float val) {
|
|
||||||
((float *)v)[index] = val;
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE __vec4_f __broadcast_float(__vec4_f v, int index) {
|
static FORCEINLINE __vec4_f __broadcast_float(__vec4_f v, int index) {
|
||||||
return _mm_set1_ps(__extract_element(v, index));
|
return _mm_set1_ps(__extract_element(v, index));
|
||||||
@@ -1523,6 +1595,20 @@ static FORCEINLINE __vec4_f __rotate_float(__vec4_f v, int delta) {
|
|||||||
__extract_element(v, (delta+3) & 0x3));
|
__extract_element(v, (delta+3) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE __vec4_f __shift_float(__vec4_f v, int delta) {
|
||||||
|
float v1, v2, v3, v4;
|
||||||
|
int d1, d2, d3, d4;
|
||||||
|
d1 = delta+0;
|
||||||
|
d2 = delta+1;
|
||||||
|
d3 = delta+2;
|
||||||
|
d4 = delta+3;
|
||||||
|
v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0.f;
|
||||||
|
v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0.f;
|
||||||
|
v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0.f;
|
||||||
|
v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0.f;
|
||||||
|
return __vec4_f(v1, v2, v3, v4);
|
||||||
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) {
|
static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) {
|
||||||
return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3),
|
return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3),
|
||||||
__extract_element(v, __extract_element(index, 1) & 0x3),
|
__extract_element(v, __extract_element(index, 1) & 0x3),
|
||||||
@@ -1656,13 +1742,6 @@ template <> FORCEINLINE __vec4_d __undef_double<__vec4_d>() {
|
|||||||
return __vec4_d();
|
return __vec4_d();
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE double __extract_element(__vec4_d v, int index) {
|
|
||||||
return ((double *)&v)[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE void __insert_element(__vec4_d *v, int index, double val) {
|
|
||||||
((double *)v)[index] = val;
|
|
||||||
}
|
|
||||||
|
|
||||||
static FORCEINLINE __vec4_d __broadcast_double(__vec4_d v, int index) {
|
static FORCEINLINE __vec4_d __broadcast_double(__vec4_d v, int index) {
|
||||||
return __vec4_d(_mm_set1_pd(__extract_element(v, index)),
|
return __vec4_d(_mm_set1_pd(__extract_element(v, index)),
|
||||||
@@ -1676,6 +1755,20 @@ static FORCEINLINE __vec4_d __rotate_double(__vec4_d v, int delta) {
|
|||||||
__extract_element(v, (delta+3) & 0x3));
|
__extract_element(v, (delta+3) & 0x3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static FORCEINLINE __vec4_d __shift_double(__vec4_d v, int delta) {
|
||||||
|
double v1, v2, v3, v4;
|
||||||
|
int d1, d2, d3, d4;
|
||||||
|
d1 = delta+0;
|
||||||
|
d2 = delta+1;
|
||||||
|
d3 = delta+2;
|
||||||
|
d4 = delta+3;
|
||||||
|
v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
|
||||||
|
v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
|
||||||
|
v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
|
||||||
|
v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
|
||||||
|
return __vec4_d(v1, v2, v3, v4);
|
||||||
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) {
|
static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) {
|
||||||
return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3),
|
return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3),
|
||||||
__extract_element(v, __extract_element(index, 1) & 0x3),
|
__extract_element(v, __extract_element(index, 1) & 0x3),
|
||||||
@@ -1889,7 +1982,7 @@ static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i16 val) {
|
|||||||
(float)((int16_t)_mm_extract_epi16(val.v, 3)));
|
(float)((int16_t)_mm_extract_epi16(val.v, 3)));
|
||||||
}
|
}
|
||||||
|
|
||||||
static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, __vec4_i32 val) {
|
static FORCEINLINE __vec4_f __cast_sitofp(__vec4_f, const __vec4_i32 &val) {
|
||||||
return _mm_cvtepi32_ps(val.v);
|
return _mm_cvtepi32_ps(val.v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
95
opt.cpp
95
opt.cpp
@@ -72,6 +72,7 @@
|
|||||||
#include <llvm/Analysis/ConstantFolding.h>
|
#include <llvm/Analysis/ConstantFolding.h>
|
||||||
#include <llvm/Target/TargetLibraryInfo.h>
|
#include <llvm/Target/TargetLibraryInfo.h>
|
||||||
#include <llvm/ADT/Triple.h>
|
#include <llvm/ADT/Triple.h>
|
||||||
|
#include <llvm/ADT/SmallSet.h>
|
||||||
#include <llvm/Transforms/Scalar.h>
|
#include <llvm/Transforms/Scalar.h>
|
||||||
#include <llvm/Transforms/IPO.h>
|
#include <llvm/Transforms/IPO.h>
|
||||||
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
||||||
@@ -124,6 +125,8 @@ static llvm::Pass *CreateMakeInternalFuncsStaticPass();
|
|||||||
|
|
||||||
static llvm::Pass *CreateDebugPass(char * output);
|
static llvm::Pass *CreateDebugPass(char * output);
|
||||||
|
|
||||||
|
static llvm::Pass *CreateReplaceStdlibShiftPass();
|
||||||
|
|
||||||
#define DEBUG_START_PASS(NAME) \
|
#define DEBUG_START_PASS(NAME) \
|
||||||
if (g->debugPrint && \
|
if (g->debugPrint && \
|
||||||
(getenv("FUNC") == NULL || \
|
(getenv("FUNC") == NULL || \
|
||||||
@@ -521,6 +524,7 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||||
optPM.add(llvm::createAggressiveDCEPass());
|
optPM.add(llvm::createAggressiveDCEPass());
|
||||||
|
|
||||||
|
|
||||||
if (g->opt.disableGatherScatterOptimizations == false &&
|
if (g->opt.disableGatherScatterOptimizations == false &&
|
||||||
g->target->getVectorWidth() > 1) {
|
g->target->getVectorWidth() > 1) {
|
||||||
optPM.add(llvm::createInstructionCombiningPass(), 210);
|
optPM.add(llvm::createInstructionCombiningPass(), 210);
|
||||||
@@ -546,7 +550,8 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
optPM.add(llvm::createGlobalOptimizerPass());
|
optPM.add(llvm::createGlobalOptimizerPass());
|
||||||
optPM.add(llvm::createReassociatePass());
|
optPM.add(llvm::createReassociatePass());
|
||||||
optPM.add(llvm::createIPConstantPropagationPass());
|
optPM.add(llvm::createIPConstantPropagationPass());
|
||||||
optPM.add(llvm::createDeadArgEliminationPass());
|
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
||||||
|
optPM.add(llvm::createDeadArgEliminationPass(),230);
|
||||||
optPM.add(llvm::createInstructionCombiningPass());
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
optPM.add(llvm::createCFGSimplificationPass());
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
optPM.add(llvm::createPruneEHPass());
|
optPM.add(llvm::createPruneEHPass());
|
||||||
@@ -4879,6 +4884,7 @@ lMatchAvgDownInt16(llvm::Value *inst) {
|
|||||||
}
|
}
|
||||||
#endif // !LLVM_3_1 && !LLVM_3_2
|
#endif // !LLVM_3_1 && !LLVM_3_2
|
||||||
|
|
||||||
|
|
||||||
bool
|
bool
|
||||||
PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||||
DEBUG_START_PASS("PeepholePass");
|
DEBUG_START_PASS("PeepholePass");
|
||||||
@@ -4923,3 +4929,90 @@ static llvm::Pass *
|
|||||||
CreatePeepholePass() {
|
CreatePeepholePass() {
|
||||||
return new PeepholePass;
|
return new PeepholePass;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
/** Given an llvm::Value known to be an integer, return its value as
|
||||||
|
an int64_t.
|
||||||
|
*/
|
||||||
|
static int64_t
|
||||||
|
lGetIntValue(llvm::Value *offset) {
|
||||||
|
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
|
||||||
|
Assert(intOffset && (intOffset->getBitWidth() == 32 ||
|
||||||
|
intOffset->getBitWidth() == 64));
|
||||||
|
return intOffset->getSExtValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// ReplaceStdlibShiftPass
|
||||||
|
|
||||||
|
class ReplaceStdlibShiftPass : public llvm::BasicBlockPass {
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
ReplaceStdlibShiftPass() : BasicBlockPass(ID) {
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *getPassName() const { return "Resolve \"replace extract insert chains\""; }
|
||||||
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
char ReplaceStdlibShiftPass::ID = 0;
|
||||||
|
|
||||||
|
bool
|
||||||
|
ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
||||||
|
DEBUG_START_PASS("ReplaceStdlibShiftPass");
|
||||||
|
bool modifiedAny = false;
|
||||||
|
|
||||||
|
llvm::Function *shifts[6];
|
||||||
|
shifts[0] = m->module->getFunction("__shift_i8");
|
||||||
|
shifts[1] = m->module->getFunction("__shift_i16");
|
||||||
|
shifts[2] = m->module->getFunction("__shift_i32");
|
||||||
|
shifts[3] = m->module->getFunction("__shift_i64");
|
||||||
|
shifts[4] = m->module->getFunction("__shift_float");
|
||||||
|
shifts[5] = m->module->getFunction("__shift_double");
|
||||||
|
|
||||||
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
||||||
|
llvm::Instruction *inst = &*iter;
|
||||||
|
|
||||||
|
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
||||||
|
llvm::Function *func = ci->getCalledFunction();
|
||||||
|
for (int i = 0; i < 6; i++) {
|
||||||
|
if (shifts[i] == func) {
|
||||||
|
// we matched a call
|
||||||
|
llvm::Value *shiftedVec = ci->getArgOperand(0);
|
||||||
|
llvm::Value *shiftAmt = ci->getArgOperand(1);
|
||||||
|
if (llvm::isa<llvm::Constant>(shiftAmt)) {
|
||||||
|
int vectorWidth = g->target->getVectorWidth();
|
||||||
|
int shuffleVals[vectorWidth];
|
||||||
|
int shiftInt = lGetIntValue(shiftAmt);
|
||||||
|
for (int i = 0; i < vectorWidth; i++) {
|
||||||
|
int s = i + shiftInt;
|
||||||
|
s = (s < 0) ? vectorWidth : s;
|
||||||
|
s = (s >= vectorWidth) ? vectorWidth : s;
|
||||||
|
shuffleVals[i] = s;
|
||||||
|
}
|
||||||
|
llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
|
||||||
|
llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
|
||||||
|
llvm::Value *shuffle = new llvm::ShuffleVectorInst(shiftedVec, zeroVec,
|
||||||
|
shuffleIdxs, "vecShift", ci);
|
||||||
|
ci->replaceAllUsesWith(shuffle);
|
||||||
|
modifiedAny = true;
|
||||||
|
} else {
|
||||||
|
PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DEBUG_END_PASS("ReplaceStdlibShiftPass");
|
||||||
|
|
||||||
|
return modifiedAny;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static llvm::Pass *
|
||||||
|
CreateReplaceStdlibShiftPass() {
|
||||||
|
return new ReplaceStdlibShiftPass();
|
||||||
|
}
|
||||||
|
|||||||
54
stdlib.ispc
54
stdlib.ispc
@@ -170,6 +170,60 @@ static inline int64 rotate(int64 v, uniform int i) {
|
|||||||
return __rotate_i64(v, i);
|
return __rotate_i64(v, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__declspec(safe)
|
||||||
|
static inline float shift(float v, uniform int i) {
|
||||||
|
varying float result;
|
||||||
|
unmasked {
|
||||||
|
result = __shift_float(v, i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
__declspec(safe)
|
||||||
|
static inline int8 shift(int8 v, uniform int i) {
|
||||||
|
varying int8 result;
|
||||||
|
unmasked {
|
||||||
|
result = __shift_i8(v, i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
__declspec(safe)
|
||||||
|
static inline int16 shift(int16 v, uniform int i) {
|
||||||
|
varying int16 result;
|
||||||
|
unmasked {
|
||||||
|
result = __shift_i16(v, i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
__declspec(safe)
|
||||||
|
static inline int32 shift(int32 v, uniform int i) {
|
||||||
|
varying int32 result;
|
||||||
|
unmasked {
|
||||||
|
result = __shift_i32(v, i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
__declspec(safe)
|
||||||
|
static inline double shift(double v, uniform int i) {
|
||||||
|
varying double result;
|
||||||
|
unmasked {
|
||||||
|
result = __shift_double(v, i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
__declspec(safe)
|
||||||
|
static inline int64 shift(int64 v, uniform int i) {
|
||||||
|
varying int64 result;
|
||||||
|
unmasked {
|
||||||
|
result = __shift_i64(v, i);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
__declspec(safe)
|
__declspec(safe)
|
||||||
static inline float shuffle(float v, int i) {
|
static inline float shuffle(float v, int i) {
|
||||||
return __shuffle_float(v, i);
|
return __shuffle_float(v, i);
|
||||||
|
|||||||
14
tests/shift-1.ispc
Normal file
14
tests/shift-1.ispc
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
|
int a = aFOO[programIndex];
|
||||||
|
int rot = shift(a, -1);
|
||||||
|
RET[programIndex] = rot;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
varying int val = programIndex;
|
||||||
|
if (val < 0) val = 0;
|
||||||
|
RET[programIndex] = val;
|
||||||
|
}
|
||||||
15
tests/shift-2.ispc
Normal file
15
tests/shift-2.ispc
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
|
int a = aFOO[programIndex];
|
||||||
|
uniform int delta = b - 6; // -1
|
||||||
|
int rot = shift(a, delta);
|
||||||
|
RET[programIndex] = rot;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
varying int val = programIndex;
|
||||||
|
if (val < 0) val = 0;
|
||||||
|
RET[programIndex] = val;
|
||||||
|
}
|
||||||
14
tests/shift-3.ispc
Normal file
14
tests/shift-3.ispc
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
|
int a = aFOO[programIndex];
|
||||||
|
int rot = shift(a, 1);
|
||||||
|
RET[programIndex] = rot;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
varying int val = 2 + programIndex;
|
||||||
|
if (val > programCount) val = 0;
|
||||||
|
RET[programIndex] = val;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user