Add docs/generic impls

james.brodman
2013-10-23 15:51:59 -04:00
parent 4d289b16c2
commit c4ad8f6ed4
2 changed files with 101 additions and 1 deletion

View File

@@ -3719,6 +3719,22 @@ the size of the gang (it is masked to ensure valid offsets).
    double rotate(double value, uniform int offset)
The ``shift()`` functions let each program instance find the value of
``value`` held by the program instance ``offset`` steps away. This is
similar to ``rotate()``, except that values are not circularly shifted;
instead, zeroes are shifted in for program instances whose source would
fall outside the gang.

::

    int8 shift(int8 value, uniform int offset)
    int16 shift(int16 value, uniform int offset)
    int32 shift(int32 value, uniform int offset)
    int64 shift(int64 value, uniform int offset)
    float shift(float value, uniform int offset)
    double shift(double value, uniform int offset)
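
For example, with a 4-wide gang and ``value = {10, 20, 30, 40}``,
``shift(value, 1)`` returns ``{20, 30, 40, 0}``: each program instance
reads the value one step up, and zero is shifted in at the end. The
following standalone C++ sketch models these semantics; the ``shift4``
helper and the fixed gang size of four are illustrative assumptions, not
part of ispc itself.

::

    #include <array>
    #include <cstdio>

    // Model ispc's shift() over a 4-wide gang: lane i reads value[i + offset],
    // and lanes whose source index falls outside the gang read zero.
    static std::array<int, 4> shift4(const std::array<int, 4> &value, int offset) {
        std::array<int, 4> result{};
        for (int lane = 0; lane < 4; ++lane) {
            int src = lane + offset;
            result[lane] = (src >= 0 && src < 4) ? value[src] : 0;
        }
        return result;
    }

    int main() {
        std::array<int, 4> v = {10, 20, 30, 40};
        std::array<int, 4> r = shift4(v, 1);   // {20, 30, 40, 0}
        std::array<int, 4> l = shift4(v, -1);  // {0, 10, 20, 30}
        printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);
        printf("%d %d %d %d\n", l[0], l[1], l[2], l[3]);
        return 0;
    }
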
Finally, the ``shuffle()`` functions allow two variants of fully general
shuffling of values among the program instances. For the first version,
each program instance's value of permutation gives the program instance
@@ -3751,7 +3767,7 @@ the last element of ``value1``, etc.)
    double shuffle(double value0, double value1, int permutation)
Finally, there are primitive operations that extract and set values in the
SIMD lanes. You can implement all of the broadcast, rotate, shift, and shuffle
operations described above in this section from these routines, though in
general, not as efficiently. These routines are useful for implementing
other reductions and cross-lane communication that isn't included in the
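
As an illustration of this point, the sketch below builds the two-operand
``shuffle()`` from a scalar extract primitive alone, mirroring how the
generic 4-wide implementations in the next file build their operations on
``__extract_element()``. The ``extract4`` and ``shuffle2x4`` names and the
fixed gang size of four are assumptions for illustration; ``permutation``
values 0 through 3 select lanes of ``value0`` and 4 through 7 select lanes
of ``value1``.

::

    #include <array>
    #include <cstdio>

    // Scalar extract primitive: read lane i of a 4-wide vector.
    static int extract4(const std::array<int, 4> &v, int i) { return v[i & 0x3]; }

    // Two-operand shuffle built only from extract4(): permutation values
    // 0..3 select lanes of value0, 4..7 select lanes of value1.
    static std::array<int, 4> shuffle2x4(const std::array<int, 4> &value0,
                                         const std::array<int, 4> &value1,
                                         const std::array<int, 4> &perm) {
        std::array<int, 4> result{};
        for (int lane = 0; lane < 4; ++lane) {
            int p = perm[lane] & 0x7;  // valid index space is 0..7
            result[lane] = (p < 4) ? extract4(value0, p) : extract4(value1, p - 4);
        }
        return result;
    }

    int main() {
        std::array<int, 4> a = {0, 1, 2, 3}, b = {4, 5, 6, 7};
        std::array<int, 4> perm = {7, 2, 5, 0};
        std::array<int, 4> r = shuffle2x4(a, b, perm);  // {7, 2, 5, 0}
        printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);
        return 0;
    }
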

View File

@@ -598,6 +598,20 @@ static FORCEINLINE __vec4_i8 __rotate_i8(__vec4_i8 v, int delta) {
                     __extract_element(v, (delta+3) & 0x3));
}
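// Shift, as opposed to rotate: lanes whose source index (delta + i) falls
// outside [0, 4) are filled with zero instead of wrapping around. The wider
// integer and floating-point variants below follow the same pattern.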
static FORCEINLINE __vec4_i8 __shift_i8(__vec4_i8 v, int delta) {
    int8_t v1, v2, v3, v4;
    int d1, d2, d3, d4;
    d1 = delta+0;
    d2 = delta+1;
    d3 = delta+2;
    d4 = delta+3;
    v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
    v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
    v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
    v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
    return __vec4_i8(v1, v2, v3, v4);
}
static FORCEINLINE __vec4_i8 __shuffle_i8(__vec4_i8 v, __vec4_i32 index) {
    return __vec4_i8(__extract_element(v, __extract_element(index, 0) & 0x3),
                     __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -870,6 +884,20 @@ static FORCEINLINE __vec4_i16 __rotate_i16(__vec4_i16 v, int delta) {
                      __extract_element(v, (delta+3) & 0x3));
}
static FORCEINLINE __vec4_i16 __shift_i16(__vec4_i16 v, int delta) {
    int16_t v1, v2, v3, v4;
    int d1, d2, d3, d4;
    d1 = delta+0;
    d2 = delta+1;
    d3 = delta+2;
    d4 = delta+3;
    v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
    v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
    v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
    v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
    return __vec4_i16(v1, v2, v3, v4);
}
static FORCEINLINE __vec4_i16 __shuffle_i16(__vec4_i16 v, __vec4_i32 index) {
    return __vec4_i16(__extract_element(v, __extract_element(index, 0) & 0x3),
                      __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1128,6 +1156,20 @@ static FORCEINLINE __vec4_i32 __rotate_i32(__vec4_i32 v, int delta) {
                      __extract_element(v, (delta+3) & 0x3));
}
static FORCEINLINE __vec4_i32 __shift_i32(__vec4_i32 v, int delta) {
    int32_t v1, v2, v3, v4;
    int d1, d2, d3, d4;
    d1 = delta+0;
    d2 = delta+1;
    d3 = delta+2;
    d4 = delta+3;
    v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
    v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
    v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
    v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
    return __vec4_i32(v1, v2, v3, v4);
}
static FORCEINLINE __vec4_i32 __shuffle_i32(__vec4_i32 v, __vec4_i32 index) {
    return __vec4_i32(__extract_element(v, __extract_element(index, 0) & 0x3),
                      __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1403,6 +1445,20 @@ static FORCEINLINE __vec4_i64 __rotate_i64(__vec4_i64 v, int delta) {
                      __extract_element(v, (delta+3) & 0x3));
}
static FORCEINLINE __vec4_i64 __shift_i64(__vec4_i64 v, int delta) {
    int64_t v1, v2, v3, v4;
    int d1, d2, d3, d4;
    d1 = delta+0;
    d2 = delta+1;
    d3 = delta+2;
    d4 = delta+3;
    v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
    v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
    v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
    v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
    return __vec4_i64(v1, v2, v3, v4);
}
static FORCEINLINE __vec4_i64 __shuffle_i64(__vec4_i64 v, __vec4_i32 index) {
    return __vec4_i64(__extract_element(v, __extract_element(index, 0) & 0x3),
                      __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1523,6 +1579,20 @@ static FORCEINLINE __vec4_f __rotate_float(__vec4_f v, int delta) {
                    __extract_element(v, (delta+3) & 0x3));
}
static FORCEINLINE __vec4_f __shift_float(__vec4_f v, int delta) {
    float v1, v2, v3, v4;
    int d1, d2, d3, d4;
    d1 = delta+0;
    d2 = delta+1;
    d3 = delta+2;
    d4 = delta+3;
    v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0.f;
    v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0.f;
    v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0.f;
    v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0.f;
    return __vec4_f(v1, v2, v3, v4);
}
static FORCEINLINE __vec4_f __shuffle_float(__vec4_f v, __vec4_i32 index) {
    return __vec4_f(__extract_element(v, __extract_element(index, 0) & 0x3),
                    __extract_element(v, __extract_element(index, 1) & 0x3),
@@ -1676,6 +1746,20 @@ static FORCEINLINE __vec4_d __rotate_double(__vec4_d v, int delta) {
                    __extract_element(v, (delta+3) & 0x3));
}
static FORCEINLINE __vec4_d __shift_double(__vec4_d v, int delta) {
    double v1, v2, v3, v4;
    int d1, d2, d3, d4;
    d1 = delta+0;
    d2 = delta+1;
    d3 = delta+2;
    d4 = delta+3;
    v1 = ((d1 >= 0) && (d1 < 4)) ? __extract_element(v, d1) : 0;
    v2 = ((d2 >= 0) && (d2 < 4)) ? __extract_element(v, d2) : 0;
    v3 = ((d3 >= 0) && (d3 < 4)) ? __extract_element(v, d3) : 0;
    v4 = ((d4 >= 0) && (d4 < 4)) ? __extract_element(v, d4) : 0;
    return __vec4_d(v1, v2, v3, v4);
}
static FORCEINLINE __vec4_d __shuffle_double(__vec4_d v, __vec4_i32 index) {
    return __vec4_d(__extract_element(v, __extract_element(index, 0) & 0x3),
                    __extract_element(v, __extract_element(index, 1) & 0x3),