Add support for coalescing memory accesses from gathers.
There are two related optimizations that happen now. (These currently apply only to gathers where the mask is known to be all on and that access 32-bit elements, but both of these restrictions may be relaxed in the future.) First, for any single gather, we are now more flexible in mapping it to individual memory operations. Previously, we would map it only to either a general gather (one scalar load per SIMD lane) or an unaligned vector load (if the program instances could be determined to be accessing a sequential set of locations in memory). Now, we are able to break gathers into scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads. Further, we now generate code that shuffles the results of these loads around. Doing fewer, larger loads in this manner, when possible, can be more efficient. Second, we can coalesce memory accesses across multiple gathers. If we have a series of gathers without any memory writes in the middle, then we try to analyze their reads collectively and choose an efficient set of loads for them. Not only does this help if different gathers reuse values from the same location in memory, but it's specifically helpful when data with AOS (array-of-structures) layout is being accessed; in this case, we're often able to generate wide vector loads and appropriate shuffles automatically.
This commit is contained in:
1
ispc.cpp
1
ispc.cpp
@@ -497,6 +497,7 @@ Opt::Opt() {
|
||||
disableMaskedStoreToStore = false;
|
||||
disableGatherScatterFlattening = false;
|
||||
disableUniformMemoryOptimizations = false;
|
||||
disableCoalescing = false;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
4
ispc.h
4
ispc.h
@@ -339,6 +339,10 @@ struct Opt {
|
||||
than gathers/scatters. This is likely only useful for measuring
|
||||
the impact of this optimization. */
|
||||
bool disableUniformMemoryOptimizations;
|
||||
|
||||
/** Disables optimizations that coalesce incoherent scalar memory
|
||||
access from gathers into wider vector operations, when possible. */
|
||||
bool disableCoalescing;
|
||||
};
|
||||
|
||||
/** @brief This structure collects together a number of global variables.
|
||||
|
||||
11
main.cpp
11
main.cpp
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -141,16 +141,17 @@ devUsage(int ret) {
|
||||
printf(" [--fuzz-test]\t\t\tRandomly perturb program input to test error conditions\n");
|
||||
printf(" [--fuzz-seed=<value>]\t\tSeed value for RNG for fuzz testing\n");
|
||||
printf(" [--opt=<option>]\t\t\tSet optimization option\n");
|
||||
printf(" disable-all-on-optimizations\n");
|
||||
printf(" disable-all-on-optimizations\t\tDisable optimizations that take advantage of \"all on\" mask\n");
|
||||
printf(" disable-blended-masked-stores\t\tScalarize masked stores on SSE (vs. using vblendps)\n");
|
||||
printf(" disable-blending-removal\t\tDisable eliminating blend at same scope\n");
|
||||
printf(" disable-coalescing\t\t\tDisable gather coalescing\n");
|
||||
printf(" disable-coherent-control-flow\t\tDisable coherent control flow optimizations\n");
|
||||
printf(" disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
|
||||
printf(" disable-gather-scatter-optimizations\tDisable improvements to gather/scatter\n");
|
||||
printf(" disable-handle-pseudo-memory-ops\n");
|
||||
printf(" disable-handle-pseudo-memory-ops\tLeave __pseudo_* calls for gather/scatter/etc. in final IR\n");
|
||||
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
|
||||
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
|
||||
printf(" [--yydebug]\t\t\tPrint debugging information during parsing\n");
|
||||
printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n");
|
||||
exit(ret);
|
||||
}
|
||||
|
||||
@@ -341,6 +342,8 @@ int main(int Argc, char *Argv[]) {
|
||||
// optimizations
|
||||
else if (!strcmp(opt, "disable-all-on-optimizations"))
|
||||
g->opt.disableMaskAllOnOptimizations = true;
|
||||
else if (!strcmp(opt, "disable-coalescing"))
|
||||
g->opt.disableCoalescing = true;
|
||||
else if (!strcmp(opt, "disable-handle-pseudo-memory-ops"))
|
||||
g->opt.disableHandlePseudoMemoryOps = true;
|
||||
else if (!strcmp(opt, "disable-blended-masked-stores"))
|
||||
|
||||
14
tests/coalesce-1.ispc
Normal file
14
tests/coalesce-1.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
RET[programIndex] = buf[64-programIndex];
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 64 - programIndex;
|
||||
}
|
||||
14
tests/coalesce-2.ispc
Normal file
14
tests/coalesce-2.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
RET[programIndex] = buf[programIndex & 1];
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = programIndex & 1;
|
||||
}
|
||||
14
tests/coalesce-3.ispc
Normal file
14
tests/coalesce-3.ispc
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
RET[programIndex] = buf[(programIndex >> 2) * 16 + (programIndex & 3)];
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programIndex >> 2) * 16 + (programIndex & 3);
|
||||
}
|
||||
17
tests/coalesce-4.ispc
Normal file
17
tests/coalesce-4.ispc
Normal file
@@ -0,0 +1,17 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
float a = buf[2*programIndex];
|
||||
float b = buf[2*programIndex+1];
|
||||
|
||||
RET[programIndex] = a+b;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 2 * programIndex + 2 * programIndex + 1;
|
||||
}
|
||||
20
tests/coalesce-5.ispc
Normal file
20
tests/coalesce-5.ispc
Normal file
@@ -0,0 +1,20 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
float a = buf[4*programIndex];
|
||||
float b = buf[4*programIndex+1];
|
||||
float c = buf[4*programIndex+2];
|
||||
float d = buf[4*programIndex+3];
|
||||
|
||||
RET[programIndex] = a+b+c+d;;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 4 * programIndex + 4 * programIndex + 1 +
|
||||
4 * programIndex + 2 + 4 * programIndex + 3;
|
||||
}
|
||||
21
tests/coalesce-6.ispc
Normal file
21
tests/coalesce-6.ispc
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
float a = buf[4*programIndex];
|
||||
float b = buf[4*programIndex+1];
|
||||
buf[4*programIndex+2] = 0;
|
||||
float c = buf[4*programIndex+2];
|
||||
float d = buf[4*programIndex+3];
|
||||
|
||||
RET[programIndex] = a+b+c+d;;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 4 * programIndex + 4 * programIndex + 1 +
|
||||
4 * programIndex + 3;
|
||||
}
|
||||
21
tests/coalesce-7.ispc
Normal file
21
tests/coalesce-7.ispc
Normal file
@@ -0,0 +1,21 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
float a = buf[4*programIndex];
|
||||
buf[4*programIndex+1] = 0;
|
||||
buf[4*programIndex+3] = 0;
|
||||
float b = buf[4*programIndex+1];
|
||||
float c = buf[4*programIndex+2];
|
||||
float d = buf[4*programIndex+3];
|
||||
|
||||
RET[programIndex] = a+b+c+d;;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 4 * programIndex + 4 * programIndex + 2;
|
||||
}
|
||||
19
tests/coalesce-8.ispc
Normal file
19
tests/coalesce-8.ispc
Normal file
@@ -0,0 +1,19 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float * uniform buf = uniform new uniform float[32*32];
|
||||
for (uniform int i = 0; i < 32*32; ++i)
|
||||
buf[i] = i;
|
||||
|
||||
int index = (programIndex < 4) ? (programIndex & 1) :
|
||||
(programIndex / 4);
|
||||
float a = buf[index];
|
||||
|
||||
RET[programIndex] = a;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = (programIndex < 4) ? (programIndex & 1) :
|
||||
(programIndex / 4);
|
||||
}
|
||||
Reference in New Issue
Block a user