added wc-timer for host code
This commit is contained in:
@@ -739,7 +739,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
|
#if 1 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
|
||||||
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
|
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
|
||||||
{
|
{
|
||||||
return _mm512_load_epi32(p);
|
return _mm512_load_epi32(p);
|
||||||
@@ -1007,7 +1007,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
|
#if 1 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
|
||||||
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
|
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
|
||||||
{
|
{
|
||||||
__m512i v2 = _mm512_load_epi32(p);
|
__m512i v2 = _mm512_load_epi32(p);
|
||||||
@@ -1119,7 +1119,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
|
#if 1 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
|
||||||
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
|
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
|
||||||
{
|
{
|
||||||
return _mm512_load_ps(p);
|
return _mm512_load_ps(p);
|
||||||
@@ -1397,7 +1397,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */
|
#if 1 /* knc::fails as with _f this may generate fails ... so commetining it out */
|
||||||
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
|
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
|
||||||
{
|
{
|
||||||
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
|
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
EXAMPLE=mandelbrot_tasks
|
EXAMPLE=mandelbrot_tasks
|
||||||
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
|
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
|
||||||
ISPC_SRC=mandelbrot_tasks.ispc
|
ISPC_SRC=mandelbrot_tasks.ispc
|
||||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
ISPC_IA_TARGETS=avx
|
||||||
ISPC_ARM_TARGETS=neon
|
ISPC_ARM_TARGETS=neon
|
||||||
|
|
||||||
include ../common.mk
|
include ../common.mk
|
||||||
|
|||||||
@@ -42,105 +42,123 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "mandelbrot_tasks_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
|
||||||
|
double rtc(void)
|
||||||
|
{
|
||||||
|
struct timeval Tvalue;
|
||||||
|
double etime;
|
||||||
|
struct timezone dummy;
|
||||||
|
|
||||||
|
gettimeofday(&Tvalue,&dummy);
|
||||||
|
etime = (double) Tvalue.tv_sec +
|
||||||
|
1.e-6*((double) Tvalue.tv_usec);
|
||||||
|
return etime;
|
||||||
|
}
|
||||||
|
|
||||||
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
||||||
int width, int height, int maxIterations,
|
int width, int height, int maxIterations,
|
||||||
int output[]);
|
int output[]);
|
||||||
|
|
||||||
/* Write a PPM image file with the image of the Mandelbrot set */
|
/* Write a PPM image file with the image of the Mandelbrot set */
|
||||||
static void
|
static void
|
||||||
writePPM(int *buf, int width, int height, const char *fn) {
|
writePPM(int *buf, int width, int height, const char *fn) {
|
||||||
FILE *fp = fopen(fn, "wb");
|
FILE *fp = fopen(fn, "wb");
|
||||||
fprintf(fp, "P6\n");
|
fprintf(fp, "P6\n");
|
||||||
fprintf(fp, "%d %d\n", width, height);
|
fprintf(fp, "%d %d\n", width, height);
|
||||||
fprintf(fp, "255\n");
|
fprintf(fp, "255\n");
|
||||||
for (int i = 0; i < width*height; ++i) {
|
for (int i = 0; i < width*height; ++i) {
|
||||||
// Map the iteration count to colors by just alternating between
|
// Map the iteration count to colors by just alternating between
|
||||||
// two greys.
|
// two greys.
|
||||||
char c = (buf[i] & 0x1) ? 240 : 20;
|
char c = (buf[i] & 0x1) ? 240 : 20;
|
||||||
for (int j = 0; j < 3; ++j)
|
for (int j = 0; j < 3; ++j)
|
||||||
fputc(c, fp);
|
fputc(c, fp);
|
||||||
}
|
}
|
||||||
fclose(fp);
|
fclose(fp);
|
||||||
printf("Wrote image file %s\n", fn);
|
printf("Wrote image file %s\n", fn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void usage() {
|
static void usage() {
|
||||||
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
unsigned int width = 1536;
|
unsigned int width = 1536;
|
||||||
unsigned int height = 1024;
|
unsigned int height = 1024;
|
||||||
float x0 = -2;
|
float x0 = -2;
|
||||||
float x1 = 1;
|
float x1 = 1;
|
||||||
float y0 = -1;
|
float y0 = -1;
|
||||||
float y1 = 1;
|
float y1 = 1;
|
||||||
|
|
||||||
if (argc == 1)
|
if (argc == 1)
|
||||||
;
|
;
|
||||||
else if (argc == 2) {
|
else if (argc == 2) {
|
||||||
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
if (strncmp(argv[1], "--scale=", 8) == 0) {
|
||||||
float scale = atof(argv[1] + 8);
|
float scale = atof(argv[1] + 8);
|
||||||
if (scale == 0.f)
|
if (scale == 0.f)
|
||||||
usage();
|
usage();
|
||||||
width *= scale;
|
width *= scale;
|
||||||
height *= scale;
|
height *= scale;
|
||||||
// round up to multiples of 16
|
// round up to multiples of 16
|
||||||
width = (width + 0xf) & ~0xf;
|
width = (width + 0xf) & ~0xf;
|
||||||
height = (height + 0xf) & ~0xf;
|
height = (height + 0xf) & ~0xf;
|
||||||
}
|
|
||||||
else
|
|
||||||
usage();
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
usage();
|
usage();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
usage();
|
||||||
|
|
||||||
int maxIterations = 512;
|
int maxIterations = 512;
|
||||||
int *buf = new int[width*height];
|
int *buf = new int[width*height];
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation; report the minimum
|
// Compute the image using the ispc implementation; report the minimum
|
||||||
// time of three runs.
|
// time of three runs.
|
||||||
//
|
//
|
||||||
double minISPC = 1e30;
|
double minISPC = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
// Clear out the buffer
|
// Clear out the buffer
|
||||||
for (unsigned int i = 0; i < width * height; ++i)
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
buf[i] = 0;
|
buf[i] = 0;
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
double t0 = rtc();
|
||||||
double dt = get_elapsed_mcycles();
|
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||||
minISPC = std::min(minISPC, dt);
|
double dt = rtc() - t0; //get_elapsed_mcycles();
|
||||||
}
|
minISPC = std::min(minISPC, dt);
|
||||||
|
}
|
||||||
|
|
||||||
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
|
||||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// And run the serial implementation 3 times, again reporting the
|
// And run the serial implementation 3 times, again reporting the
|
||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
#if 0
|
||||||
// Clear out the buffer
|
for (int i = 0; i < 3; ++i) {
|
||||||
for (unsigned int i = 0; i < width * height; ++i)
|
// Clear out the buffer
|
||||||
buf[i] = 0;
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
reset_and_start_timer();
|
buf[i] = 0;
|
||||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
reset_and_start_timer();
|
||||||
double dt = get_elapsed_mcycles();
|
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||||
minSerial = std::min(minSerial, dt);
|
double dt = get_elapsed_mcycles();
|
||||||
}
|
minSerial = std::min(minSerial, dt);
|
||||||
|
}
|
||||||
|
|
||||||
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||||
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,7 +42,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "mandelbrot_tasks3d_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
||||||
@@ -127,6 +127,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
// Clear out the buffer
|
// Clear out the buffer
|
||||||
for (unsigned int i = 0; i < width * height; ++i)
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
@@ -139,6 +140,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||||
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
||||||
|
|
||||||
|
|||||||
@@ -45,107 +45,129 @@
|
|||||||
#include "stencil_ispc.h"
|
#include "stencil_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
|
||||||
|
double rtc(void)
|
||||||
|
{
|
||||||
|
struct timeval Tvalue;
|
||||||
|
double etime;
|
||||||
|
struct timezone dummy;
|
||||||
|
|
||||||
|
gettimeofday(&Tvalue,&dummy);
|
||||||
|
etime = (double) Tvalue.tv_sec +
|
||||||
|
1.e-6*((double) Tvalue.tv_usec);
|
||||||
|
return etime;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||||
int y0, int y1, int z0, int z1,
|
int y0, int y1, int z0, int z1,
|
||||||
int Nx, int Ny, int Nz,
|
int Nx, int Ny, int Nz,
|
||||||
const double coef[5],
|
const double coef[5],
|
||||||
const double vsq[],
|
const double vsq[],
|
||||||
double Aeven[], double Aodd[]);
|
double Aeven[], double Aodd[]);
|
||||||
|
|
||||||
|
|
||||||
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for (int z = 0; z < Nz; ++z)
|
#pragma omp parallel for collapse(2) private(offset)
|
||||||
for (int y = 0; y < Ny; ++y)
|
for (int z = 0; z < Nz; ++z)
|
||||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
for (int y = 0; y < Ny; ++y)
|
||||||
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
|
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||||
A[1][offset] = 0;
|
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
|
||||||
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
|
A[1][offset] = 0;
|
||||||
}
|
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
int Nx = 256, Ny = 256, Nz = 256;
|
int Nx = 256, Ny = 256, Nz = 256;
|
||||||
int width = 4;
|
int width = 4;
|
||||||
double *Aserial[2], *Aispc[2];
|
double *Aserial[2], *Aispc[2];
|
||||||
Aserial[0] = new double [Nx * Ny * Nz];
|
Aserial[0] = new double [Nx * Ny * Nz];
|
||||||
Aserial[1] = new double [Nx * Ny * Nz];
|
Aserial[1] = new double [Nx * Ny * Nz];
|
||||||
Aispc[0] = new double [Nx * Ny * Nz];
|
Aispc[0] = new double [Nx * Ny * Nz];
|
||||||
Aispc[1] = new double [Nx * Ny * Nz];
|
Aispc[1] = new double [Nx * Ny * Nz];
|
||||||
double *vsq = new double [Nx * Ny * Nz];
|
double *vsq = new double [Nx * Ny * Nz];
|
||||||
|
|
||||||
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||||
|
|
||||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
// InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation on one core; report
|
// Compute the image using the ispc implementation on one core; report
|
||||||
// the minimum time of three runs.
|
// the minimum time of three runs.
|
||||||
//
|
//
|
||||||
double minTimeISPC = 1e30;
|
double minTimeISPC = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
#if 0
|
||||||
reset_and_start_timer();
|
for (int i = 0; i < 3; ++i) {
|
||||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
reset_and_start_timer();
|
||||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||||
Aispc[0], Aispc[1]);
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
double dt = get_elapsed_mcycles();
|
Aispc[0], Aispc[1]);
|
||||||
minTimeISPC = std::min(minTimeISPC, dt);
|
double dt = get_elapsed_mcycles();
|
||||||
}
|
minTimeISPC = std::min(minTimeISPC, dt);
|
||||||
|
}
|
||||||
|
|
||||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||||
|
#endif
|
||||||
|
|
||||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
fprintf(stderr, " -- init -- \n");
|
||||||
|
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
fprintf(stderr, " -- done init -- \n");
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation with tasks; report
|
// Compute the image using the ispc implementation with tasks; report
|
||||||
// the minimum time of three runs.
|
// the minimum time of three runs.
|
||||||
//
|
//
|
||||||
double minTimeISPCTasks = 1e30;
|
double minTimeISPCTasks = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
const double t0 = rtc();
|
||||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||||
Aispc[0], Aispc[1]);
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
double dt = get_elapsed_mcycles();
|
Aispc[0], Aispc[1]);
|
||||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
double dt = rtc() - t0; //get_elapsed_mcycles();
|
||||||
}
|
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||||
|
}
|
||||||
|
|
||||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||||
|
|
||||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
|
||||||
|
|
||||||
//
|
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||||
// And run the serial implementation 3 times, again reporting the
|
|
||||||
// minimum time.
|
|
||||||
//
|
|
||||||
double minTimeSerial = 1e30;
|
|
||||||
for (int i = 0; i < 3; ++i) {
|
|
||||||
reset_and_start_timer();
|
|
||||||
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
|
||||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
|
||||||
Aserial[0], Aserial[1]);
|
|
||||||
double dt = get_elapsed_mcycles();
|
|
||||||
minTimeSerial = std::min(minTimeSerial, dt);
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
//
|
||||||
|
// And run the serial implementation 3 times, again reporting the
|
||||||
|
// minimum time.
|
||||||
|
//
|
||||||
|
double minTimeSerial = 1e30;
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
reset_and_start_timer();
|
||||||
|
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
|
||||||
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
|
Aserial[0], Aserial[1]);
|
||||||
|
double dt = get_elapsed_mcycles();
|
||||||
|
minTimeSerial = std::min(minTimeSerial, dt);
|
||||||
|
}
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
|
||||||
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
|
||||||
|
|
||||||
// Check for agreement
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||||
int offset = 0;
|
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
|
||||||
for (int z = 0; z < Nz; ++z)
|
|
||||||
for (int y = 0; y < Ny; ++y)
|
|
||||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
|
||||||
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
|
||||||
Aserial[1][offset]);
|
|
||||||
if (error > 1e-4)
|
|
||||||
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
|
||||||
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
// Check for agreement
|
||||||
|
int offset = 0;
|
||||||
|
for (int z = 0; z < Nz; ++z)
|
||||||
|
for (int y = 0; y < Ny; ++y)
|
||||||
|
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||||
|
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
|
||||||
|
Aserial[1][offset]);
|
||||||
|
if (error > 1e-4)
|
||||||
|
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
|
||||||
|
x, y, z, Aispc[1][offset], Aserial[1][offset]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,7 +59,9 @@
|
|||||||
#define ISPC_USE_PTHREADS
|
#define ISPC_USE_PTHREADS
|
||||||
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||||
#define ISPC_USE_CILK
|
#define ISPC_USE_CILK
|
||||||
|
*/
|
||||||
#define ISPC_USE_OMP
|
#define ISPC_USE_OMP
|
||||||
|
/*
|
||||||
#define ISPC_USE_TBB_TASK_GROUP
|
#define ISPC_USE_TBB_TASK_GROUP
|
||||||
#define ISPC_USE_TBB_PARALLEL_FOR
|
#define ISPC_USE_TBB_PARALLEL_FOR
|
||||||
|
|
||||||
@@ -943,7 +945,7 @@ InitTaskSystem() {
|
|||||||
|
|
||||||
inline void
|
inline void
|
||||||
TaskGroup::Launch(int baseIndex, int count) {
|
TaskGroup::Launch(int baseIndex, int count) {
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for schedule(dynamic)
|
||||||
for(int i = 0; i < count; i++) {
|
for(int i = 0; i < count; i++) {
|
||||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||||
|
|
||||||
|
|||||||
@@ -179,10 +179,10 @@ int main(int argc, char *argv[]) {
|
|||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minISPC = std::min(minISPC, dt);
|
minISPC = std::min(minISPC, dt);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
||||||
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
// Clear out the buffer
|
// Clear out the buffer
|
||||||
for (int i = 0; i < width * height; ++i)
|
for (int i = 0; i < width * height; ++i)
|
||||||
@@ -214,6 +214,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
volume_serial(density, n, raster2camera, camera2world,
|
volume_serial(density, n, raster2camera, camera2world,
|
||||||
@@ -224,6 +225,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
|
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||||
writePPM(image, width, height, "volume-serial.ppm");
|
writePPM(image, width, height, "volume-serial.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||||
minSerial/minISPC, minSerial / minISPCtasks);
|
minSerial/minISPC, minSerial / minISPCtasks);
|
||||||
|
|||||||
Reference in New Issue
Block a user