added wc-timer for host code

This commit is contained in:
evghenii
2013-11-08 15:39:57 +01:00
parent eb8e1a2160
commit 87de3a2d06
7 changed files with 213 additions and 167 deletions

View File

@@ -739,7 +739,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
#endif #endif
} }
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */ #if 1 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
{ {
return _mm512_load_epi32(p); return _mm512_load_epi32(p);
@@ -1007,7 +1007,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
#endif #endif
} }
#if 0 /* knc::fails as with _i32 this may generate fails ... so commenting it out */ #if 1 /* knc::fails as with _i32 this may generate fails ... so commenting it out */
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
{ {
__m512i v2 = _mm512_load_epi32(p); __m512i v2 = _mm512_load_epi32(p);
@@ -1119,7 +1119,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
#endif #endif
} }
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ #if 1 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
{ {
return _mm512_load_ps(p); return _mm512_load_ps(p);
@@ -1397,7 +1397,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
} }
#if 0 /* knc::fails as with _f this may generate fails ... so commenting it out */ #if 1 /* knc::fails as with _f this may generate fails ... so commenting it out */
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
{ {
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));

View File

@@ -2,7 +2,7 @@
EXAMPLE=mandelbrot_tasks EXAMPLE=mandelbrot_tasks
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks.ispc ISPC_SRC=mandelbrot_tasks.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=avx
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -42,105 +42,123 @@
#include <algorithm> #include <algorithm>
#include <string.h> #include <string.h>
#include "../timing.h" #include "../timing.h"
#include "mandelbrot_tasks_ispc.h" #include "mandelbrot_ispc.h"
using namespace ispc; using namespace ispc;
#include <sys/time.h>
/* Return the current time of day in seconds, as a double, built from the
   seconds and microseconds fields reported by gettimeofday().  Subtract
   two readings to obtain an elapsed wall-clock interval in seconds. */
double rtc(void)
{
    struct timeval now;
    /* The timezone argument is obsolete: POSIX leaves the behavior
       unspecified when it is non-null, so pass NULL instead of a dummy
       struct timezone. */
    gettimeofday(&now, NULL);
    return (double) now.tv_sec + 1.e-6 * (double) now.tv_usec;
}
extern void mandelbrot_serial(float x0, float y0, float x1, float y1, extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
int width, int height, int maxIterations, int width, int height, int maxIterations,
int output[]); int output[]);
/* Write a PPM image file with the image of the Mandelbrot set */ /* Write a PPM image file with the image of the Mandelbrot set */
static void static void
writePPM(int *buf, int width, int height, const char *fn) { writePPM(int *buf, int width, int height, const char *fn) {
FILE *fp = fopen(fn, "wb"); FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n"); fprintf(fp, "P6\n");
fprintf(fp, "%d %d\n", width, height); fprintf(fp, "%d %d\n", width, height);
fprintf(fp, "255\n"); fprintf(fp, "255\n");
for (int i = 0; i < width*height; ++i) { for (int i = 0; i < width*height; ++i) {
// Map the iteration count to colors by just alternating between // Map the iteration count to colors by just alternating between
// two greys. // two greys.
char c = (buf[i] & 0x1) ? 240 : 20; char c = (buf[i] & 0x1) ? 240 : 20;
for (int j = 0; j < 3; ++j) for (int j = 0; j < 3; ++j)
fputc(c, fp); fputc(c, fp);
} }
fclose(fp); fclose(fp);
printf("Wrote image file %s\n", fn); printf("Wrote image file %s\n", fn);
} }
static void usage() { static void usage() {
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n"); fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
exit(1); exit(1);
} }
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
unsigned int width = 1536; unsigned int width = 1536;
unsigned int height = 1024; unsigned int height = 1024;
float x0 = -2; float x0 = -2;
float x1 = 1; float x1 = 1;
float y0 = -1; float y0 = -1;
float y1 = 1; float y1 = 1;
if (argc == 1) if (argc == 1)
; ;
else if (argc == 2) { else if (argc == 2) {
if (strncmp(argv[1], "--scale=", 8) == 0) { if (strncmp(argv[1], "--scale=", 8) == 0) {
float scale = atof(argv[1] + 8); float scale = atof(argv[1] + 8);
if (scale == 0.f) if (scale == 0.f)
usage(); usage();
width *= scale; width *= scale;
height *= scale; height *= scale;
// round up to multiples of 16 // round up to multiples of 16
width = (width + 0xf) & ~0xf; width = (width + 0xf) & ~0xf;
height = (height + 0xf) & ~0xf; height = (height + 0xf) & ~0xf;
}
else
usage();
} }
else else
usage(); usage();
}
else
usage();
int maxIterations = 512; int maxIterations = 512;
int *buf = new int[width*height]; int *buf = new int[width*height];
// //
// Compute the image using the ispc implementation; report the minimum // Compute the image using the ispc implementation; report the minimum
// time of three runs. // time of three runs.
// //
double minISPC = 1e30; double minISPC = 1e30;
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
// Clear out the buffer // Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i) for (unsigned int i = 0; i < width * height; ++i)
buf[i] = 0; buf[i] = 0;
reset_and_start_timer(); reset_and_start_timer();
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); double t0 = rtc();
double dt = get_elapsed_mcycles(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
minISPC = std::min(minISPC, dt); double dt = rtc() - t0; //get_elapsed_mcycles();
} minISPC = std::min(minISPC, dt);
}
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
writePPM(buf, width, height, "mandelbrot-ispc.ppm"); writePPM(buf, width, height, "mandelbrot-ispc.ppm");
// //
// And run the serial implementation 3 times, again reporting the // And run the serial implementation 3 times, again reporting the
// minimum time. // minimum time.
// //
double minSerial = 1e30; double minSerial = 1e30;
for (int i = 0; i < 3; ++i) { #if 0
// Clear out the buffer for (int i = 0; i < 3; ++i) {
for (unsigned int i = 0; i < width * height; ++i) // Clear out the buffer
buf[i] = 0; for (unsigned int i = 0; i < width * height; ++i)
reset_and_start_timer(); buf[i] = 0;
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); reset_and_start_timer();
double dt = get_elapsed_mcycles(); mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
minSerial = std::min(minSerial, dt); double dt = get_elapsed_mcycles();
} minSerial = std::min(minSerial, dt);
}
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm"); writePPM(buf, width, height, "mandelbrot-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
return 0; return 0;
} }

View File

@@ -42,7 +42,7 @@
#include <algorithm> #include <algorithm>
#include <string.h> #include <string.h>
#include "../timing.h" #include "../timing.h"
#include "mandelbrot_tasks3d_ispc.h" #include "mandelbrot_ispc.h"
using namespace ispc; using namespace ispc;
extern void mandelbrot_serial(float x0, float y0, float x1, float y1, extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
@@ -127,6 +127,7 @@ int main(int argc, char *argv[]) {
// minimum time. // minimum time.
// //
double minSerial = 1e30; double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
// Clear out the buffer // Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i) for (unsigned int i = 0; i < width * height; ++i)
@@ -139,6 +140,7 @@ int main(int argc, char *argv[]) {
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm"); writePPM(buf, width, height, "mandelbrot-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);

View File

@@ -45,107 +45,129 @@
#include "stencil_ispc.h" #include "stencil_ispc.h"
using namespace ispc; using namespace ispc;
#include <sys/time.h>
/* Return the current time of day in seconds, as a double, built from the
   seconds and microseconds fields reported by gettimeofday().  Subtract
   two readings to obtain an elapsed wall-clock interval in seconds. */
double rtc(void)
{
    struct timeval now;
    /* The timezone argument is obsolete: POSIX leaves the behavior
       unspecified when it is non-null, so pass NULL instead of a dummy
       struct timezone. */
    gettimeofday(&now, NULL);
    return (double) now.tv_sec + 1.e-6 * (double) now.tv_usec;
}
extern void loop_stencil_serial(int t0, int t1, int x0, int x1, extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
int y0, int y1, int z0, int z1, int y0, int y1, int z0, int z1,
int Nx, int Ny, int Nz, int Nx, int Ny, int Nz,
const double coef[5], const double coef[5],
const double vsq[], const double vsq[],
double Aeven[], double Aodd[]); double Aeven[], double Aodd[]);
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
int offset = 0; int offset = 0;
for (int z = 0; z < Nz; ++z) #pragma omp parallel for collapse(2) private(offset)
for (int y = 0; y < Ny; ++y) for (int z = 0; z < Nz; ++z)
for (int x = 0; x < Nx; ++x, ++offset) { for (int y = 0; y < Ny; ++y)
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny); for (int x = 0; x < Nx; ++x, ++offset) {
A[1][offset] = 0; A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
vsq[offset] = x*y*z / double(Nx * Ny * Nz); A[1][offset] = 0;
} vsq[offset] = x*y*z / double(Nx * Ny * Nz);
}
} }
int main() { int main() {
int Nx = 256, Ny = 256, Nz = 256; int Nx = 256, Ny = 256, Nz = 256;
int width = 4; int width = 4;
double *Aserial[2], *Aispc[2]; double *Aserial[2], *Aispc[2];
Aserial[0] = new double [Nx * Ny * Nz]; Aserial[0] = new double [Nx * Ny * Nz];
Aserial[1] = new double [Nx * Ny * Nz]; Aserial[1] = new double [Nx * Ny * Nz];
Aispc[0] = new double [Nx * Ny * Nz]; Aispc[0] = new double [Nx * Ny * Nz];
Aispc[1] = new double [Nx * Ny * Nz]; Aispc[1] = new double [Nx * Ny * Nz];
double *vsq = new double [Nx * Ny * Nz]; double *vsq = new double [Nx * Ny * Nz];
double coeff[4] = { 0.5, -.25, .125, -.0625 }; double coeff[4] = { 0.5, -.25, .125, -.0625 };
InitData(Nx, Ny, Nz, Aispc, vsq); // InitData(Nx, Ny, Nz, Aispc, vsq);
// //
// Compute the image using the ispc implementation on one core; report // Compute the image using the ispc implementation on one core; report
// the minimum time of three runs. // the minimum time of three runs.
// //
double minTimeISPC = 1e30; double minTimeISPC = 1e30;
for (int i = 0; i < 3; ++i) { #if 0
reset_and_start_timer(); for (int i = 0; i < 3; ++i) {
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, reset_and_start_timer();
width, Nz - width, Nx, Ny, Nz, coeff, vsq, loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
Aispc[0], Aispc[1]); width, Nz - width, Nx, Ny, Nz, coeff, vsq,
double dt = get_elapsed_mcycles(); Aispc[0], Aispc[1]);
minTimeISPC = std::min(minTimeISPC, dt); double dt = get_elapsed_mcycles();
} minTimeISPC = std::min(minTimeISPC, dt);
}
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
#endif
InitData(Nx, Ny, Nz, Aispc, vsq); fprintf(stderr, " -- init -- \n");
InitData(Nx, Ny, Nz, Aispc, vsq);
fprintf(stderr, " -- done init -- \n");
// //
// Compute the image using the ispc implementation with tasks; report // Compute the image using the ispc implementation with tasks; report
// the minimum time of three runs. // the minimum time of three runs.
// //
double minTimeISPCTasks = 1e30; double minTimeISPCTasks = 1e30;
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
reset_and_start_timer(); reset_and_start_timer();
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, const double t0 = rtc();
width, Nz - width, Nx, Ny, Nz, coeff, vsq, loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
Aispc[0], Aispc[1]); width, Nz - width, Nx, Ny, Nz, coeff, vsq,
double dt = get_elapsed_mcycles(); Aispc[0], Aispc[1]);
minTimeISPCTasks = std::min(minTimeISPCTasks, dt); double dt = rtc() - t0; //get_elapsed_mcycles();
} minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
}
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
InitData(Nx, Ny, Nz, Aserial, vsq);
// InitData(Nx, Ny, Nz, Aserial, vsq);
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minTimeSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aserial[0], Aserial[1]);
double dt = get_elapsed_mcycles();
minTimeSerial = std::min(minTimeSerial, dt);
}
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial); //
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minTimeSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aserial[0], Aserial[1]);
double dt = get_elapsed_mcycles();
minTimeSerial = std::min(minTimeSerial, dt);
}
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
// Check for agreement printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
int offset = 0; minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
Aserial[1][offset]);
if (error > 1e-4)
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
x, y, z, Aispc[1][offset], Aserial[1][offset]);
}
return 0; // Check for agreement
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
Aserial[1][offset]);
if (error > 1e-4)
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
x, y, z, Aispc[1][offset], Aserial[1][offset]);
}
return 0;
} }

View File

@@ -59,7 +59,9 @@
#define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define ISPC_USE_CILK #define ISPC_USE_CILK
*/
#define ISPC_USE_OMP #define ISPC_USE_OMP
/*
#define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_TASK_GROUP
#define ISPC_USE_TBB_PARALLEL_FOR #define ISPC_USE_TBB_PARALLEL_FOR
@@ -943,7 +945,7 @@ InitTaskSystem() {
inline void inline void
TaskGroup::Launch(int baseIndex, int count) { TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for #pragma omp parallel for schedule(dynamic)
for(int i = 0; i < count; i++) { for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i); TaskInfo *ti = GetTaskInfo(baseIndex + i);

View File

@@ -179,10 +179,10 @@ int main(int argc, char *argv[]) {
double dt = get_elapsed_mcycles(); double dt = get_elapsed_mcycles();
minISPC = std::min(minISPC, dt); minISPC = std::min(minISPC, dt);
} }
#endif
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC); printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
writePPM(image, width, height, "volume-ispc-1core.ppm"); writePPM(image, width, height, "volume-ispc-1core.ppm");
#endif
// Clear out the buffer // Clear out the buffer
for (int i = 0; i < width * height; ++i) for (int i = 0; i < width * height; ++i)
@@ -214,6 +214,7 @@ int main(int argc, char *argv[]) {
// minimum time. // minimum time.
// //
double minSerial = 1e30; double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
reset_and_start_timer(); reset_and_start_timer();
volume_serial(density, n, raster2camera, camera2world, volume_serial(density, n, raster2camera, camera2world,
@@ -224,6 +225,7 @@ int main(int argc, char *argv[]) {
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(image, width, height, "volume-serial.ppm"); writePPM(image, width, height, "volume-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minSerial/minISPC, minSerial / minISPCtasks); minSerial/minISPC, minSerial / minISPCtasks);