added wc-timer for host code

This commit is contained in:
evghenii
2013-11-08 15:39:57 +01:00
parent eb8e1a2160
commit 87de3a2d06
7 changed files with 213 additions and 167 deletions

View File

@@ -739,7 +739,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
#endif
}
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
#if 1 /* knc::fails ./tests/foreach-25.ispc ./tests/forach-26.ispc ./tests/foreach-27.ispc */
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
{
return _mm512_load_epi32(p);
@@ -1007,7 +1007,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
#endif
}
#if 0 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
#if 1 /* knc::fails as with _i32 this may generate fails ... so commetining it out */
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
{
__m512i v2 = _mm512_load_epi32(p);
@@ -1119,7 +1119,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
#endif
}
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
#if 1 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
{
return _mm512_load_ps(p);
@@ -1397,7 +1397,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
}
#if 0 /* knc::fails as with _f this may generate fails ... so commetining it out */
#if 1 /* knc::fails as with _f this may generate fails ... so commetining it out */
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
{
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));

View File

@@ -2,7 +2,7 @@
EXAMPLE=mandelbrot_tasks
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
ISPC_IA_TARGETS=avx
ISPC_ARM_TARGETS=neon
include ../common.mk

View File

@@ -42,105 +42,123 @@
#include <algorithm>
#include <string.h>
#include "../timing.h"
#include "mandelbrot_tasks_ispc.h"
#include "mandelbrot_ispc.h"
using namespace ispc;
#include <sys/time.h>
double rtc(void)
{
struct timeval Tvalue;
double etime;
struct timezone dummy;
gettimeofday(&Tvalue,&dummy);
etime = (double) Tvalue.tv_sec +
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
int width, int height, int maxIterations,
int output[]);
int width, int height, int maxIterations,
int output[]);
/* Write a PPM image file with the image of the Mandelbrot set */
static void
writePPM(int *buf, int width, int height, const char *fn) {
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n");
fprintf(fp, "%d %d\n", width, height);
fprintf(fp, "255\n");
for (int i = 0; i < width*height; ++i) {
// Map the iteration count to colors by just alternating between
// two greys.
char c = (buf[i] & 0x1) ? 240 : 20;
for (int j = 0; j < 3; ++j)
fputc(c, fp);
}
fclose(fp);
printf("Wrote image file %s\n", fn);
FILE *fp = fopen(fn, "wb");
fprintf(fp, "P6\n");
fprintf(fp, "%d %d\n", width, height);
fprintf(fp, "255\n");
for (int i = 0; i < width*height; ++i) {
// Map the iteration count to colors by just alternating between
// two greys.
char c = (buf[i] & 0x1) ? 240 : 20;
for (int j = 0; j < 3; ++j)
fputc(c, fp);
}
fclose(fp);
printf("Wrote image file %s\n", fn);
}
static void usage() {
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
exit(1);
fprintf(stderr, "usage: mandelbrot [--scale=<factor>]\n");
exit(1);
}
int main(int argc, char *argv[]) {
unsigned int width = 1536;
unsigned int height = 1024;
float x0 = -2;
float x1 = 1;
float y0 = -1;
float y1 = 1;
unsigned int width = 1536;
unsigned int height = 1024;
float x0 = -2;
float x1 = 1;
float y0 = -1;
float y1 = 1;
if (argc == 1)
;
else if (argc == 2) {
if (strncmp(argv[1], "--scale=", 8) == 0) {
float scale = atof(argv[1] + 8);
if (scale == 0.f)
usage();
width *= scale;
height *= scale;
// round up to multiples of 16
width = (width + 0xf) & ~0xf;
height = (height + 0xf) & ~0xf;
}
else
usage();
}
else
if (argc == 1)
;
else if (argc == 2) {
if (strncmp(argv[1], "--scale=", 8) == 0) {
float scale = atof(argv[1] + 8);
if (scale == 0.f)
usage();
int maxIterations = 512;
int *buf = new int[width*height];
//
// Compute the image using the ispc implementation; report the minimum
// time of three runs.
//
double minISPC = 1e30;
for (int i = 0; i < 3; ++i) {
// Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i)
buf[i] = 0;
reset_and_start_timer();
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
double dt = get_elapsed_mcycles();
minISPC = std::min(minISPC, dt);
width *= scale;
height *= scale;
// round up to multiples of 16
width = (width + 0xf) & ~0xf;
height = (height + 0xf) & ~0xf;
}
else
usage();
}
else
usage();
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
int maxIterations = 512;
int *buf = new int[width*height];
//
// Compute the image using the ispc implementation; report the minimum
// time of three runs.
//
double minISPC = 1e30;
for (int i = 0; i < 3; ++i) {
// Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i)
buf[i] = 0;
reset_and_start_timer();
double t0 = rtc();
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
double dt = rtc() - t0; //get_elapsed_mcycles();
minISPC = std::min(minISPC, dt);
}
printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC);
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minSerial = 1e30;
for (int i = 0; i < 3; ++i) {
// Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i)
buf[i] = 0;
reset_and_start_timer();
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
double dt = get_elapsed_mcycles();
minSerial = std::min(minSerial, dt);
}
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) {
// Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i)
buf[i] = 0;
reset_and_start_timer();
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
double dt = get_elapsed_mcycles();
minSerial = std::min(minSerial, dt);
}
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm");
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
return 0;
return 0;
}

View File

@@ -42,7 +42,7 @@
#include <algorithm>
#include <string.h>
#include "../timing.h"
#include "mandelbrot_tasks3d_ispc.h"
#include "mandelbrot_ispc.h"
using namespace ispc;
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
@@ -127,6 +127,7 @@ int main(int argc, char *argv[]) {
// minimum time.
//
double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) {
// Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i)
@@ -139,6 +140,7 @@ int main(int argc, char *argv[]) {
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);

View File

@@ -45,107 +45,129 @@
#include "stencil_ispc.h"
using namespace ispc;
#include <sys/time.h>
double rtc(void)
{
struct timeval Tvalue;
double etime;
struct timezone dummy;
gettimeofday(&Tvalue,&dummy);
etime = (double) Tvalue.tv_sec +
1.e-6*((double) Tvalue.tv_usec);
return etime;
}
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
int y0, int y1, int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[5],
const double vsq[],
double Aeven[], double Aodd[]);
int y0, int y1, int z0, int z1,
int Nx, int Ny, int Nz,
const double coef[5],
const double vsq[],
double Aeven[], double Aodd[]);
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
A[1][offset] = 0;
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
}
int offset = 0;
#pragma omp parallel for collapse(2) private(offset)
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
A[0][offset] = (x < Nx / 2) ? x / double(Nx) : y / double(Ny);
A[1][offset] = 0;
vsq[offset] = x*y*z / double(Nx * Ny * Nz);
}
}
int main() {
int Nx = 256, Ny = 256, Nz = 256;
int width = 4;
double *Aserial[2], *Aispc[2];
Aserial[0] = new double [Nx * Ny * Nz];
Aserial[1] = new double [Nx * Ny * Nz];
Aispc[0] = new double [Nx * Ny * Nz];
Aispc[1] = new double [Nx * Ny * Nz];
double *vsq = new double [Nx * Ny * Nz];
int Nx = 256, Ny = 256, Nz = 256;
int width = 4;
double *Aserial[2], *Aispc[2];
Aserial[0] = new double [Nx * Ny * Nz];
Aserial[1] = new double [Nx * Ny * Nz];
Aispc[0] = new double [Nx * Ny * Nz];
Aispc[1] = new double [Nx * Ny * Nz];
double *vsq = new double [Nx * Ny * Nz];
double coeff[4] = { 0.5, -.25, .125, -.0625 };
double coeff[4] = { 0.5, -.25, .125, -.0625 };
InitData(Nx, Ny, Nz, Aispc, vsq);
// InitData(Nx, Ny, Nz, Aispc, vsq);
//
// Compute the image using the ispc implementation on one core; report
// the minimum time of three runs.
//
double minTimeISPC = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = get_elapsed_mcycles();
minTimeISPC = std::min(minTimeISPC, dt);
}
//
// Compute the image using the ispc implementation on one core; report
// the minimum time of three runs.
//
double minTimeISPC = 1e30;
#if 0
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = get_elapsed_mcycles();
minTimeISPC = std::min(minTimeISPC, dt);
}
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
#endif
InitData(Nx, Ny, Nz, Aispc, vsq);
fprintf(stderr, " -- init -- \n");
InitData(Nx, Ny, Nz, Aispc, vsq);
fprintf(stderr, " -- done init -- \n");
//
// Compute the image using the ispc implementation with tasks; report
// the minimum time of three runs.
//
double minTimeISPCTasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = get_elapsed_mcycles();
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
}
//
// Compute the image using the ispc implementation with tasks; report
// the minimum time of three runs.
//
double minTimeISPCTasks = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
const double t0 = rtc();
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]);
double dt = rtc() - t0; //get_elapsed_mcycles();
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
}
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
InitData(Nx, Ny, Nz, Aserial, vsq);
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minTimeSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aserial[0], Aserial[1]);
double dt = get_elapsed_mcycles();
minTimeSerial = std::min(minTimeSerial, dt);
}
InitData(Nx, Ny, Nz, Aserial, vsq);
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
//
// And run the serial implementation 3 times, again reporting the
// minimum time.
//
double minTimeSerial = 1e30;
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aserial[0], Aserial[1]);
double dt = get_elapsed_mcycles();
minTimeSerial = std::min(minTimeSerial, dt);
}
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
printf("[stencil serial]:\t\t[%.3f] million cycles\n", minTimeSerial);
// Check for agreement
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
Aserial[1][offset]);
if (error > 1e-4)
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
x, y, z, Aispc[1][offset], Aserial[1][offset]);
}
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
return 0;
// Check for agreement
int offset = 0;
for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) {
double error = fabsf((Aserial[1][offset] - Aispc[1][offset]) /
Aserial[1][offset]);
if (error > 1e-4)
printf("Error @ (%d,%d,%d): ispc = %f, serial = %f\n",
x, y, z, Aispc[1][offset], Aserial[1][offset]);
}
return 0;
}

View File

@@ -59,7 +59,9 @@
#define ISPC_USE_PTHREADS
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define ISPC_USE_CILK
*/
#define ISPC_USE_OMP
/*
#define ISPC_USE_TBB_TASK_GROUP
#define ISPC_USE_TBB_PARALLEL_FOR
@@ -943,7 +945,7 @@ InitTaskSystem() {
inline void
TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for
#pragma omp parallel for schedule(dynamic)
for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i);

View File

@@ -179,10 +179,10 @@ int main(int argc, char *argv[]) {
double dt = get_elapsed_mcycles();
minISPC = std::min(minISPC, dt);
}
#endif
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
writePPM(image, width, height, "volume-ispc-1core.ppm");
#endif
// Clear out the buffer
for (int i = 0; i < width * height; ++i)
@@ -214,6 +214,7 @@ int main(int argc, char *argv[]) {
// minimum time.
//
double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) {
reset_and_start_timer();
volume_serial(density, n, raster2camera, camera2world,
@@ -224,6 +225,7 @@ int main(int argc, char *argv[]) {
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(image, width, height, "volume-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minSerial/minISPC, minSerial / minISPCtasks);