added wc-timer for host code

This commit is contained in:
evghenii
2013-11-08 15:39:57 +01:00
parent eb8e1a2160
commit 87de3a2d06
7 changed files with 213 additions and 167 deletions

View File

@@ -739,7 +739,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
#endif #endif
} }
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */ #if 1 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p) template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
{ {
return _mm512_load_epi32(p); return _mm512_load_epi32(p);
@@ -1007,7 +1007,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
#endif #endif
} }
#if 0 /* knc::fails as with _i32 this may generate fails ... so commenting it out */ #if 1 /* knc::fails as with _i32 this may generate fails ... so commenting it out */
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p) template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
{ {
__m512i v2 = _mm512_load_epi32(p); __m512i v2 = _mm512_load_epi32(p);
@@ -1119,7 +1119,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
#endif #endif
} }
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */ #if 1 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p) template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
{ {
return _mm512_load_ps(p); return _mm512_load_ps(p);
@@ -1397,7 +1397,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
} }
#if 0 /* knc::fails as with _f this may generate fails ... so commenting it out */ #if 1 /* knc::fails as with _f this may generate fails ... so commenting it out */
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p) template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
{ {
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64)); return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));

View File

@@ -2,7 +2,7 @@
EXAMPLE=mandelbrot_tasks EXAMPLE=mandelbrot_tasks
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks.ispc ISPC_SRC=mandelbrot_tasks.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_IA_TARGETS=avx
ISPC_ARM_TARGETS=neon ISPC_ARM_TARGETS=neon
include ../common.mk include ../common.mk

View File

@@ -42,9 +42,24 @@
#include <algorithm> #include <algorithm>
#include <string.h> #include <string.h>
#include "../timing.h" #include "../timing.h"
#include "mandelbrot_tasks_ispc.h" #include "mandelbrot_ispc.h"
using namespace ispc; using namespace ispc;
#include <sys/time.h>
/**
 * Wall-clock timer for host code.
 *
 * Reads the current time of day via gettimeofday() and returns it as a
 * single double: whole seconds plus the microsecond field scaled by 1e-6.
 * Callers take the difference of two readings to get an elapsed time in
 * seconds.
 *
 * @return the current time in seconds; 0.0 if gettimeofday() fails and
 *         leaves the zero-initialised timeval untouched.
 */
double rtc(void)
{
    /* Zero-initialise so a failed call never yields an indeterminate value. */
    struct timeval now = { 0, 0 };

    /* The timezone argument is obsolete: POSIX leaves the behaviour
       unspecified when it is non-null, so always pass NULL. */
    gettimeofday(&now, NULL);

    return (double) now.tv_sec + 1.e-6 * ((double) now.tv_usec);
}
extern void mandelbrot_serial(float x0, float y0, float x1, float y1, extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
int width, int height, int maxIterations, int width, int height, int maxIterations,
int output[]); int output[]);
@@ -113,8 +128,9 @@ int main(int argc, char *argv[]) {
for (unsigned int i = 0; i < width * height; ++i) for (unsigned int i = 0; i < width * height; ++i)
buf[i] = 0; buf[i] = 0;
reset_and_start_timer(); reset_and_start_timer();
double t0 = rtc();
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
double dt = get_elapsed_mcycles(); double dt = rtc() - t0; //get_elapsed_mcycles();
minISPC = std::min(minISPC, dt); minISPC = std::min(minISPC, dt);
} }
@@ -127,6 +143,7 @@ int main(int argc, char *argv[]) {
// minimum time. // minimum time.
// //
double minSerial = 1e30; double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
// Clear out the buffer // Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i) for (unsigned int i = 0; i < width * height; ++i)
@@ -139,6 +156,7 @@ int main(int argc, char *argv[]) {
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm"); writePPM(buf, width, height, "mandelbrot-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);

View File

@@ -42,7 +42,7 @@
#include <algorithm> #include <algorithm>
#include <string.h> #include <string.h>
#include "../timing.h" #include "../timing.h"
#include "mandelbrot_tasks3d_ispc.h" #include "mandelbrot_ispc.h"
using namespace ispc; using namespace ispc;
extern void mandelbrot_serial(float x0, float y0, float x1, float y1, extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
@@ -127,6 +127,7 @@ int main(int argc, char *argv[]) {
// minimum time. // minimum time.
// //
double minSerial = 1e30; double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
// Clear out the buffer // Clear out the buffer
for (unsigned int i = 0; i < width * height; ++i) for (unsigned int i = 0; i < width * height; ++i)
@@ -139,6 +140,7 @@ int main(int argc, char *argv[]) {
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm"); writePPM(buf, width, height, "mandelbrot-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);

View File

@@ -45,6 +45,21 @@
#include "stencil_ispc.h" #include "stencil_ispc.h"
using namespace ispc; using namespace ispc;
#include <sys/time.h>
/**
 * Wall-clock timer for host code.
 *
 * Reads the current time of day via gettimeofday() and returns it as a
 * single double: whole seconds plus the microsecond field scaled by 1e-6.
 * Callers take the difference of two readings to get an elapsed time in
 * seconds.
 *
 * @return the current time in seconds; 0.0 if gettimeofday() fails and
 *         leaves the zero-initialised timeval untouched.
 */
double rtc(void)
{
    /* Zero-initialise so a failed call never yields an indeterminate value. */
    struct timeval now = { 0, 0 };

    /* The timezone argument is obsolete: POSIX leaves the behaviour
       unspecified when it is non-null, so always pass NULL. */
    gettimeofday(&now, NULL);

    return (double) now.tv_sec + 1.e-6 * ((double) now.tv_usec);
}
extern void loop_stencil_serial(int t0, int t1, int x0, int x1, extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
int y0, int y1, int z0, int z1, int y0, int y1, int z0, int z1,
@@ -56,6 +71,7 @@ extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) { void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
int offset = 0; int offset = 0;
#pragma omp parallel for collapse(2) private(offset)
for (int z = 0; z < Nz; ++z) for (int z = 0; z < Nz; ++z)
for (int y = 0; y < Ny; ++y) for (int y = 0; y < Ny; ++y)
for (int x = 0; x < Nx; ++x, ++offset) { for (int x = 0; x < Nx; ++x, ++offset) {
@@ -78,13 +94,14 @@ int main() {
double coeff[4] = { 0.5, -.25, .125, -.0625 }; double coeff[4] = { 0.5, -.25, .125, -.0625 };
InitData(Nx, Ny, Nz, Aispc, vsq); // InitData(Nx, Ny, Nz, Aispc, vsq);
// //
// Compute the image using the ispc implementation on one core; report // Compute the image using the ispc implementation on one core; report
// the minimum time of three runs. // the minimum time of three runs.
// //
double minTimeISPC = 1e30; double minTimeISPC = 1e30;
#if 0
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
reset_and_start_timer(); reset_and_start_timer();
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
@@ -95,8 +112,11 @@ int main() {
} }
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC); printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
#endif
fprintf(stderr, " -- init -- \n");
InitData(Nx, Ny, Nz, Aispc, vsq); InitData(Nx, Ny, Nz, Aispc, vsq);
fprintf(stderr, " -- done init -- \n");
// //
// Compute the image using the ispc implementation with tasks; report // Compute the image using the ispc implementation with tasks; report
@@ -105,14 +125,16 @@ int main() {
double minTimeISPCTasks = 1e30; double minTimeISPCTasks = 1e30;
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
reset_and_start_timer(); reset_and_start_timer();
const double t0 = rtc();
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
width, Nz - width, Nx, Ny, Nz, coeff, vsq, width, Nz - width, Nx, Ny, Nz, coeff, vsq,
Aispc[0], Aispc[1]); Aispc[0], Aispc[1]);
double dt = get_elapsed_mcycles(); double dt = rtc() - t0; //get_elapsed_mcycles();
minTimeISPCTasks = std::min(minTimeISPCTasks, dt); minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
} }
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks); fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
InitData(Nx, Ny, Nz, Aserial, vsq); InitData(Nx, Ny, Nz, Aserial, vsq);

View File

@@ -59,7 +59,9 @@
#define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
#define ISPC_USE_CILK #define ISPC_USE_CILK
*/
#define ISPC_USE_OMP #define ISPC_USE_OMP
/*
#define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_TASK_GROUP
#define ISPC_USE_TBB_PARALLEL_FOR #define ISPC_USE_TBB_PARALLEL_FOR
@@ -943,7 +945,7 @@ InitTaskSystem() {
inline void inline void
TaskGroup::Launch(int baseIndex, int count) { TaskGroup::Launch(int baseIndex, int count) {
#pragma omp parallel for #pragma omp parallel for schedule(dynamic)
for(int i = 0; i < count; i++) { for(int i = 0; i < count; i++) {
TaskInfo *ti = GetTaskInfo(baseIndex + i); TaskInfo *ti = GetTaskInfo(baseIndex + i);

View File

@@ -179,10 +179,10 @@ int main(int argc, char *argv[]) {
double dt = get_elapsed_mcycles(); double dt = get_elapsed_mcycles();
minISPC = std::min(minISPC, dt); minISPC = std::min(minISPC, dt);
} }
#endif
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC); printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
writePPM(image, width, height, "volume-ispc-1core.ppm"); writePPM(image, width, height, "volume-ispc-1core.ppm");
#endif
// Clear out the buffer // Clear out the buffer
for (int i = 0; i < width * height; ++i) for (int i = 0; i < width * height; ++i)
@@ -214,6 +214,7 @@ int main(int argc, char *argv[]) {
// minimum time. // minimum time.
// //
double minSerial = 1e30; double minSerial = 1e30;
#if 0
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
reset_and_start_timer(); reset_and_start_timer();
volume_serial(density, n, raster2camera, camera2world, volume_serial(density, n, raster2camera, camera2world,
@@ -224,6 +225,7 @@ int main(int argc, char *argv[]) {
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial); printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
writePPM(image, width, height, "volume-serial.ppm"); writePPM(image, width, height, "volume-serial.ppm");
#endif
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n", printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minSerial/minISPC, minSerial / minISPCtasks); minSerial/minISPC, minSerial / minISPCtasks);