added wc-timer for host code
This commit is contained in:
@@ -739,7 +739,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i32 *p, __vec16_i32
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
|
#if 1 /* knc::fails ./tests/foreach-25.ispc ./tests/foreach-26.ispc ./tests/foreach-27.ispc */
|
||||||
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
|
template <> static FORCEINLINE __vec16_i32 __load<64>(const __vec16_i32 *p)
|
||||||
{
|
{
|
||||||
return _mm512_load_epi32(p);
|
return _mm512_load_epi32(p);
|
||||||
@@ -1007,7 +1007,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_i64 *p, __vec16_i64
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* knc::fails as with _i32 this may generate fails ... so commenting it out */
|
#if 1 /* knc::fails as with _i32 this may generate fails ... so commenting it out */
|
||||||
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
|
template <> static FORCEINLINE __vec16_i64 __load<64>(const __vec16_i64 *p)
|
||||||
{
|
{
|
||||||
__m512i v2 = _mm512_load_epi32(p);
|
__m512i v2 = _mm512_load_epi32(p);
|
||||||
@@ -1119,7 +1119,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_f *p, __vec16_f v)
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
|
#if 1 /* knc::fails ./tests/gs-improve-progindex.ispc with segfault */
|
||||||
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
|
template <> static FORCEINLINE __vec16_f __load<64>(const __vec16_f *p)
|
||||||
{
|
{
|
||||||
return _mm512_load_ps(p);
|
return _mm512_load_ps(p);
|
||||||
@@ -1397,7 +1397,7 @@ template <int ALIGN> static FORCEINLINE void __store(__vec16_d *p, __vec16_d v)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#if 0 /* knc::fails as with _f this may generate fails ... so commenting it out */
|
#if 1 /* knc::fails as with _f this may generate fails ... so commenting it out */
|
||||||
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
|
template <> static FORCEINLINE __vec16_d __load<64>(const __vec16_d *p)
|
||||||
{
|
{
|
||||||
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
|
return __vec16_d(_mm512_load_pd(p), _mm512_load_pd(((uint8_t*)p)+64));
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
EXAMPLE=mandelbrot_tasks
|
EXAMPLE=mandelbrot_tasks
|
||||||
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
|
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
|
||||||
ISPC_SRC=mandelbrot_tasks.ispc
|
ISPC_SRC=mandelbrot_tasks.ispc
|
||||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
ISPC_IA_TARGETS=avx
|
||||||
ISPC_ARM_TARGETS=neon
|
ISPC_ARM_TARGETS=neon
|
||||||
|
|
||||||
include ../common.mk
|
include ../common.mk
|
||||||
|
|||||||
@@ -42,9 +42,24 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "mandelbrot_tasks_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Return the current wall-clock time in seconds as a double, combining the
 * whole seconds and the microsecond field reported by gettimeofday().
 *
 * Intended for interval timing: sample once before and once after the
 * region of interest and subtract the two values. Because the value comes
 * from gettimeofday(), it follows the system clock and can jump if that
 * clock is adjusted while a measurement is in flight.
 */
double rtc(void)
{
    struct timeval now;

    // The timezone argument of gettimeofday() is obsolete; pass NULL instead
    // of filling a throw-away struct timezone.
    gettimeofday(&now, NULL);

    return static_cast<double>(now.tv_sec) +
           1.e-6 * static_cast<double>(now.tv_usec);
}
|
||||||
|
|
||||||
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
||||||
int width, int height, int maxIterations,
|
int width, int height, int maxIterations,
|
||||||
int output[]);
|
int output[]);
|
||||||
@@ -113,8 +128,9 @@ int main(int argc, char *argv[]) {
|
|||||||
for (unsigned int i = 0; i < width * height; ++i)
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
buf[i] = 0;
|
buf[i] = 0;
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
|
double t0 = rtc();
|
||||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = rtc() - t0; //get_elapsed_mcycles();
|
||||||
minISPC = std::min(minISPC, dt);
|
minISPC = std::min(minISPC, dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,6 +143,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
// Clear out the buffer
|
// Clear out the buffer
|
||||||
for (unsigned int i = 0; i < width * height; ++i)
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
@@ -139,6 +156,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||||
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
||||||
|
|
||||||
|
|||||||
@@ -42,7 +42,7 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "../timing.h"
|
#include "../timing.h"
|
||||||
#include "mandelbrot_tasks3d_ispc.h"
|
#include "mandelbrot_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
||||||
@@ -127,6 +127,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
// Clear out the buffer
|
// Clear out the buffer
|
||||||
for (unsigned int i = 0; i < width * height; ++i)
|
for (unsigned int i = 0; i < width * height; ++i)
|
||||||
@@ -139,6 +140,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||||
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
|
||||||
|
|
||||||
|
|||||||
@@ -45,6 +45,21 @@
|
|||||||
#include "stencil_ispc.h"
|
#include "stencil_ispc.h"
|
||||||
using namespace ispc;
|
using namespace ispc;
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Return the current wall-clock time in seconds as a double, combining the
 * whole seconds and the microsecond field reported by gettimeofday().
 *
 * Used to time a region by sampling before and after it and subtracting.
 * The value follows the system clock (gettimeofday()), so it can jump if
 * that clock is adjusted during a measurement.
 */
double rtc(void)
{
    struct timeval now;

    // gettimeofday()'s timezone argument is obsolete; NULL is the documented
    // value to pass, so no dummy struct timezone is needed.
    gettimeofday(&now, NULL);

    const double seconds = static_cast<double>(now.tv_sec);
    const double micros  = static_cast<double>(now.tv_usec);
    return seconds + 1.e-6 * micros;
}
|
||||||
|
|
||||||
|
|
||||||
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
||||||
int y0, int y1, int z0, int z1,
|
int y0, int y1, int z0, int z1,
|
||||||
@@ -56,6 +71,7 @@ extern void loop_stencil_serial(int t0, int t1, int x0, int x1,
|
|||||||
|
|
||||||
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
void InitData(int Nx, int Ny, int Nz, double *A[2], double *vsq) {
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
|
#pragma omp parallel for collapse(2) private(offset)
|
||||||
for (int z = 0; z < Nz; ++z)
|
for (int z = 0; z < Nz; ++z)
|
||||||
for (int y = 0; y < Ny; ++y)
|
for (int y = 0; y < Ny; ++y)
|
||||||
for (int x = 0; x < Nx; ++x, ++offset) {
|
for (int x = 0; x < Nx; ++x, ++offset) {
|
||||||
@@ -78,13 +94,14 @@ int main() {
|
|||||||
|
|
||||||
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
double coeff[4] = { 0.5, -.25, .125, -.0625 };
|
||||||
|
|
||||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
// InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation on one core; report
|
// Compute the image using the ispc implementation on one core; report
|
||||||
// the minimum time of three runs.
|
// the minimum time of three runs.
|
||||||
//
|
//
|
||||||
double minTimeISPC = 1e30;
|
double minTimeISPC = 1e30;
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width,
|
||||||
@@ -95,8 +112,11 @@ int main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
printf("[stencil ispc 1 core]:\t\t[%.3f] million cycles\n", minTimeISPC);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " -- init -- \n");
|
||||||
InitData(Nx, Ny, Nz, Aispc, vsq);
|
InitData(Nx, Ny, Nz, Aispc, vsq);
|
||||||
|
fprintf(stderr, " -- done init -- \n");
|
||||||
|
|
||||||
//
|
//
|
||||||
// Compute the image using the ispc implementation with tasks; report
|
// Compute the image using the ispc implementation with tasks; report
|
||||||
@@ -105,14 +125,16 @@ int main() {
|
|||||||
double minTimeISPCTasks = 1e30;
|
double minTimeISPCTasks = 1e30;
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
|
const double t0 = rtc();
|
||||||
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width,
|
||||||
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
width, Nz - width, Nx, Ny, Nz, coeff, vsq,
|
||||||
Aispc[0], Aispc[1]);
|
Aispc[0], Aispc[1]);
|
||||||
double dt = get_elapsed_mcycles();
|
double dt = rtc() - t0; //get_elapsed_mcycles();
|
||||||
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
minTimeISPCTasks = std::min(minTimeISPCTasks, dt);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
fprintf(stderr, "[stencil ispc + tasks]:\t\t[%.3f] million cycles\n", minTimeISPCTasks);
|
||||||
|
|
||||||
|
|
||||||
InitData(Nx, Ny, Nz, Aserial, vsq);
|
InitData(Nx, Ny, Nz, Aserial, vsq);
|
||||||
|
|
||||||
|
|||||||
@@ -59,7 +59,9 @@
|
|||||||
#define ISPC_USE_PTHREADS
|
#define ISPC_USE_PTHREADS
|
||||||
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
#define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED
|
||||||
#define ISPC_USE_CILK
|
#define ISPC_USE_CILK
|
||||||
|
*/
|
||||||
#define ISPC_USE_OMP
|
#define ISPC_USE_OMP
|
||||||
|
/*
|
||||||
#define ISPC_USE_TBB_TASK_GROUP
|
#define ISPC_USE_TBB_TASK_GROUP
|
||||||
#define ISPC_USE_TBB_PARALLEL_FOR
|
#define ISPC_USE_TBB_PARALLEL_FOR
|
||||||
|
|
||||||
@@ -943,7 +945,7 @@ InitTaskSystem() {
|
|||||||
|
|
||||||
inline void
|
inline void
|
||||||
TaskGroup::Launch(int baseIndex, int count) {
|
TaskGroup::Launch(int baseIndex, int count) {
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for schedule(dynamic)
|
||||||
for(int i = 0; i < count; i++) {
|
for(int i = 0; i < count; i++) {
|
||||||
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
TaskInfo *ti = GetTaskInfo(baseIndex + i);
|
||||||
|
|
||||||
|
|||||||
@@ -179,10 +179,10 @@ int main(int argc, char *argv[]) {
|
|||||||
double dt = get_elapsed_mcycles();
|
double dt = get_elapsed_mcycles();
|
||||||
minISPC = std::min(minISPC, dt);
|
minISPC = std::min(minISPC, dt);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
printf("[volume ispc 1 core]:\t\t[%.3f] million cycles\n", minISPC);
|
||||||
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
writePPM(image, width, height, "volume-ispc-1core.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
// Clear out the buffer
|
// Clear out the buffer
|
||||||
for (int i = 0; i < width * height; ++i)
|
for (int i = 0; i < width * height; ++i)
|
||||||
@@ -214,6 +214,7 @@ int main(int argc, char *argv[]) {
|
|||||||
// minimum time.
|
// minimum time.
|
||||||
//
|
//
|
||||||
double minSerial = 1e30;
|
double minSerial = 1e30;
|
||||||
|
#if 0
|
||||||
for (int i = 0; i < 3; ++i) {
|
for (int i = 0; i < 3; ++i) {
|
||||||
reset_and_start_timer();
|
reset_and_start_timer();
|
||||||
volume_serial(density, n, raster2camera, camera2world,
|
volume_serial(density, n, raster2camera, camera2world,
|
||||||
@@ -224,6 +225,7 @@ int main(int argc, char *argv[]) {
|
|||||||
|
|
||||||
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
|
printf("[volume serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||||
writePPM(image, width, height, "volume-serial.ppm");
|
writePPM(image, width, height, "volume-serial.ppm");
|
||||||
|
#endif
|
||||||
|
|
||||||
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
|
||||||
minSerial/minISPC, minSerial / minISPCtasks);
|
minSerial/minISPC, minSerial / minISPCtasks);
|
||||||
|
|||||||
Reference in New Issue
Block a user