Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
1633d50b34
|
|||
|
b8453b4a3a
|
|||
| 1eb64a13e1 | |||
| 2921430e45 | |||
| 34d26554bf | |||
| 5c0911c2a8 | |||
| f513e085ea | |||
| a47cab4dfa |
2
func.cpp
2
func.cpp
@@ -635,7 +635,7 @@ Function::GenerateIR() {
|
|||||||
const bool
|
const bool
|
||||||
Function::IsPolyFunction() const {
|
Function::IsPolyFunction() const {
|
||||||
for (size_t i = 0; i < args.size(); i++) {
|
for (size_t i = 0; i < args.size(); i++) {
|
||||||
if (args[i]->type->IsPolymorphicType()) {
|
if (args[i] && args[i]->type->IsPolymorphicType()) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
12
module.cpp
12
module.cpp
@@ -1061,8 +1061,6 @@ Module::AddFunctionDeclaration(const std::string &name,
|
|||||||
|
|
||||||
const Type *ret = eft->GetReturnType();
|
const Type *ret = eft->GetReturnType();
|
||||||
if (Type::EqualForReplacement(ret, pt)) {
|
if (Type::EqualForReplacement(ret, pt)) {
|
||||||
printf("Replaced return type %s\n",
|
|
||||||
ret->GetString().c_str());
|
|
||||||
ret = PolyType::ReplaceType(ret, *te);
|
ret = PolyType::ReplaceType(ret, *te);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1998,11 +1996,13 @@ lPrintPolyFunctionWrappers(FILE *file, const std::vector<std::string> &funcs) {
|
|||||||
for (size_t j=0; j<poly.size(); j++) {
|
for (size_t j=0; j<poly.size(); j++) {
|
||||||
const FunctionType *ftype = CastType<FunctionType>(poly[j]->type);
|
const FunctionType *ftype = CastType<FunctionType>(poly[j]->type);
|
||||||
Assert(ftype);
|
Assert(ftype);
|
||||||
std::string decl = ftype->GetCDeclaration(funcs[i]);
|
if (ftype->isExported || ftype->isExternC) {
|
||||||
fprintf(file, " %s {\n", decl.c_str());
|
std::string decl = ftype->GetCDeclaration(funcs[i]);
|
||||||
|
fprintf(file, " %s {\n", decl.c_str());
|
||||||
|
|
||||||
std::string call = ftype->GetCCall(poly[j]->name);
|
std::string call = ftype->GetCCall(poly[j]->name);
|
||||||
fprintf(file, " return %s;\n }\n", call.c_str());
|
fprintf(file, " return %s;\n }\n", call.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
2
parse.yy
2
parse.yy
@@ -37,7 +37,7 @@
|
|||||||
/* one for 'if', one for 'cif' */
|
/* one for 'if', one for 'cif' */
|
||||||
%expect 2
|
%expect 2
|
||||||
|
|
||||||
%error-verbose
|
%define parse.error verbose
|
||||||
|
|
||||||
%code requires {
|
%code requires {
|
||||||
|
|
||||||
|
|||||||
177
tests_ispcpp/CycleTimer.h
Normal file
177
tests_ispcpp/CycleTimer.h
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
#ifndef _SYRAH_CYCLE_TIMER_H_
|
||||||
|
#define _SYRAH_CYCLE_TIMER_H_
|
||||||
|
|
||||||
|
#if defined(__APPLE__)
|
||||||
|
#if defined(__x86_64__)
|
||||||
|
#include <sys/sysctl.h>
|
||||||
|
#else
|
||||||
|
#include <mach/mach.h>
|
||||||
|
#include <mach/mach_time.h>
|
||||||
|
#endif // __x86_64__ or not
|
||||||
|
|
||||||
|
#include <stdio.h> // fprintf
|
||||||
|
#include <stdlib.h> // exit
|
||||||
|
|
||||||
|
#elif _WIN32
|
||||||
|
# include <windows.h>
|
||||||
|
# include <time.h>
|
||||||
|
#else
|
||||||
|
# include <stdio.h>
|
||||||
|
# include <stdlib.h>
|
||||||
|
# include <string.h>
|
||||||
|
# include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// This uses the cycle counter of the processor. Different
|
||||||
|
// processors in the system will have different values for this. If
|
||||||
|
// your process moves across processors, then the delta time you
|
||||||
|
// measure will likely be incorrect. This is mostly for fine
|
||||||
|
// grained measurements where the process is likely to be on the
|
||||||
|
// same processor. For more global things you should use the
|
||||||
|
// Time interface.
|
||||||
|
|
||||||
|
// Also note that if your processors' speeds change (e.g. processor
|
||||||
|
// scaling) or if you are in a heterogeneous environment, you will
|
||||||
|
// likely get spurious results.
|
||||||
|
// Static-only helper that exposes a cheap, platform-specific tick counter
// together with the factor needed to convert ticks into seconds.
// The class cannot be instantiated (its constructor is private and undefined).
class CycleTimer {
public:
    typedef unsigned long long SysClock;

    //////////
    // Return the current CPU time, in terms of clock ticks.
    // Time zero is at some arbitrary point in the past.
    //
    // Tick source per platform:
    //   Apple (non-x86_64) : mach_absolute_time()
    //   Windows            : QueryPerformanceCounter()
    //   x86_64             : rdtsc
    //   everything else    : clock_gettime(CLOCK_THREAD_CPUTIME_ID), in ns
    static SysClock currentTicks() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return mach_absolute_time();
#elif defined(_WIN32)
        LARGE_INTEGER qwTime;
        QueryPerformanceCounter(&qwTime);
        return qwTime.QuadPart;
#elif defined(__x86_64__)
        unsigned int a, d;
        asm volatile("rdtsc" : "=a" (a), "=d" (d));
        return static_cast<unsigned long long>(a) |
            (static_cast<unsigned long long>(d) << 32);
#elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
        unsigned int val;
        asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
        return val;
#else
        timespec spec;
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
        // Combine in integer arithmetic: going through float (24-bit
        // mantissa) would throw away the low bits of the nanosecond count.
        return static_cast<SysClock>(spec.tv_sec) * 1000000000ULL +
            static_cast<SysClock>(spec.tv_nsec);
#endif
    }

    //////////
    // Return the current CPU time, in terms of seconds.
    // This is slower than currentTicks().  Time zero is at
    // some arbitrary point in the past.
    static double currentSeconds() {
        return currentTicks() * secondsPerTick();
    }

    //////////
    // Return the conversion from seconds to ticks
    // (i.e. the number of ticks in one second).
    static double ticksPerSecond() {
        return 1.0/secondsPerTick();
    }

    //////////
    // Return a short label for the unit that currentTicks() counts in.
    static const char* tickUnits() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return "ns";
#elif defined(_WIN32) || defined(__WIN32__) || defined(__x86_64__)
        // _WIN32 is the macro the rest of this class tests; __WIN32__ is
        // kept so toolchains that only define that spelling still match.
        return "cycles";
#else
        return "ns"; // clock_gettime
#endif
    }

    //////////
    // Return the conversion from ticks to seconds.
    // The value is computed on the first call and cached in function-local
    // statics; the cache is not synchronized between threads.
    // Exits the process if the platform query needed to compute it fails.
    static double secondsPerTick() {
        static bool initialized = false;
        static double secondsPerTick_val;
        if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
        int args[] = {CTL_HW, HW_CPU_FREQ};
        unsigned int Hz;
        size_t len = sizeof(Hz);
        if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
            fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
            exit(-1);
        }
        secondsPerTick_val = 1.0 / (double) Hz;
#else
        mach_timebase_info_data_t time_info;
        mach_timebase_info(&time_info);

        // Scales to nanoseconds without 1e-9f
        secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
            static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
        LARGE_INTEGER qwTicksPerSec;
        QueryPerformanceFrequency(&qwTicksPerSec);
        secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#elif defined(__x86_64__)
        // currentTicks() uses rdtsc here, so the tick rate is taken from the
        // CPU frequency advertised in /proc/cpuinfo.
        FILE *fp = fopen("/proc/cpuinfo","r");
        char input[1024];
        if (!fp) {
            fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
            exit(-1);
        }
        // Fallback in case no usable frequency line is found.
        secondsPerTick_val = 1e-9;
        while (fgets(input, sizeof(input), fp)) {
            // NOTE(boulos): Because reading cpuinfo depends on dynamic
            // frequency scaling it's better to read the @ sign first
            float GHz = 0.f, MHz = 0.f;
            if (strstr(input, "model name")) {
                char* at_sign = strstr(input, "@");
                if (at_sign) {
                    char* after_at = at_sign + 1;
                    char* GHz_str = strstr(after_at, "GHz");
                    char* MHz_str = strstr(after_at, "MHz");
                    if (GHz_str) {
                        *GHz_str = '\0';
                        // Ignore non-positive values so we never divide by zero.
                        if (1 == sscanf(after_at, "%f", &GHz) && GHz > 0.f) {
                            //printf("GHz = %f\n", GHz);
                            secondsPerTick_val = 1e-9 / GHz;
                            break;
                        }
                    } else if (MHz_str) {
                        *MHz_str = '\0';
                        if (1 == sscanf(after_at, "%f", &MHz) && MHz > 0.f) {
                            //printf("MHz = %f\n", MHz);
                            // Divide by the value that was just parsed (MHz);
                            // GHz is not set on this path.
                            secondsPerTick_val = 1e-6 / MHz;
                            break;
                        }
                    }
                }
            } else if (1 == sscanf(input, "cpu MHz : %f", &MHz) && MHz > 0.f) {
                //printf("MHz = %f\n", MHz);
                secondsPerTick_val = 1e-6 / MHz;
                break;
            }
        }
        fclose(fp);
#else
        // currentTicks() falls back to clock_gettime() on this platform and
        // already reports nanoseconds, so the conversion is a constant and
        // must not depend on the CPU frequency.
        secondsPerTick_val = 1e-9;
#endif

        initialized = true;
        return secondsPerTick_val;
    }

    //////////
    // Return the conversion from ticks to milliseconds.
    static double msPerTick() {
        return secondsPerTick() * 1000.0;
    }

private:
    CycleTimer();
};
|
||||||
|
|
||||||
|
#endif // #ifndef _SYRAH_CYCLE_TIMER_H_
|
||||||
@@ -1,12 +1,15 @@
|
|||||||
CXX=g++
|
CXX=g++
|
||||||
CXXFLAGS=-std=c++11 -O2
|
CXXFLAGS=-std=c++11 -O3 -lm -lpthread
|
||||||
|
|
||||||
ISPC=../ispc
|
ISPC=../ispc
|
||||||
ISPCFLAGS=--target=sse4-x2 -O2 --arch=x86-64
|
ISPCFLAGS=--target=avx2 -O3 --arch=x86-64
|
||||||
|
|
||||||
%.out : %.cpp %.o
|
%.out : %.cpp %.o tasksys.o
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $^
|
$(CXX) $(CXXFLAGS) -o $@ $^
|
||||||
|
|
||||||
|
tasksys.o : ../examples/tasksys.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -c -o $@ $^
|
||||||
|
|
||||||
$ : $.o
|
$ : $.o
|
||||||
|
|
||||||
%.o : %.ispc
|
%.o : %.ispc
|
||||||
|
|||||||
13
tests_ispcpp/array.ispc
Normal file
13
tests_ispcpp/array.ispc
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
export void array(uniform int N, uniform integer * uniform X) {
|
||||||
|
integer *A = new integer[N/2];
|
||||||
|
|
||||||
|
foreach (i = 0 ... N/2) {
|
||||||
|
A[i] = X[i] + X[N/2 + i];
|
||||||
|
}
|
||||||
|
|
||||||
|
foreach (i = 0 ... N) {
|
||||||
|
X[i] = A[i/2];
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] A;
|
||||||
|
}
|
||||||
17
tests_ispcpp/error_5.ispc
Normal file
17
tests_ispcpp/error_5.ispc
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
//@error
|
||||||
|
|
||||||
|
floating$0 mult(floating$0 x, floating$1 y) {
|
||||||
|
return x * y;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void saxpy(uniform int N,
|
||||||
|
uniform floating$0 scale,
|
||||||
|
uniform floating$1 X[],
|
||||||
|
uniform floating$1 Y[],
|
||||||
|
uniform floating$2 result[])
|
||||||
|
{
|
||||||
|
foreach (i = 0 ... N) {
|
||||||
|
floating$ tmp = mult(scale, X[i]) + Y[i];
|
||||||
|
result[i] = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
14
tests_ispcpp/error_6.ispc
Normal file
14
tests_ispcpp/error_6.ispc
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
number pow(number b, int a) {
|
||||||
|
number out = b;
|
||||||
|
for (int i = 1; i<a; i++) {
|
||||||
|
out *= b;
|
||||||
|
}
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void square(uniform int N, uniform number$-1 b[], uniform number$-1 out[]) {
|
||||||
|
foreach (i = 0 ... N) {
|
||||||
|
out[i] = pow(b[i], 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
13
tests_ispcpp/error_7.ispc
Normal file
13
tests_ispcpp/error_7.ispc
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
floating foo(floating a, floating b) {
|
||||||
|
floating d = a / b;
|
||||||
|
if (d < 0.)
|
||||||
|
return 0.;
|
||||||
|
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void bar(uniform integer * uniform X, uniform int N) {
|
||||||
|
foreach (i = 0 ... N-1) {
|
||||||
|
X[i] = foo(X[i], X[i+1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,15 +1,15 @@
|
|||||||
floating saxpy_helper(floating scale,
|
floating saxpy_helper(floating scale,
|
||||||
floating<0> x,
|
floating$3 x,
|
||||||
floating<0> y) {
|
floating$3 y) {
|
||||||
return scale * x + y;
|
return scale * x + y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export void saxpy(uniform int N,
|
export void saxpy(uniform int N,
|
||||||
uniform floating<0> scale,
|
uniform floating$0 scale,
|
||||||
uniform floating<1> X[],
|
uniform floating$1 X[],
|
||||||
uniform floating<1> Y[],
|
uniform floating$1 Y[],
|
||||||
uniform floating<2> result[])
|
uniform floating$2 result[])
|
||||||
{
|
{
|
||||||
foreach (i = 0 ... N) {
|
foreach (i = 0 ... N) {
|
||||||
result[i] = saxpy_helper(scale, X[i], Y[i]);
|
result[i] = saxpy_helper(scale, X[i], Y[i]);
|
||||||
|
|||||||
212
tests_ispcpp/sqrt.cpp
Normal file
212
tests_ispcpp/sqrt.cpp
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <pthread.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#include "CycleTimer.h"
|
||||||
|
#include "sqrt.h"
|
||||||
|
|
||||||
|
using namespace ispc;
|
||||||
|
|
||||||
|
// Single-precision serial reference: for each of the N inputs, iterate
//   g <- (3*g - x*g^3) / 2
// starting from initialGuess until |g*g*x - 1| is no larger than the
// threshold, then store x * g into output.
// NOTE: there is no iteration cap, so an input for which the residual never
// drops below the threshold keeps the loop spinning.
void sqrtSerial(int N,
                float initialGuess,
                float values[],
                float output[])
{
    static const float kThreshold = 0.00001f;

    for (int idx = 0; idx < N; ++idx) {
        const float input = values[idx];
        float estimate = initialGuess;

        for (;;) {
            const float residual = fabs(estimate * estimate * input - 1.f);
            // Written as a negated '>' so a NaN residual also ends the loop.
            if (!(residual > kThreshold))
                break;
            estimate = (3.f * estimate - input * estimate * estimate * estimate) * 0.5f;
        }

        output[idx] = input * estimate;
    }
}
|
||||||
|
|
||||||
|
// Double-precision serial reference: for each of the N inputs, iterate
//   g <- (3*g - x*g^3) / 2
// starting from initialGuess until |g*g*x - 1| is no larger than the
// threshold, then store x * g into output.
// NOTE: there is no iteration cap, so an input for which the residual never
// drops below the threshold keeps the loop spinning.
void sqrtSerial(int N,
                double initialGuess,
                double values[],
                double output[])
{
    // Initialised from a float literal, exactly as the float overload's
    // threshold is written.
    static const double kThreshold = 0.00001f;

    for (int idx = 0; idx < N; ++idx) {
        const double input = values[idx];
        double estimate = initialGuess;

        for (;;) {
            const double residual = std::abs(estimate * estimate * input - 1.);
            // Written as a negated '>' so a NaN residual also ends the loop.
            if (!(residual > kThreshold))
                break;
            estimate = (3. * estimate - input * estimate * estimate * estimate) * 0.5;
        }

        output[idx] = input * estimate;
    }
}
|
||||||
|
|
||||||
|
// Compare N single-precision results against the reference values and print
// one line for every element whose absolute difference exceeds 1e-4.
// Nothing is returned; mismatches are only reported on stdout.
static void verifyResult(int N, float* result, float* gold) {
    for (int idx = 0; idx < N; ++idx) {
        // Negated '>' keeps the original treatment of NaN differences (skipped).
        if (!(fabs(result[idx] - gold[idx]) > 1e-4))
            continue;
        printf("Error: [%d] Got %f expected %f\n", idx, result[idx], gold[idx]);
    }
}
|
||||||
|
|
||||||
|
// Compare N double-precision results against the reference values and print
// one line for every element whose absolute difference exceeds 1e-4.
// Nothing is returned; mismatches are only reported on stdout.
static void verifyResult(int N, double* result, double* gold) {
    for (int idx = 0; idx < N; ++idx) {
        // Negated '>' keeps the original treatment of NaN differences (skipped).
        if (!(std::abs(result[idx] - gold[idx]) > 1e-4))
            continue;
        printf("Error: [%d] Got %f expected %f\n", idx, result[idx], gold[idx]);
    }
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
const unsigned int N = 20 * 1000 * 1000;
|
||||||
|
const float initialGuess = 1.0f;
|
||||||
|
const double dinitialGuess = 1.0;
|
||||||
|
|
||||||
|
float* values = new float[N];
|
||||||
|
float* output = new float[N];
|
||||||
|
float* gold = new float[N];
|
||||||
|
|
||||||
|
double* dvalues = new double[N];
|
||||||
|
double* doutput = new double[N];
|
||||||
|
double* dgold = new double[N];
|
||||||
|
|
||||||
|
for (unsigned int i=0; i<N; i++)
|
||||||
|
{
|
||||||
|
// random input values
|
||||||
|
values[i] = .001f + 2.998f * static_cast<float>(rand()) / RAND_MAX;
|
||||||
|
dvalues[i] = .001 + 2.998 * static_cast<double>(rand()) / RAND_MAX;
|
||||||
|
output[i] = 0.f;
|
||||||
|
doutput[i] = 0.;
|
||||||
|
}
|
||||||
|
|
||||||
|
// generate a gold version to check results
|
||||||
|
for (unsigned int i=0; i<N; i++) {
|
||||||
|
gold[i] = sqrt(values[i]);
|
||||||
|
dgold[i] = sqrt(dvalues[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// And run the serial implementation 3 times, again reporting the
|
||||||
|
// minimum time.
|
||||||
|
//
|
||||||
|
double minSerial = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
double startTime = CycleTimer::currentSeconds();
|
||||||
|
sqrtSerial(N, initialGuess, values, output);
|
||||||
|
double endTime = CycleTimer::currentSeconds();
|
||||||
|
minSerial = std::min(minSerial, endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sqrt float serial]:\t\t[%.3f] ms\n", minSerial * 1000);
|
||||||
|
verifyResult(N, output, gold);
|
||||||
|
|
||||||
|
double minDSerial = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
double startTime = CycleTimer::currentSeconds();
|
||||||
|
sqrtSerial(N, dinitialGuess, dvalues, doutput);
|
||||||
|
double endTime = CycleTimer::currentSeconds();
|
||||||
|
minDSerial = std::min(minDSerial, endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sqrt double serial]:\t\t[%.3f] ms\n", minDSerial * 1000);
|
||||||
|
verifyResult(N, doutput, dgold);
|
||||||
|
|
||||||
|
// Clear out the buffer
|
||||||
|
for (unsigned int i = 0; i < N; ++i) {
|
||||||
|
output[i] = 0;
|
||||||
|
doutput[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Compute the image using the ispc implementation; report the minimum
|
||||||
|
// time of three runs.
|
||||||
|
//
|
||||||
|
double minISPC = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
double startTime = CycleTimer::currentSeconds();
|
||||||
|
ispc::sqrt_ispc(N, initialGuess, values, output);
|
||||||
|
double endTime = CycleTimer::currentSeconds();
|
||||||
|
minISPC = std::min(minISPC, endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sqrt float ispc]:\t\t[%.3f] ms\n", minISPC * 1000);
|
||||||
|
|
||||||
|
verifyResult(N, output, gold);
|
||||||
|
|
||||||
|
double minDISPC = 1e30;
|
||||||
|
for (int i = 0; i < 5; ++i) {
|
||||||
|
double startTime = CycleTimer::currentSeconds();
|
||||||
|
ispc::sqrt_ispc(N, dinitialGuess, dvalues, doutput);
|
||||||
|
double endTime = CycleTimer::currentSeconds();
|
||||||
|
minDISPC = std::min(minDISPC, endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sqrt double ispc]:\t\t[%.3f] ms\n", minDISPC * 1000);
|
||||||
|
|
||||||
|
verifyResult(N, doutput, dgold);
|
||||||
|
|
||||||
|
// Clear out the buffer
|
||||||
|
for (unsigned int i = 0; i < N; ++i) {
|
||||||
|
output[i] = 0;
|
||||||
|
doutput[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Tasking version of the ISPC code
|
||||||
|
//
|
||||||
|
double minTaskISPC = 1e30;
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
double startTime = CycleTimer::currentSeconds();
|
||||||
|
ispc::sqrt_ispc_withtasks(N, initialGuess, values, output);
|
||||||
|
double endTime = CycleTimer::currentSeconds();
|
||||||
|
minTaskISPC = std::min(minTaskISPC, endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sqrt float task ispc]:\t\t[%.3f] ms\n", minTaskISPC * 1000);
|
||||||
|
|
||||||
|
verifyResult(N, output, gold);
|
||||||
|
|
||||||
|
double minDTaskISPC = 1e30;
|
||||||
|
for (int i = 0; i < 3; ++i) {
|
||||||
|
double startTime = CycleTimer::currentSeconds();
|
||||||
|
ispc::sqrt_ispc_withtasks(N, dinitialGuess, dvalues, doutput);
|
||||||
|
double endTime = CycleTimer::currentSeconds();
|
||||||
|
minDTaskISPC = std::min(minDTaskISPC, endTime - startTime);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("[sqrt double task ispc]:\t[%.3f] ms\n", minDTaskISPC * 1000);
|
||||||
|
|
||||||
|
verifyResult(N, output, gold);
|
||||||
|
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC float)\n", minSerial/minISPC);
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from ISPC double)\n", minDSerial/minDISPC);
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from task ISPC float)\n", minSerial/minTaskISPC);
|
||||||
|
printf("\t\t\t\t(%.2fx speedup from task ISPC double)\n", minDSerial/minDTaskISPC);
|
||||||
|
|
||||||
|
delete[] values;
|
||||||
|
delete[] output;
|
||||||
|
delete[] gold;
|
||||||
|
delete[] dvalues;
|
||||||
|
delete[] doutput;
|
||||||
|
delete[] dgold;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
62
tests_ispcpp/sqrt.ispc
Normal file
62
tests_ispcpp/sqrt.ispc
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
|
||||||
|
static const float kThreshold = 0.00001f;
|
||||||
|
|
||||||
|
// Exported kernel: for every index in [0, N), refine an estimate with
//   g <- (3*g - x*g^3) / 2
// starting from initialGuess until |g*g*x - 1| is no larger than kThreshold,
// then write x * g to output.
// The host benchmark calls this with both float and double buffers.
export void sqrt_ispc(uniform int N,
                      uniform floating initialGuess,
                      uniform floating values[],
                      uniform floating output[])
{
    foreach (idx = 0 ... N) {

        floating input = values[idx];
        floating estimate = initialGuess;

        // Residual of the current estimate; drives the refinement loop.
        floating residual = abs(estimate * estimate * input - 1.f);

        while (residual > kThreshold) {
            estimate = (3.f * estimate - input * estimate * estimate * estimate) * 0.5f;
            residual = abs(estimate * estimate * input - 1.f);
        }

        output[idx] = input * estimate;

    }
}
|
||||||
|
|
||||||
|
// Task body: handles the slice [taskIndex * span, min(N, taskIndex * span + span))
// of the input, applying the same refinement loop as sqrt_ispc to each element.
task void sqrt_ispc_task(uniform int N,
                         uniform int span,
                         uniform floating initialGuess,
                         uniform floating values[],
                         uniform floating output[])
{

    // Slice owned by this task; the end is clamped so the last task never
    // reads or writes past N.
    uniform int firstIndex = taskIndex * span;
    uniform int lastIndex = min(N, firstIndex + span);

    foreach (idx = firstIndex ... lastIndex) {

        floating input = values[idx];
        floating estimate = initialGuess;

        // Residual of the current estimate; drives the refinement loop.
        floating residual = abs(estimate * estimate * input - 1.f);

        while (residual > kThreshold) {
            estimate = (3.f * estimate - input * estimate * estimate * estimate) * 0.5f;
            residual = abs(estimate * estimate * input - 1.f);
        }

        output[idx] = input * estimate;

    }
}
|
||||||
|
|
||||||
|
// Exported entry point for the tasking variant: splits [0, N) into roughly
// 64 equally sized slices and launches one sqrt_ispc_task per slice.
export void sqrt_ispc_withtasks(uniform int N,
                                uniform floating initialGuess,
                                uniform floating values[],
                                uniform floating output[])
{
    // Nothing to process; also keeps span from being 0 below.
    if (N <= 0)
        return;

    // Aim for 64 tasks.  Round the slice size up so that it is at least 1
    // (N / 64 is 0 for N < 64, which made the task count divide by zero).
    uniform int span = (N + 63) / 64;

    // Round the task count up as well, so a final partial slice is still
    // launched when N is not a multiple of span; the task clamps its end
    // index to N.
    uniform int numTasks = (N + span - 1) / span;

    launch[numTasks] sqrt_ispc_task(N, span, initialGuess, values, output);
}
|
||||||
@@ -6,7 +6,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
float A[256];
|
float A[256];
|
||||||
double B[256];
|
double B[256];
|
||||||
double outA[256];
|
float outA[256];
|
||||||
double outB[256];
|
double outB[256];
|
||||||
|
|
||||||
|
|
||||||
@@ -15,7 +15,7 @@ int main() {
|
|||||||
B[i] = 1. / (i+1);
|
B[i] = 1. / (i+1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ispc::square(256, (float*)&A, (double*)&outA);
|
ispc::square(256, (float*)&A, (float*)&outA);
|
||||||
|
|
||||||
ispc::square(256, (double*)&B, (double*)&outB);
|
ispc::square(256, (double*)&B, (double*)&outB);
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
floating foo(const uniform int a, floating b) {
|
number pow(number b, int a) {
|
||||||
floating out = b;
|
number out = b;
|
||||||
for (int i = 1; i<a; i++) {
|
for (int i = 1; i<a; i++) {
|
||||||
out *= b;
|
out *= b;
|
||||||
}
|
}
|
||||||
@@ -7,8 +7,8 @@ floating foo(const uniform int a, floating b) {
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
export void square(uniform int N, uniform floating b[], uniform double out[]) {
|
export void square(uniform int N, uniform number b[], uniform number out[]) {
|
||||||
foreach (i = 0 ... N) {
|
foreach (i = 0 ... N) {
|
||||||
out[i] = foo(2, b[i]);
|
out[i] = pow(b[i], 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user