19 Commits

Author SHA1 Message Date
Dmitry Babokin
28f0bce9f2 Release 1.4.4 2013-07-19 16:22:10 -07:00
Dmitry Babokin
0f82f216a2 Merge pull request #544 from mmp/master
Handle SHL with a constant vector in LLVMVectorIsLinear().
2013-07-18 11:46:11 -07:00
Matt Pharr
7454b1399c Handle SHL with a constant vector in LLVMVectorIsLinear().
LLVM3.4 seems to be turning multiplies by a constant power of 2 into
the equivalent SHL, which was in turn thwarting the pattern matching
for turning gathers/scatters into vector loads/stores.
2013-07-17 14:12:43 -07:00
jbrodman
4ebf46bd63 Merge pull request #543 from mmp/master
Fix build with LLVM top-of-tree
2013-07-17 10:38:06 -07:00
Matt Pharr
f1cce0ef5f Fix build with LLVM top-of-tree 2013-07-17 09:25:00 -07:00
Dmitry Babokin
8c9e873c10 Merge pull request #540 from dbabokin/embree_bug
Fix for the bug introduced by --intrumentation fix
2013-07-04 10:45:06 -07:00
Dmitry Babokin
c85439e7bb Fix for the bug introduced by --intrumentation fix 2013-07-04 21:41:57 +04:00
Ilia Filippov
fd7f87b55e Supporting perf.py on Windows and some small corrections in it 2013-07-02 19:23:18 +04:00
Dmitry Babokin
8be4128c5a Merge pull request #534 from ifilippov/perf
add script for measuring performance
2013-07-01 05:09:03 -07:00
Ilia Filippov
806e37338c add script for measuring performance 2013-07-01 13:30:49 +04:00
Dmitry Babokin
ec1095624a Merge pull request #527 from tkoziara/master
examples/sort added
2013-06-25 10:11:39 -07:00
Tomasz Koziara
a23d69ebe8 Copyright changed to simplify legal matters. 2013-06-25 17:28:27 +01:00
Dmitry Babokin
0aff61ffc6 Merge pull request #533 from dbabokin/patch
Quick fix for LLVM 3.3 patch
2013-06-25 08:50:32 -07:00
Dmitry Babokin
05aa540984 Quick fix for LLVM 3.3 patch 2013-06-25 19:49:41 +04:00
Dmitry Babokin
033e83e490 Merge pull request #532 from dbabokin/release_1_4_3
Release 1.4.3
2013-06-25 07:42:08 -07:00
Dmitry Babokin
1e5d852e2f Merge pull request #531 from ifilippov/qsize_fail
replacement of qsize due to it's fails on MacOS
2013-06-25 05:36:45 -07:00
Ilia Filippov
cc32d913a0 replacement of qsize due to it's fails on MacOS 2013-06-25 16:27:25 +04:00
Tomasz Koziara
86ee8db778 Parallel prefix sum added + minor amendements. 2013-06-25 12:45:51 +01:00
Tomasz Koziara
f2452f040d First commit of the radix sort example. 2013-06-24 18:37:44 +01:00
23 changed files with 859 additions and 22 deletions

View File

@@ -4914,7 +4914,12 @@ WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
pm.add(new llvm::TargetData(module));
#endif
#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
int flags = 0;
#else
llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None;
#endif
std::string error;
llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags);
if (error.size()) {

View File

@@ -1414,7 +1414,7 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
llvm::Value *
FunctionEmitContext::GetStringPtr(const std::string &str) {
llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str, false);
llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(),
true /*isConst*/,

View File

@@ -1,3 +1,7 @@
=== v1.4.4 === (19 July 2013)
A minor version update with several stability fixes requested by the customers.
=== v1.4.3 === (25 June 2013)
A minor version update with several stability improvements:

View File

@@ -2,6 +2,13 @@
ispc News
=========
ispc 1.4.4 is Released
----------------------
A minor update of ``ispc`` has been released with several stability improvements.
The released binaries are built with a patched version of LLVM 3.3. Since this
release we also distribute 32 bit Linux binaries.
ispc 1.4.3 is Released
----------------------

View File

@@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler"
# This could be handy for archiving the generated documentation or
# if some version control system is used.
PROJECT_NUMBER = 1.4.3
PROJECT_NUMBER = 1.4.4
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.

View File

@@ -87,18 +87,22 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ExecutablePath);$(ProjectDir)..\..</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>ao</TargetName>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
@@ -173,4 +177,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View File

@@ -140,7 +140,7 @@ int main(int argc, char *argv[]) {
printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial);
writePPM(buf, width, height, "mandelbrot-serial.ppm");
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC);
return 0;
}

View File

@@ -65,18 +65,22 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>

53
examples/perf.ini Executable file
View File

@@ -0,0 +1,53 @@
%****************************************************************************************************
%Usage:
% Name of test
% Path to test from base folder (examples)
% command to execute test to compute performance
% [! X Y] //If one test prints several results, X is the position of the result to use and Y is the total number of results
% [^] //concatenate output of this step with previous one
% #***
% [% comment]
%****************************************************************************************************
AOBench
aobench
ao 10 512 512
#***
Deferred Shading
deferred
deferred_shading data/pp1280x720.bin
#***
Mandelbrot Set
mandelbrot
mandelbrot
#***
Mandelbrot Set
mandelbrot_tasks
mandelbrot
^
#***
Perlin Noise Function
noise
noise
#***
Binomial Options
options
options
! 1 2
#***
Black-Scholes Options
options
options
! 2 2
#***
Ray Tracer
rt
rt sponza
#***
3D Stencil
stencil
stencil
#***
Volume Rendering
volume_rendering
volume camera.dat density_highres.vol
#***

262
examples/perf.py Executable file
View File

@@ -0,0 +1,262 @@
#!/usr/bin/python
# // Author: Filippov Ilia
from optparse import OptionParser
import sys
import os
import operator
import time
import glob
import string
import platform
def build_test():
    """Clean and rebuild the example in the current working directory.

    Uses `make` when the module-level `is_windows` flag is false and
    `msbuild` otherwise; command output is appended to the file named by the
    module-level `build_log`.

    Returns the value os.system() reports for the build command (the result
    of the preceding clean command is ignored).
    """
    if is_windows:
        clean_cmd = "msbuild /t:clean >> " + build_log
        build_cmd = "msbuild /V:m /p:Platform=x64 /p:Configuration=Release /p:TargetDir=.\ /t:rebuild >> " + build_log
    else:
        clean_cmd = "make clean >> "+build_log
        build_cmd = "make >> "+build_log+" 2>> "+build_log
    os.system(clean_cmd)
    return os.system(build_cmd)
def execute_test(command):
    """Run `command` through the shell `options.number` times.

    Any existing file at the module-level path `perf_temp` is deleted first.

    Returns the sum of the os.system() results of all runs, so the result is
    0 only if every run reported 0.
    """
    if os.path.exists(perf_temp):
        os.remove(perf_temp)
    repeats = int(options.number)
    return sum(os.system(command) for _ in range(repeats))
def run_test(command, c1, c2, test):
    """Build and run one test, then collect its speedup numbers into `test`.

    command -- shell command that runs the test
    c1      -- 1-based index of the first "speedup" line of interest
    c2      -- distance (counted in "speedup" lines) between lines of interest
    test    -- answer item [name, ispc_results, tasks_results]; parsed numbers
               are appended to test[1] (ISPC) and test[2] (ISPC + tasks)

    On a build or execution failure an error is printed and `test` is left
    unchanged.
    """
    if build_test() != 0:
        sys.stdout.write("ERROR: Compilation fails\n")
        return
    if execute_test(command) != 0:
        sys.stdout.write("ERROR: Execution fails\n")
        return
    tasks = [] #list of results with tasks, it will be test[2]
    ispc = [] #list of results without tasks, it will be test[1]
    j = 1 # running count of "speedup" lines seen so far
    # Close the results file deterministically rather than leaving it to the
    # garbage collector.
    with open(perf_temp) as results:
        for line in results:
            if "speedup" not in line: # only speedup lines carry results
                continue
            if j == c1: # this is one of the lines selected by c1/c2
                sys.stdout.write(line)
                # Drop tabs and the opening parenthesis, then split the line
                # into its comma-separated pieces.
                pieces = line.expandtabs(0).replace("("," ").split(",")
                for piece in pieces:
                    words = piece.split(" ")
                    # words[1] is the number with one trailing character
                    # (e.g. "3.25x"); strip that character before converting.
                    number = float(words[1][:-1])
                    if "speedup from ISPC + tasks" in piece:
                        tasks.append(number)
                    else:
                        ispc.append(number)
                c1 = c1 + c2 # next line of interest
            j += 1
    test[1] = test[1] + ispc
    test[2] = test[2] + tasks
def cpu_get():
    """Read the first line of /proc/stat and return [busy, total] counters.

    `busy` is the sum of the space-separated fields at indices 2, 3 and 4;
    `total` additionally includes the field at index 5.
    NOTE(review): presumably these are the user, nice, system and idle
    counters of the aggregate "cpu" line -- confirm against proc(5).
    """
    stat_file = open("/proc/stat", 'r')
    first_line = stat_file.readline()
    stat_file.close()
    fields = first_line.split(" ")
    busy = sum(int(field) for field in fields[2:5])
    return [busy, busy + int(fields[5])]
def cpu_check():
    """Return the current CPU load as a percentage.

    When `is_windows` is false, two cpu_get() samples are taken one second
    apart and the busy delta is divided by the total delta. Otherwise the
    load is queried with `wmic`, written to the temporary file "cpu_temp",
    and the digits found on the third line of that file are used.
    """
    if not is_windows:
        first = cpu_get()
        time.sleep(1)
        second = cpu_get()
        busy_delta = float(first[0] - second[0])
        total_delta = float(first[1] - second[1])
        return (busy_delta/total_delta)*100

    os.system("wmic cpu get loadpercentage /value > cpu_temp")
    report = open("cpu_temp", 'r')
    report_lines = report.readlines()
    report.close()
    os.remove("cpu_temp")
    # The leading "0" keeps int() valid even if the line holds no digits.
    digits = "0" + "".join(ch for ch in report_lines[2] if ch.isdigit())
    return int(digits)
def geomean(par):
    """Return the geometric mean of the numbers in `par`, rounded to 2 places.

    An empty list yields 0 instead of raising ZeroDivisionError, so a summary
    can still be printed when no results were collected.
    """
    count = len(par)
    if count == 0:
        return 0
    product = 1
    for value in par:
        product = product * value
    return round(product ** (1.0/count), 2)
def print_answer(answer):
    """Print a per-test summary table followed by the geomean of the maxima.

    `answer` is a list of test items, each of the form
        test[0] - name of test
        test[1] - list of results without tasks
        test[2] - list of results with tasks
    test[1] or test[2] may be empty; "n/a" is printed for an empty list.
    If a whole column has no results in any test, its geomean is also
    printed as "n/a" rather than being computed from an empty list.
    """
    sys.stdout.write("Name of test:\t\tISPC:\tISPC + tasks:\n")
    best_per_column = [[], []] # per-test maxima: [ISPC, ISPC + tasks]
    for test in answer:
        max_t = []
        diff_t = []
        for column in (1, 2):
            results = test[column]
            if len(results) == 0:
                max_t.append("n/a")
                diff_t.append("n/a")
            else:
                best = max(results)
                best_per_column[column-1].append(best)
                max_t.append(str(best))
                # spread between the best and the worst run
                diff_t.append(str(best - min(results)))
        sys.stdout.write("%s:\n" % test[0])
        sys.stdout.write("\t\tmax:\t%s\t%s\n" % (max_t[0], max_t[1]))
        sys.stdout.write("\t\tdiff:\t%s\t%s\n" % (diff_t[0], diff_t[1]))
    geomean_t = []
    for column_best in best_per_column:
        if len(column_best) == 0:
            geomean_t.append("n/a")
        else:
            geomean_t.append(geomean(column_best))
    sys.stdout.write("---------------------------------------------\n")
    sys.stdout.write("Geomean:\t\t%s\t%s\n" % (geomean_t[0], geomean_t[1]))
###Main###
# Script body: parse the command line, check the environment, read the list
# of tests from the config file, then build and run every test and print a
# summary table.
# parsing options
parser = OptionParser()
parser.add_option('-n', '--number', dest='number',
help='number of repeats', default="3")
parser.add_option('-c', '--config', dest='config',
help='config file of tests', default="./perf.ini")
parser.add_option('-p', '--path', dest='path',
help='path to examples directory', default="./")
(options, args) = parser.parse_args()
global is_windows
is_windows = (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system())
# save current path (with a trailing separator)
pwd = os.getcwd()
pwd = pwd + os.sep
# on Windows the absolute path is replaced by the relative parent directory,
# so every later use of pwd is relative to the directory current at that time
if is_windows:
pwd = "..\\"
# check if cpu usage is low now
cpu_percent = cpu_check()
if cpu_percent > 20:
sys.stdout.write("Warning: CPU Usage is very high.\n")
sys.stdout.write("Close other applications.\n")
# check that required compilers exist somewhere on PATH
# NOTE(review): string.split() exists only in Python 2
PATH_dir = string.split(os.getenv("PATH"), os.pathsep)
compiler_exists = False
ref_compiler_exists = False
if is_windows == False:
compiler = "ispc"
ref_compiler = "g++"
else:
compiler = "ispc.exe"
ref_compiler = "cl.exe"
for counter in PATH_dir:
if os.path.exists(counter + os.sep + compiler):
compiler_exists = True
if os.path.exists(counter + os.sep + ref_compiler):
ref_compiler_exists = True
# NOTE(review): sys.exit() is called without an argument below, so these
# fatal errors terminate the script with exit status 0
if not compiler_exists:
sys.stderr.write("Fatal error: ISPC compiler not found.\n")
sys.stderr.write("Added path to ispc compiler to your PATH variable.\n")
sys.exit()
if not ref_compiler_exists:
sys.stderr.write("Fatal error: reference compiler %s not found.\n" % ref_compiler)
sys.stderr.write("Added path to %s compiler to your PATH variable.\n" % ref_compiler)
sys.exit()
# checks that config file exists
path_config = os.path.normpath(options.config)
if os.path.exists(path_config) == False:
sys.stderr.write("Fatal error: config file not found: %s.\n" % options.config)
sys.stderr.write("Set path to your config file in --config.\n")
sys.exit()
# read lines from config file except comments (lines starting with "%")
f = open(path_config, 'r')
f_lines = f.readlines()
f.close()
lines =[]
for i in range(len(f_lines)):
if f_lines[i][0] != "%":
lines.append(f_lines[i])
length = len(lines)
# prepare build.log and perf_temp files
global build_log
build_log = pwd + "build.log"
if is_windows == False:
if os.path.exists(build_log):
os.remove(build_log)
else:
if os.path.exists("build.log"):
os.remove("build.log")
global perf_temp
perf_temp = pwd + "perf_temp"
i = 0
answer = []
sys.stdout.write("Okey go go go!\n\n")
# loop for all tests; each record is: name, folder, command,
# an optional "! X Y" line, an optional "^" line, and a terminating line
while i < length-2:
# we read name of test
sys.stdout.write("%s" % lines[i])
# answer item: [name without trailing newline, ISPC results, ISPC + tasks results]
test = [lines[i][:-1],[],[]]
# read location of test
folder = lines[i+1]
folder = folder[:-1]
folder = os.path.normpath(options.path + os.sep + folder)
# check that test exists
if os.path.exists(folder) == False:
sys.stdout.write("Fatal error: Can't find test %s. Your path is: \"%s\".\n" % (lines[i][:-1], options.path))
sys.stdout.write("Change current location to /examples or set path to /examples in --path.\n")
# NOTE(review): this fatal error also exits with status 0
exit(0)
os.chdir(folder)
# read parameters of test
command = lines[i+2]
command = command[:-1]
# the test's output is appended to perf_temp, where run_test() parses it
if is_windows == False:
command = "./"+command + " >> " + perf_temp
else:
command = "x64\\Release\\"+command + " >> " + perf_temp
# parsing config parameters
next_line = lines[i+3]
if next_line[0] == "!": # we should take only one part of test output
R = next_line.split(' ')
c1 = int(R[1]) # c1 is the index of the "speedup" line we want to use from the test output
c2 = int(R[2]) # c2 is the total number of "speedup" lines in the test output
# skip the "!" line so that lines[i+3] below refers to the line after it
i = i+1
else:
c1 = 1
c2 = 1
next_line = lines[i+3]
if next_line[0] == "^": # we should concatenate result of this test with previous one
run_test(command, c1, c2, answer[len(answer)-1])
i = i+1
else: # we run this test and append its result to answer structure
run_test(command, c1, c2, test)
answer.append(test)
# preparing next loop iteration
os.chdir(pwd)
# advance past name, folder, command and the terminating line
i+=4
# delete temp file
if os.path.exists(perf_temp):
os.remove(perf_temp)
# print collected answer
print_answer(answer)

View File

@@ -259,7 +259,7 @@ int main(int argc, char *argv[]) {
}
printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n",
minTimeSerial, width, height);
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCtasks);
writeImage(id, image, width, height, "rt-serial.ppm");

8
examples/sort/Makefile Normal file
View File

@@ -0,0 +1,8 @@
EXAMPLE=sort
CPP_SRC=sort.cpp sort_serial.cpp
ISPC_SRC=sort.ispc
ISPC_TARGETS=sse2,sse4-x2,avx
#ISPC_FLAGS=-DDEBUG
include ../common.mk

134
examples/sort/sort.cpp Normal file
View File

@@ -0,0 +1,134 @@
/*
Copyright (c) 2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <iostream>
#include <iomanip>
#include "../timing.h"
#include "sort_ispc.h"
using namespace ispc;
extern void sort_serial (int n, unsigned int code[], int order[]);
/* progress bar by Ross Hemsley;
 * http://www.rosshemsley.co.uk/2011/02/creating-a-progress-bar-in-c-or-any-other-console-app/
 *
 * Draws a one-line progress bar on std::cout and ends it with '\r' so that
 * the next call overwrites it.
 *
 *   x - number of steps completed so far
 *   n - total number of steps; nothing is drawn when n is 0
 *   w - width of the bar in characters
 *
 * To limit output, the bar is only redrawn when x is a multiple of n/100
 * or when x == n.
 */
static inline void progressbar (unsigned int x, unsigned int n, unsigned int w = 50)
{
    // An empty job has no meaningful progress, and every division below
    // would divide by zero.
    if (n == 0) return;

    if (n < 100)
    {
        // Rescale to a 0..100 range. Multiply before dividing (in 64 bits)
        // so that totals which do not divide 100 are not truncated.
        x = (unsigned int)(((unsigned long long)x * 100) / n);
        n = 100;
    }

    if ((x != n) && (x % (n/100) != 0)) return;

    using namespace std;
    const float ratio = x/(float)n;
    const unsigned int filled = (unsigned int)(ratio * w);

    cout << setw(3) << (int)(ratio*100) << "% [";
    for (unsigned int col = 0; col < filled; col++) cout << "=";
    for (unsigned int col = filled; col < w; col++) cout << " ";
    cout << "]\r" << flush;
}
/* Benchmark driver for the sort example.
 *
 * Usage: sort [n]   (n = number of keys to sort, default 1000000)
 *
 * Runs m repetitions of each of three variants on freshly generated keys and
 * prints the accumulated time of each in million cycles:
 *   - sort_ispc with ntasks = 1,
 *   - sort_ispc with ntasks = 0 (the ISPC code then picks the task count),
 *   - sort_serial.
 * The generator is re-seeded before each variant so that all three see the
 * same key sequence.
 */
int main (int argc, char *argv[])
{
    const int n = argc == 1 ? 1000000 : atoi(argv[1]);
    if (n <= 0)
    {
        /* n sizes the arrays and (for n < 100) is the modulus of the keys */
        fprintf (stderr, "usage: %s [number of elements (> 0)]\n", argv[0]);
        return 1;
    }
    const int m = n < 100 ? 1 : 50;          /* repetitions per variant */
    const int l = n < 100 ? n : RAND_MAX;    /* keys are drawn from [0, l) */
    int i, j;
    double tISPC1 = 0.0, tISPC2 = 0.0, tSerial = 0.0;
    unsigned int *code = new unsigned int [n];
    int *order = new int [n];

    srand (0);
    for (i = 0; i < m; i ++)
    {
        /* rand() is the generator that srand() seeds and RAND_MAX bounds */
        for (j = 0; j < n; j ++) code [j] = rand() % l;
        reset_and_start_timer();
        sort_ispc (n, code, order, 1);
        tISPC1 += get_elapsed_mcycles();
        progressbar (i, m);
    }
    printf("[sort ispc]:\t[%.3f] million cycles\n", tISPC1);

    srand (0);
    for (i = 0; i < m; i ++)
    {
        for (j = 0; j < n; j ++) code [j] = rand() % l;
        reset_and_start_timer();
        sort_ispc (n, code, order, 0);
        tISPC2 += get_elapsed_mcycles();
        progressbar (i, m);
    }
    printf("[sort ispc+tasks]:\t[%.3f] million cycles\n", tISPC2);

    srand (0);
    for (i = 0; i < m; i ++)
    {
        for (j = 0; j < n; j ++) code [j] = rand() % l;
        reset_and_start_timer();
        sort_serial (n, code, order);
        tSerial += get_elapsed_mcycles();
        progressbar (i, m);
    }
    printf("[sort serial]:\t\t[%.3f] million cycles\n", tSerial);
    printf("\t\t\t\t(%.2fx speedup from ISPC serial)\n", tSerial/tISPC1);
    printf("\t\t\t\t(%.2fx speedup from ISPC with tasks)\n", tSerial/tISPC2);

    /* both buffers were allocated with new[] */
    delete [] code;
    delete [] order;
    return 0;
}

249
examples/sort/sort.ispc Normal file
View File

@@ -0,0 +1,249 @@
/*
Copyright (c) 2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
/* Task: counts, per program instance, how often each value (0..255) of byte
   number `pass` occurs in this task's part of code[].

   Task `taskIndex` covers elements [start, end): `span` elements per task,
   with the last task running up to n. Each program instance counts its own
   contiguous strip of (end-start)/programCount elements into a private
   256-entry table; the last instance also counts the leftover `tail`
   elements. The count for value j is stored at
   hist[j*programCount*taskCount + i], i = programCount*taskIndex + programIndex. */
task void histogram (uniform int span, uniform int n, uniform int64 code[], uniform int pass, uniform int hist[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
uniform int strip = (end-start)/programCount;
uniform int tail = (end-start)%programCount;
/* global index of this program instance across all tasks */
int i = programCount*taskIndex + programIndex;
/* per-instance counters */
int g [256];
cfor (int j = 0; j < 256; j ++)
{
g[j] = 0;
}
/* each instance walks its own strip */
cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++)
{
/* view the 64-bit element as bytes; c[pass] is its pass-th byte in memory order */
unsigned int8 *c = (unsigned int8*) &code[k];
g[c[pass]] ++;
}
if (programIndex == programCount-1) /* remainder is processed by the last lane */
{
for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
g[c[pass]] ++;
}
}
/* publish the private counts */
cfor (int j = 0; j < 256; j ++)
{
hist[j*programCount*taskCount+i] = g[j];
}
}
/* Task: scatters this task's part of code[] into perm[] according to byte
   number `pass` of each element.

   The elements are partitioned among tasks and program instances exactly as
   in histogram(). Each instance loads its 256 starting offsets from
   hist[j*programCount*taskCount + i], then for every element writes it to
   perm[offset of its byte value] and advances that offset by one.
   NOTE(review): this presumably relies on hist[] having been converted to
   exclusive prefix sums beforehand -- confirm against the caller. */
task void permutation (uniform int span, uniform int n, uniform int64 code[], uniform int pass, uniform int hist[], uniform int64 perm[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
uniform int strip = (end-start)/programCount;
uniform int tail = (end-start)%programCount;
/* global index of this program instance across all tasks */
int i = programCount*taskIndex + programIndex;
/* per-instance next-write offsets, one per byte value */
int g [256];
cfor (int j = 0; j < 256; j ++)
{
g[j] = hist[j*programCount*taskCount+i];
}
cfor (int k = start+programIndex*strip; k < start+(programIndex+1)*strip; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
/* destination slot for this element, then bump the offset */
int l = g[c[pass]];
perm[l] = code[k];
g[c[pass]] = l+1;
}
if (programIndex == programCount-1) /* remainder is processed by the last lane */
{
for (int k = start+programCount*strip; k < start+programCount*strip+tail; k ++)
{
unsigned int8 *c = (unsigned int8*) &code[k];
int l = g[c[pass]];
perm[l] = code[k];
g[c[pass]] = l+1;
}
}
}
/* Task: copies elements [start, end) of from[] to to[], where the range is
   `span` elements starting at taskIndex*span (the last task runs up to n). */
task void copy (uniform int span, uniform int n, uniform int64 from[], uniform int64 to[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
foreach (i = start ... end)
{
to[i] = from[i];
}
}
/* Task: for this task's range [start, end), combines each key code[i] with
   its index i into one 64-bit value: the index is shifted into the upper
   32 bits and the key is added to it. */
task void pack (uniform int span, uniform int n, uniform unsigned int code[], uniform int64 pair[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
foreach (i = start ... end)
{
pair[i] = ((int64)i<<32)+code[i];
}
}
/* Task: inverse of pack() for this task's range [start, end): stores
   pair[i] converted to unsigned int in code[i] and pair[i]>>32 in order[i].
   NOTE(review): the assignment to code[i] presumably keeps the low 32 bits
   of the pair -- confirm the ISPC conversion rules. */
task void unpack (uniform int span, uniform int n, uniform int64 pair[], uniform int unsigned code[], uniform int order[])
{
uniform int start = taskIndex*span;
uniform int end = taskIndex == taskCount-1 ? n : start+span;
foreach (i = start ... end)
{
code[i] = pair[i];
order[i] = pair[i]>>32;
}
}
/* Task: replaces this task's chunk of h[] (256*programCount entries starting
   at 256*programCount*taskIndex) by its exclusive running sum, computed
   serially, and stores the chunk's total in g[taskIndex]. */
task void addup (uniform int h[], uniform int g[])
{
uniform int * uniform u = &h[256*programCount*taskIndex];
uniform int i, x, y = 0;
for (i = 0; i < 256*programCount; i ++)
{
/* y holds the sum of all entries before position i */
x = u[i];
u[i] = y;
y += x;
}
g[taskIndex] = y;
}
/* Task: adds the offset g[taskIndex] to every entry of this task's chunk of
   h[] (256*programCount entries starting at 256*programCount*taskIndex). */
task void bumpup (uniform int h[], uniform int g[])
{
uniform int * uniform u = &h[256*programCount*taskIndex];
uniform int z = g[taskIndex];
foreach (i = 0 ... 256*programCount)
{
u[i] += z;
}
}
/* Turns h[] (num chunks of 256*programCount entries each) into its exclusive
   prefix sum in three steps:
     1. addup: each task scans its own chunk and writes the chunk total to
        g[taskIndex+1] (it is handed g+1);
     2. the totals are accumulated serially so that g[i] is the sum of the
        totals of chunks 0..i-1;
     3. bumpup: each task adds g[taskIndex] to every entry of its chunk. */
static void prefix_sum (uniform int num, uniform int h[])
{
/* one extra slot: g[0] is the zero offset of the first chunk */
uniform int * uniform g = uniform new int [num+1];
uniform int i;
launch[num] addup (h, g+1);
sync;
for (g[0] = 0, i = 1; i < num; i ++) g[i] += g[i-1];
launch[num] bumpup (h, g);
sync;
delete g;
}
/* Exported entry point: sorts the n keys in code[] in place and stores in
   order[i] the original index of the key that ends up at position i.

     n      - number of keys
     code   - keys; overwritten with the sorted sequence
     order  - output: original positions of the sorted keys
     ntasks - number of tasks to launch; values < 1 select num_cores()

   The keys are packed together with their indices into 64-bit pairs, which
   then go through four passes (pass = 0..3), each consisting of a histogram
   of one byte, a prefix sum over the histogram, a scatter into temp[] and a
   copy back into pair[]. Finally the pairs are unpacked.
   NOTE(review): bytes 0..3 are presumably the four bytes of the 32-bit key
   on a little-endian target -- confirm. */
export void sort_ispc (uniform int n, uniform unsigned int code[], uniform int order[], uniform int ntasks)
{
uniform int num = ntasks < 1 ? num_cores () : ntasks;
/* elements per task; each task function gives the last task everything up to n */
uniform int span = n / num;
/* one counter per byte value, program instance and task */
uniform int hsize = 256*programCount*num;
uniform int * uniform hist = uniform new int [hsize];
uniform int64 * uniform pair = uniform new int64 [n];
uniform int64 * uniform temp = uniform new int64 [n];
uniform int pass, i;
#if DEBUG
/* debug builds print small inputs */
if (n < 100)
{
print ("input: ");
for (i = 0; i < n; i ++) print ("%, ", code[i]);
print ("\n");
}
#endif
launch[num] pack (span, n, code, pair);
sync;
for (pass = 0; pass < 4; pass ++)
{
launch[num] histogram (span, n, pair, pass, hist);
sync;
prefix_sum (num, hist);
launch[num] permutation (span, n, pair, pass, hist, temp);
sync;
launch[num] copy (span, n, temp, pair);
sync;
}
launch[num] unpack (span, n, pair, code, order);
sync;
#if DEBUG
/* debug builds verify that the output is non-decreasing */
for (i = 0; i < n; i ++)
{
if (i > 0 && code[i-1] > code[i])
print ("ERR at % => % > %; ", i, code[i-1], code[i]);
}
if (n < 100)
{
print ("output: ");
for (i = 0; i < n; i ++) print ("%, ", code[i]);
print ("\n");
print ("order: ");
for (i = 0; i < n; i ++) print ("%, ", order[i]);
print ("\n");
}
#endif
delete hist;
delete pair;
delete temp;
}

View File

@@ -0,0 +1,60 @@
/*
Copyright (c) 2013, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Author: Tomasz Koziara */
#include <vector>
#include <algorithm>
#include <utility>
/* (key, original index) record used by the serial reference sort */
typedef std::pair<double,int> pair;

/* Orders records by key only; the index takes no part in the comparison. */
struct cmp
{
    bool operator() (const pair& lhs, const pair& rhs) { return lhs.first < rhs.first; }
};

/* Serial reference implementation: writes to order[0..n-1] the indices of
   code[] arranged so that the referenced keys are in non-decreasing order.
   code[] itself is not modified. */
void sort_serial (int n, unsigned int code[], int order[])
{
    std::vector<pair> keyed;
    keyed.reserve (n);

    int idx = 0;
    while (idx < n)
    {
        keyed.push_back (std::make_pair ((double) code[idx], idx));
        ++idx;
    }

    std::sort (keyed.begin(), keyed.end(), cmp());

    for (std::vector<pair>::size_type rank = 0; rank < keyed.size(); ++rank)
        order[rank] = keyed[rank].second;
}

View File

@@ -132,7 +132,7 @@ int main() {
printf("[stencil serial]:\t\t[%.3f] millon cycles\n", minTimeSerial);
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2f from ISPC + tasks)\n",
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minTimeSerial / minTimeISPC, minTimeSerial / minTimeISPCTasks);
// Check for agreement

View File

@@ -344,7 +344,7 @@ static inline void
lMemFence() {
// Windows atomic functions already contain the fence
// KNC doesn't need the memory barrier
#if !defined ISPC_IS_KNC || !defined ISPC_IS_WINDOWS
#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS
__asm__ __volatile__("mfence":::"memory");
#endif
}
@@ -374,7 +374,7 @@ lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) {
static int32_t
lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) {
#ifdef ISPC_IS_WINDOWS
return InterlockedCompareExchange(v, newValue, oldValue);
return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue);
#else
int32_t result;
__asm__ __volatile__("lock\ncmpxchgl %2,%1"
@@ -389,7 +389,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
return InterlockedAdd(v, delta);
return InterlockedAdd((volatile LONG *)v, delta);
#else
int32_t origValue;
__asm__ __volatile__("lock\n"

View File

@@ -207,7 +207,7 @@ int main(int argc, char *argv[]) {
printf("[volume serial]:\t\t[%.3f] millon cycles\n", minSerial);
writePPM(image, width, height, "volume-serial.ppm");
printf("\t\t\t\t(%.2fx speedup from ISPC serial, %.2fx from ISPC+tasks)\n",
printf("\t\t\t\t(%.2fx speedup from ISPC, %.2fx speedup from ISPC + tasks)\n",
minSerial/minISPC, minSerial / minISPCtasks);
return 0;

2
ispc.h
View File

@@ -38,7 +38,7 @@
#ifndef ISPC_H
#define ISPC_H
#define ISPC_VERSION "1.4.3"
#define ISPC_VERSION "1.4.4"
#if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4)
#error "Only LLVM 3.1, 3.2, 3.3 and the 3.4 development branch are supported"

View File

@@ -23,8 +23,8 @@ Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
AddToWorkList(SmallShift.getNode());
- return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift);
+ APInt Mask = APInt::getAllOnesValue(VT.getSizeInBits()).lshr(ShiftAmt);
+ return DAG.getNode(ISD::AND, SDLoc(N), VT,
+ DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SmallShift),
+ return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+ DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift),
+ DAG.getConstant(Mask, VT));
}
}

View File

@@ -1207,6 +1207,40 @@ lCheckMulForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
}
/** Checks to see if (op0 << op1) is a linear vector where the result is a
    vector with values that increase by stride.

    The shift is treated as a multiply by (1 << shift amount): op1 must be
    the same constant integer in every lane, that multiplier must evenly
    divide stride, and op0 must itself be linear with a step of
    stride / multiplier.  Returns true if so and false otherwise.
 */
static bool
lCheckShlForLinear(llvm::Value *op0, llvm::Value *op1, int vectorLength,
                   int stride, std::vector<llvm::PHINode *> &seenPhis) {
    // Is the second operand a constant integer value splatted across all of
    // the lanes?
    llvm::ConstantDataVector *cv = llvm::dyn_cast<llvm::ConstantDataVector>(op1);
    if (cv == NULL)
        return false;

    llvm::Constant *csplat = cv->getSplatValue();
    if (csplat == NULL)
        return false;

    llvm::ConstantInt *splat = llvm::dyn_cast<llvm::ConstantInt>(csplat);
    if (splat == NULL)
        return false;

    // Shifting by a negative amount, or by the width of the type or more,
    // is undefined in C++.  Reject such amounts up front; a multiplier of
    // 2^63 or more could never divide an int stride anyway.
    int64_t shiftAmount = splat->getSExtValue();
    if (shiftAmount < 0 || shiftAmount > 62)
        return false;

    // If (1 << the splat value) doesn't evenly divide the stride we're
    // looking for, there's no way that we can get the linear sequence
    // we're looking for.  The shift is done in 64 bits so that it cannot
    // overflow for any amount accepted above.
    int64_t equivalentMul = ((int64_t)1 << shiftAmount);
    if (equivalentMul > stride || (stride % equivalentMul) != 0)
        return false;

    // Check to see if the first operand is a linear vector with stride
    // given by stride/splatVal.
    return lVectorIsLinear(op0, vectorLength, (int)(stride / equivalentMul),
                           seenPhis);
}
/** Given (op0 AND op1), try and see if we can determine if the result is a
linear sequence with a step of "stride" between values. Returns true
if so and false otherwise. This pattern comes up when accessing SOA
@@ -1290,6 +1324,12 @@ lVectorIsLinear(llvm::Value *v, int vectorLength, int stride,
bool m1 = lCheckMulForLinear(op1, op0, vectorLength, stride, seenPhis);
return m1;
}
else if (bop->getOpcode() == llvm::Instruction::Shl) {
// Sometimes multiplies come in as shift lefts (especially in
// LLVM 3.4+).
bool linear = lCheckShlForLinear(op0, op1, vectorLength, stride, seenPhis);
return linear;
}
else if (bop->getOpcode() == llvm::Instruction::And) {
// Special case for some AND-related patterns that come up when
// looping over SOA data

View File

@@ -1064,7 +1064,12 @@ Module::writeObjectFileOrAssembly(llvm::TargetMachine *targetMachine,
llvm::TargetMachine::CodeGenFileType fileType = (outputType == Object) ?
llvm::TargetMachine::CGFT_ObjectFile : llvm::TargetMachine::CGFT_AssemblyFile;
bool binary = (fileType == llvm::TargetMachine::CGFT_ObjectFile);
#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
unsigned int flags = binary ? llvm::raw_fd_ostream::F_Binary : 0;
#else
llvm::sys::fs::OpenFlags flags = binary ? llvm::sys::fs::F_Binary :
llvm::sys::fs::F_None;
#endif
std::string error;
llvm::tool_output_file *of = new llvm::tool_output_file(outFileName, error, flags);

View File

@@ -405,10 +405,11 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
compile_error_files = [ ]
run_error_files = [ ]
skip_files = [ ]
while True:
filename = queue.get()
if (filename == 'STOP'):
queue_ret.put((compile_error_files, run_error_files))
queue_ret.put((compile_error_files, run_error_files, skip_files))
if is_windows:
try:
os.remove("test_static.obj")
@@ -434,7 +435,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
with mutex:
update_progress(filename, total_tests_arg, counter, max_test_length_arg)
else:
queue_skip.put(filename)
skip_files += [ filename ]
task_threads = []
@@ -449,6 +450,7 @@ if __name__ == '__main__':
compile_error_files = [ ]
run_error_files = [ ]
skip_files = [ ]
nthreads = min(multiprocessing.cpu_count(), options.num_jobs)
nthreads = min(nthreads, len(files))
@@ -488,16 +490,16 @@ if __name__ == '__main__':
sys.stdout.write("Elapsed time: %d s\n" % elapsed_time)
while not qret.empty():
(c, r) = qret.get()
(c, r, s) = qret.get()
compile_error_files += c
run_error_files += r
skip_files += s
skip = 0
if qskip.qsize() > 0:
sys.stdout.write("%d / %d tests SKIPPED:\n" % (qskip.qsize(), total_tests))
while not qskip.empty():
sys.stdout.write("\t%s\n" % qskip.get())
if len(skip_files) > 0:
skip_files.sort()
sys.stdout.write("%d / %d tests SKIPPED:\n" % (len(skip_files), total_tests))
for f in skip_files:
sys.stdout.write("\t%s\n" % f)
if len(compile_error_files) > 0:
compile_error_files.sort()
sys.stdout.write("%d / %d tests FAILED compilation:\n" % (len(compile_error_files), total_tests))