added cuda examples
This commit is contained in:
3
examples_cuda/mandelbrot/.gitignore
vendored
Normal file
3
examples_cuda/mandelbrot/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
mandelbrot
|
||||
*.ppm
|
||||
objs
|
||||
8
examples_cuda/mandelbrot/Makefile
Normal file
8
examples_cuda/mandelbrot/Makefile
Normal file
@@ -0,0 +1,8 @@
|
||||
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
BIN
examples_cuda/mandelbrot/avx.out
Executable file
BIN
examples_cuda/mandelbrot/avx.out
Executable file
Binary file not shown.
BIN
examples_cuda/mandelbrot/avx1.out
Executable file
BIN
examples_cuda/mandelbrot/avx1.out
Executable file
Binary file not shown.
118
examples_cuda/mandelbrot/mandelbrot.cpp
Normal file
118
examples_cuda/mandelbrot/mandelbrot.cpp
Normal file
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#define NOMINMAX
|
||||
#pragma warning (disable: 4244)
|
||||
#pragma warning (disable: 4305)
|
||||
#endif
|
||||
|
||||
#include <stdio.h>
|
||||
#include <algorithm>
|
||||
#include "../timing.h"
|
||||
#include "mandelbrot_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
||||
int width, int height, int maxIterations,
|
||||
int output[]);
|
||||
|
||||
/* Write a binary PPM (P6) image file with the image of the Mandelbrot set.
 *
 * buf    - width*height iteration counts, one per pixel, row after row
 * width  - image width in pixels
 * height - image height in pixels
 * fn     - path of the file to create (truncated if it already exists)
 *
 * If the file cannot be opened, an error is reported on stderr and nothing
 * is written.
 */
static void
writePPM(int *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (fp == NULL) {
        // Without this check every fprintf/fputc below would be handed a
        // null stream.
        fprintf(stderr, "Unable to open image file %s for writing\n", fn);
        return;
    }

    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");

    const int pixelCount = width * height;
    for (int i = 0; i < pixelCount; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys, chosen by the parity of the count.  The value is held
        // in an unsigned char because 240 does not fit in a signed char.
        const unsigned char grey = (buf[i] & 0x1) ? 240 : 20;
        // Same value for R, G and B.
        for (int channel = 0; channel < 3; ++channel)
            fputc(grey, fp);
    }

    fclose(fp);
    printf("Wrote image file %s\n", fn);
}
|
||||
|
||||
|
||||
int main() {
|
||||
unsigned int width = 768;
|
||||
unsigned int height = 512;
|
||||
float x0 = -2;
|
||||
float x1 = 1;
|
||||
float y0 = -1;
|
||||
float y1 = 1;
|
||||
|
||||
int maxIterations = 256;
|
||||
int *buf = new int[width*height];
|
||||
|
||||
//
|
||||
// Compute the image using the ispc implementation; report the minimum
|
||||
// time of three runs.
|
||||
//
|
||||
double minISPC = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minISPC = std::min(minISPC, dt);
|
||||
}
|
||||
|
||||
printf("[mandelbrot ispc]:\t\t[%.3f] million cycles\n", minISPC);
|
||||
writePPM(buf, width, height, "mandelbrot-ispc.ppm");
|
||||
|
||||
// Clear out the buffer
|
||||
for (unsigned int i = 0; i < width * height; ++i)
|
||||
buf[i] = 0;
|
||||
|
||||
//
|
||||
// And run the serial implementation 3 times, again reporting the
|
||||
// minimum time.
|
||||
//
|
||||
double minSerial = 1e30;
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
reset_and_start_timer();
|
||||
mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
|
||||
double dt = get_elapsed_mcycles();
|
||||
minSerial = std::min(minSerial, dt);
|
||||
}
|
||||
|
||||
printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
|
||||
writePPM(buf, width, height, "mandelbrot-serial.ppm");
|
||||
|
||||
printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
|
||||
|
||||
return 0;
|
||||
}
|
||||
78
examples_cuda/mandelbrot/mandelbrot.ispc
Normal file
78
examples_cuda/mandelbrot/mandelbrot.ispc
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
Copyright (c) 2010-2012, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/* Iterate z = z*z + c starting from z = c, where c = c_re + i*c_im, and
   return how many iterations ran before |z|^2 exceeded 4.  The result is
   capped at count. */
static inline int mandel(float c_re, float c_im, int count) {
    float z_re = c_re;
    float z_im = c_im;
    int iter = 0;

    while (iter < count) {
        // Stop as soon as the squared magnitude of z exceeds 4.
        if (z_re * z_re + z_im * z_im > 4.)
            break;

        float next_re = z_re*z_re - z_im*z_im;
        float next_im = 2.f * z_re * z_im;

        // The store back into z runs with the execution mask ignored.
        unmasked {
            z_re = c_re + next_re;
            z_im = c_im + next_im;
        }

        ++iter;
    }

    return iter;
}
|
||||
|
||||
/* Compute the Mandelbrot iteration count for every pixel of a width x height
   image.  Pixel (i, j) is evaluated at the point
   (x0 + i*(x1-x0)/width, y0 + j*(y1-y0)/height), and the count is stored
   row-major at output[j * width + i].  Each count is capped at
   maxIterations. */
export void mandelbrot_ispc(uniform float x0, uniform float y0,
                            uniform float x1, uniform float y1,
                            uniform int width, uniform int height,
                            uniform int maxIterations,
                            uniform int output[])
{
    // The per-pixel step sizes are computed from uniform values only, so
    // they are declared uniform rather than varying.
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;

    for (uniform int j = 0; j < height; j++) {
        // The imaginary coordinate depends only on the row, so compute it
        // once per row instead of once per pixel.
        uniform float y = y0 + j * dy;

        // foreach spreads the columns of this row across the program
        // instances, so i (and therefore x and index) is varying.  It also
        // handles a final partial group of columns, so width does not need
        // to be a multiple of programCount.
        foreach (i = 0 ... width) {
            // Position on the real axis for this instance's column.
            float x = x0 + i * dx;

            int index = j * width + i;
            output[index] = mandel(x, y, maxIterations);
        }
    }
}
|
||||
175
examples_cuda/mandelbrot/mandelbrot.vcxproj
Normal file
175
examples_cuda/mandelbrot/mandelbrot.vcxproj
Normal file
@@ -0,0 +1,175 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>mandelbrot</RootNamespace>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>
|
||||
</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
|
||||
<FloatingPointModel>Fast</FloatingPointModel>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="mandelbrot.cpp" />
|
||||
<ClCompile Include="mandelbrot_serial.cpp" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CustomBuild Include="mandelbrot.ispc">
|
||||
<FileType>Document</FileType>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
|
||||
</CustomBuild>
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
68
examples_cuda/mandelbrot/mandelbrot_serial.cpp
Normal file
68
examples_cuda/mandelbrot/mandelbrot_serial.cpp
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
/* Iterate z = z*z + c starting from z = c, where c = c_re + i*c_im, and
   return how many iterations ran before |z|^2 exceeded 4.  The result is
   capped at count. */
static int mandel(float c_re, float c_im, int count) {
    float z_re = c_re;
    float z_im = c_im;
    int iter = 0;

    while (iter < count) {
        // Stop as soon as the squared magnitude of z exceeds 4.
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        // Both components of z*z are computed from the old z before either
        // one is overwritten.
        float next_re = z_re*z_re - z_im*z_im;
        float next_im = 2.f * z_re * z_im;
        z_re = c_re + next_re;
        z_im = c_im + next_im;

        ++iter;
    }

    return iter;
}
|
||||
|
||||
void mandelbrot_serial(float x0, float y0, float x1, float y1,
|
||||
int width, int height, int maxIterations,
|
||||
int output[])
|
||||
{
|
||||
float dx = (x1 - x0) / width;
|
||||
float dy = (y1 - y0) / height;
|
||||
|
||||
for (int j = 0; j < height; j++) {
|
||||
for (int i = 0; i < width; ++i) {
|
||||
float x = x0 + i * dx;
|
||||
float y = y0 + j * dy;
|
||||
|
||||
int index = (j * width + i);
|
||||
output[index] = mandel(x, y, maxIterations);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BIN
examples_cuda/mandelbrot/out.o
Normal file
BIN
examples_cuda/mandelbrot/out.o
Normal file
Binary file not shown.
843
examples_cuda/mandelbrot/out.ptx
Normal file
843
examples_cuda/mandelbrot/out.ptx
Normal file
@@ -0,0 +1,843 @@
|
||||
//
|
||||
// Generated by LLVM NVPTX Back-End
|
||||
//
|
||||
|
||||
.version 3.1
|
||||
.target sm_35, texmode_independent
|
||||
.address_size 64
|
||||
|
||||
// .globl __vselect_i8
|
||||
// @__vselect_i8
|
||||
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
|
||||
.param .align 1 .b8 __vselect_i8_param_0[1],
|
||||
.param .align 1 .b8 __vselect_i8_param_1[1],
|
||||
.param .align 4 .b8 __vselect_i8_param_2[4]
|
||||
)
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u32 %r0, [__vselect_i8_param_2];
|
||||
setp.eq.s32 %p0, %r0, 0;
|
||||
ld.param.u8 %rc0, [__vselect_i8_param_0];
|
||||
ld.param.u8 %rc1, [__vselect_i8_param_1];
|
||||
selp.b16 %rc0, %rc0, %rc1, %p0;
|
||||
st.param.b8 [func_retval0+0], %rc0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __vselect_i16
|
||||
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
|
||||
.param .align 2 .b8 __vselect_i16_param_0[2],
|
||||
.param .align 2 .b8 __vselect_i16_param_1[2],
|
||||
.param .align 4 .b8 __vselect_i16_param_2[4]
|
||||
) // @__vselect_i16
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u32 %r0, [__vselect_i16_param_2];
|
||||
setp.eq.s32 %p0, %r0, 0;
|
||||
ld.param.u16 %rs0, [__vselect_i16_param_0];
|
||||
ld.param.u16 %rs1, [__vselect_i16_param_1];
|
||||
selp.b16 %rs0, %rs0, %rs1, %p0;
|
||||
st.param.b16 [func_retval0+0], %rs0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __vselect_i64
|
||||
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
|
||||
.param .align 8 .b8 __vselect_i64_param_0[8],
|
||||
.param .align 8 .b8 __vselect_i64_param_1[8],
|
||||
.param .align 4 .b8 __vselect_i64_param_2[4]
|
||||
) // @__vselect_i64
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u32 %r0, [__vselect_i64_param_2];
|
||||
setp.eq.s32 %p0, %r0, 0;
|
||||
ld.param.u64 %rl0, [__vselect_i64_param_0];
|
||||
ld.param.u64 %rl1, [__vselect_i64_param_1];
|
||||
selp.b64 %rl0, %rl0, %rl1, %p0;
|
||||
st.param.b64 [func_retval0+0], %rl0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __aos_to_soa4_float1
|
||||
.func __aos_to_soa4_float1(
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
|
||||
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
|
||||
.param .b64 __aos_to_soa4_float1_param_4,
|
||||
.param .b64 __aos_to_soa4_float1_param_5,
|
||||
.param .b64 __aos_to_soa4_float1_param_6,
|
||||
.param .b64 __aos_to_soa4_float1_param_7
|
||||
) // @__aos_to_soa4_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
|
||||
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
|
||||
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
|
||||
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
|
||||
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
|
||||
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
|
||||
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
|
||||
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
st.f32 [%rl3], %f3;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __soa_to_aos4_float1
|
||||
.func __soa_to_aos4_float1(
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
|
||||
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
|
||||
.param .b64 __soa_to_aos4_float1_param_4,
|
||||
.param .b64 __soa_to_aos4_float1_param_5,
|
||||
.param .b64 __soa_to_aos4_float1_param_6,
|
||||
.param .b64 __soa_to_aos4_float1_param_7
|
||||
) // @__soa_to_aos4_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
|
||||
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
|
||||
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
|
||||
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
|
||||
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
|
||||
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
|
||||
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
|
||||
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
st.f32 [%rl3], %f3;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __aos_to_soa3_float1
|
||||
.func __aos_to_soa3_float1(
|
||||
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
|
||||
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
|
||||
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
|
||||
.param .b64 __aos_to_soa3_float1_param_3,
|
||||
.param .b64 __aos_to_soa3_float1_param_4,
|
||||
.param .b64 __aos_to_soa3_float1_param_5
|
||||
) // @__aos_to_soa3_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
|
||||
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
|
||||
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
|
||||
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
|
||||
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
|
||||
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __soa_to_aos3_float1
|
||||
.func __soa_to_aos3_float1(
|
||||
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
|
||||
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
|
||||
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
|
||||
.param .b64 __soa_to_aos3_float1_param_3,
|
||||
.param .b64 __soa_to_aos3_float1_param_4,
|
||||
.param .b64 __soa_to_aos3_float1_param_5
|
||||
) // @__soa_to_aos3_float1
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
|
||||
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
|
||||
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
|
||||
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
|
||||
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
|
||||
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
|
||||
st.f32 [%rl0], %f0;
|
||||
st.f32 [%rl1], %f1;
|
||||
st.f32 [%rl2], %f2;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl __rsqrt_varying_double
|
||||
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
|
||||
.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
|
||||
) // @__rsqrt_varying_double
|
||||
{
|
||||
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0:
|
||||
ld.param.f64 %fl0, [__rsqrt_varying_double_param_0];
|
||||
rsqrt.approx.f64 %fl0, %fl0;
|
||||
st.param.f64 [func_retval0+0], %fl0;
|
||||
ret;
|
||||
}
|
||||
|
||||
// .globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
|
||||
// mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
// Fills a row-major grid of 32-bit iteration counts, one per (column, row) point.
//   param_0, param_1 : f32 base coordinates of the grid
//   param_2, param_3 : f32 opposite-corner coordinates; only used to derive the step sizes
//   param_4          : s32 number of columns
//   param_5          : s32 number of rows
//   param_6          : s32 iteration limit
//   param_7          : 64-bit base address of the output array
//   param_8          : 4-byte value of which only the sign is tested
//                      NOTE(review): presumably the ISPC execution mask, and params 0-7 are
//                      presumably x0, y0, x1, y1, width, height, maxIterations, output[] --
//                      confirm against mandelbrot.ispc.
// For each point, c = (param_0 + column*dx, param_1 + row*dy) and z starts at c; the stored
// value counts the z <- z*z + c steps taken before |z|^2 exceeds 4.0, capped at param_6.
.func mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5,
|
||||
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6,
|
||||
.param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7,
|
||||
.param .align 4 .b8 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8[4]
|
||||
) // @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
|
||||
{
|
||||
// Register file declarations (396 of every class).
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0: // %allocas
|
||||
// Load all nine parameters: %f0/%f1 = param_0/1, %f3/%f2 = param_2/3,
// %r0 = columns, %r1 = rows, %r2 = iteration limit, %rl0 = output base, %r3 = param_8.
ld.param.f32 %f0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0];
|
||||
ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1];
|
||||
ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2];
|
||||
ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3];
|
||||
ld.param.u32 %r0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4];
|
||||
ld.param.u32 %r1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5];
|
||||
ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6];
|
||||
ld.param.u64 %rl0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7];
|
||||
ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8];
|
||||
// %p0 = (param_8 < 0): chooses between the two copies of the loop nest below.
setp.lt.s32 %p0, %r3, 0;
|
||||
// Step sizes: %f3 = (param_2 - param_0) / float(columns),
//             %f2 = (param_3 - param_1) / float(rows).
sub.f32 %f3, %f3, %f0;
|
||||
cvt.rn.f32.s32 %f4, %r0;
|
||||
sub.f32 %f2, %f2, %f1;
|
||||
cvt.rn.f32.s32 %f5, %r1;
|
||||
div.rn.f32 %f2, %f2, %f5;
|
||||
div.rn.f32 %f3, %f3, %f4;
|
||||
@%p0 bra BB8_9;
|
||||
// BB#1: // %for_test110.preheader
|
||||
// First copy of the loop nest (param_8 >= 0). Exit immediately when rows < 1.
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB8_45;
|
||||
// BB#2: // %outer_not_in_extras140.preheader.lr.ph
|
||||
// %r4 = -1 when the iteration limit is positive, else 0 (initial "running" flag).
// %r5 = bytes per output row, %r3 = byte offset of the current row, %r6 = row index.
setp.gt.s32 %p0, %r2, 0;
|
||||
mov.u32 %r3, 0;
|
||||
selp.b32 %r4, -1, 0, %p0;
|
||||
shl.b32 %r5, %r0, 2;
|
||||
mov.u32 %r6, %r3;
|
||||
BB8_3: // %outer_not_in_extras140.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB8_41 Depth 2
|
||||
// Child Loop BB8_43 Depth 2
|
||||
// Child Loop BB8_38 Depth 2
|
||||
// Child Loop BB8_33 Depth 3
|
||||
// Row loop. A row with no columns (%r0 < 1) is routed to BB8_4.
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_4;
|
||||
// BB#31: // %foreach_full_body120.lr.ph
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
// Positive iteration limit (%r4 < 0): compute the row in BB8_32.
// Otherwise zero-fill the row in BB8_43 (%r7 = remaining columns, %r8 = byte offset).
setp.lt.s32 %p0, %r4, 0;
|
||||
mov.u32 %r7, %r0;
|
||||
mov.u32 %r8, %r3;
|
||||
@%p0 bra BB8_32;
|
||||
bra.uni BB8_43;
|
||||
BB8_32: // in Loop: Header=BB8_3 Depth=1
|
||||
// %rl1 = column index, %f4 = param_1 + row * %f2 (imaginary part of c),
// %r7 = row * columns (element index of the row start).
mov.u64 %rl1, 0;
|
||||
cvt.rn.f32.s32 %f4, %r6;
|
||||
fma.rn.f32 %f4, %f2, %f4, %f1;
|
||||
mul.lo.s32 %r7, %r6, %r0;
|
||||
BB8_38: // %for_loop.i380.lr.ph.us
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB8_33 Depth 3
|
||||
// Column loop. %f5 = param_0 + column * %f3 (real part of c).
// Per-point state: %f7/%f6 = re/im of z (initialised to c), %r9 = count,
// %r12 = running flag (starts as %r4), %r11 = accumulated escape flag, %r10 = constant 0.
cvt.u32.u64 %r8, %rl1;
|
||||
cvt.rn.f32.s32 %f5, %r8;
|
||||
fma.rn.f32 %f5, %f3, %f5, %f0;
|
||||
mov.u32 %r10, 0;
|
||||
mov.u32 %r12, %r4;
|
||||
mov.u32 %r11, %r10;
|
||||
mov.u32 %r9, %r10;
|
||||
mov.f32 %f7, %f5;
|
||||
mov.f32 %f6, %f4;
|
||||
BB8_33: // %for_loop.i380.us
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// Parent Loop BB8_38 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
// Escape test: %f9 = re*re + im*im compared against 0f40800000 (4.0f).
// setp.gtu is "greater than or unordered", so a NaN magnitude also counts as escaped.
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p0, %f9, 0f40800000;
|
||||
selp.b32 %r13, %r12, 0, %p0;
|
||||
or.b32 %r11, %r13, %r11;
|
||||
// Compare the sign bits of the escape flag and the running flag: when they match,
// %r12 is cleared in BB8_34; otherwise z is advanced in BB8_35.
shr.u32 %r13, %r11, 31;
|
||||
shr.u32 %r14, %r12, 31;
|
||||
setp.eq.s32 %p0, %r13, %r14;
|
||||
@%p0 bra BB8_34;
|
||||
bra.uni BB8_35;
|
||||
BB8_34: // in Loop: Header=BB8_33 Depth=3
|
||||
mov.u32 %r12, %r10;
|
||||
bra.uni BB8_36;
|
||||
BB8_35: // %not_all_continued_or_breaked.i394.us
|
||||
// in Loop: Header=BB8_33 Depth=3
|
||||
// z <- z*z + c:  re' = (re*re - im*im) + c_re,  im' = im * (2*re) + c_im.
// The running flag is masked with the complement of the escape flag.
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r13, %r11;
|
||||
and.b32 %r12, %r12, %r13;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f5, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f4;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_36: // %for_step.i363.us
|
||||
// in Loop: Header=BB8_33 Depth=3
|
||||
// count += (running != 0); running is cleared once count reaches the limit (%r2);
// the loop repeats while the running flag is still negative.
setp.ne.s32 %p0, %r12, 0;
|
||||
selp.u32 %r13, 1, 0, %p0;
|
||||
add.s32 %r9, %r9, %r13;
|
||||
setp.lt.s32 %p0, %r9, %r2;
|
||||
selp.b32 %r12, %r12, 0, %p0;
|
||||
setp.lt.s32 %p0, %r12, 0;
|
||||
@%p0 bra BB8_33;
|
||||
// BB#37: // %mandel___vyfvyfvyi.exit395.us
|
||||
// in Loop: Header=BB8_38 Depth=2
|
||||
// Store the count at output + 4 * (column + row * columns); the byte offset is
// computed in 32 bits and then sign-extended to 64 bits.
add.s32 %r8, %r8, %r7;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl2, %r8;
|
||||
add.s64 %rl2, %rl2, %rl0;
|
||||
st.u32 [%rl2], %r9;
|
||||
// Next column; the row is finished when the column index equals %r0.
add.s64 %rl1, %rl1, 1;
|
||||
cvt.u32.u64 %r8, %rl1;
|
||||
setp.eq.s32 %p0, %r8, %r0;
|
||||
@%p0 bra BB8_44;
|
||||
bra.uni BB8_38;
|
||||
BB8_43: // %mandel___vyfvyfvyi.exit395
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
// Zero-fill path: write 0 to each of the row's entries, stepping the byte offset
// %r8 by 4 and counting %r7 down to 0.
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
mov.u32 %r9, 0;
|
||||
st.u32 [%rl1], %r9;
|
||||
add.s32 %r8, %r8, 4;
|
||||
add.s32 %r7, %r7, -1;
|
||||
setp.eq.s32 %p0, %r7, 0;
|
||||
@%p0 bra BB8_44;
|
||||
bra.uni BB8_43;
|
||||
BB8_4: // %partial_inner_all_outer156
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
// %p0 still holds (%r0 < 1) from the only branch that targets BB8_4, so this
// branch is taken whenever the label is reached that way.
@%p0 bra BB8_44;
|
||||
// BB#5: // %partial_inner_only197
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
// Single-point variant of the computation above for column 0 (the real part is
// param_0 + %f3 * 0.0), with the flags additionally gated by %p0 = (%r0 > 0).
setp.gt.s32 %p0, %r0, 0;
|
||||
mov.u32 %r8, 0;
|
||||
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
|
||||
cvt.rn.f32.s32 %f5, %r6;
|
||||
fma.rn.f32 %f5, %f2, %f5, %f1;
|
||||
selp.b32 %r7, %r4, 0, %p0;
|
||||
setp.lt.s32 %p1, %r7, 0;
|
||||
mov.u32 %r10, %r4;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.u32 %r7, %r8;
|
||||
mov.f32 %f7, %f4;
|
||||
mov.f32 %f6, %f5;
|
||||
@%p1 bra BB8_41;
|
||||
bra.uni BB8_6;
|
||||
BB8_41: // %for_loop.i
|
||||
// Parent Loop BB8_3 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
// Same escape test as BB8_33; here %f7/%f6 = re/im, %r10 = running flag,
// %r9 = escape flag, %r7 = count, and %p1 carries the per-step predicates.
selp.b32 %r11, %r10, 0, %p0;
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p1, %f9, 0f40800000;
|
||||
selp.b32 %r12, %r10, 0, %p1;
|
||||
or.b32 %r9, %r12, %r9;
|
||||
selp.b32 %r12, %r9, 0, %p0;
|
||||
shr.u32 %r12, %r12, 31;
|
||||
shr.u32 %r11, %r11, 31;
|
||||
setp.eq.s32 %p1, %r12, %r11;
|
||||
@%p1 bra BB8_42;
|
||||
bra.uni BB8_39;
|
||||
BB8_42: // in Loop: Header=BB8_41 Depth=2
|
||||
mov.u32 %r10, %r8;
|
||||
bra.uni BB8_40;
|
||||
BB8_39: // %not_all_continued_or_breaked.i
|
||||
// in Loop: Header=BB8_41 Depth=2
|
||||
// z <- z*z + c, with c = (%f4, %f5) in this variant.
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r11, %r9;
|
||||
and.b32 %r10, %r10, %r11;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f4, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f5;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_40: // %for_step.i
|
||||
// in Loop: Header=BB8_41 Depth=2
|
||||
// Bump the count while running, clear the running flag at the iteration limit,
// and leave the loop once the gated flag is no longer negative.
setp.ne.s32 %p1, %r10, 0;
|
||||
selp.u32 %r11, 1, 0, %p1;
|
||||
add.s32 %r7, %r7, %r11;
|
||||
setp.lt.s32 %p1, %r7, %r2;
|
||||
selp.b32 %r10, %r10, 0, %p1;
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
setp.gt.s32 %p1, %r11, -1;
|
||||
@%p1 bra BB8_7;
|
||||
bra.uni BB8_41;
|
||||
BB8_6: // in Loop: Header=BB8_3 Depth=1
|
||||
mov.u32 %r7, %r8;
|
||||
BB8_7: // %mandel___vyfvyfvyi.exit
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
// Skip the store when there are no columns.
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_44;
|
||||
// BB#8: // %pl_dolane.i
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
// Store the count for column 0 of this row: output + 4 * (row * columns).
mul.lo.s32 %r8, %r6, %r0;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
st.u32 [%rl1], %r7;
|
||||
BB8_44: // %foreach_reset128
|
||||
// in Loop: Header=BB8_3 Depth=1
|
||||
// Advance to the next row: row index + 1, row byte offset + bytes per row.
add.s32 %r6, %r6, 1;
|
||||
add.s32 %r3, %r3, %r5;
|
||||
setp.eq.s32 %p0, %r6, %r1;
|
||||
@%p0 bra BB8_45;
|
||||
bra.uni BB8_3;
|
||||
BB8_9: // %for_test.preheader
|
||||
// Second copy of the loop nest, reached when param_8 is negative.
// NOTE(review): the instruction sequence from here to BB8_27 appears to mirror
// BB#1..BB8_44 above with renumbered labels; register roles are the same.
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB8_45;
|
||||
// BB#10: // %outer_not_in_extras.preheader.lr.ph
|
||||
// %r4 = initial running flag, %r5 = bytes per row, %r3 = row byte offset, %r6 = row index.
setp.gt.s32 %p0, %r2, 0;
|
||||
mov.u32 %r3, 0;
|
||||
selp.b32 %r4, -1, 0, %p0;
|
||||
shl.b32 %r5, %r0, 2;
|
||||
mov.u32 %r6, %r3;
|
||||
BB8_11: // %outer_not_in_extras.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB8_23 Depth 2
|
||||
// Child Loop BB8_20 Depth 2
|
||||
// Child Loop BB8_19 Depth 2
|
||||
// Child Loop BB8_14 Depth 3
|
||||
// Row loop; rows with no columns go to BB8_28.
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_28;
|
||||
// BB#12: // %foreach_full_body.lr.ph
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
// Compute the row (BB8_13) when the iteration limit is positive, else zero-fill it (BB8_20).
setp.lt.s32 %p0, %r4, 0;
|
||||
mov.u32 %r7, %r0;
|
||||
mov.u32 %r8, %r3;
|
||||
@%p0 bra BB8_13;
|
||||
bra.uni BB8_20;
|
||||
BB8_13: // in Loop: Header=BB8_11 Depth=1
|
||||
// %rl1 = column index, %f4 = imaginary part of c for this row, %r7 = row * columns.
mov.u64 %rl1, 0;
|
||||
cvt.rn.f32.s32 %f4, %r6;
|
||||
fma.rn.f32 %f4, %f2, %f4, %f1;
|
||||
mul.lo.s32 %r7, %r6, %r0;
|
||||
BB8_19: // %for_loop.i281.lr.ph.us
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB8_14 Depth 3
|
||||
// Column loop: %f5 = real part of c; z = (%f7, %f6) starts at c; %r9 = count.
cvt.u32.u64 %r8, %rl1;
|
||||
cvt.rn.f32.s32 %f5, %r8;
|
||||
fma.rn.f32 %f5, %f3, %f5, %f0;
|
||||
mov.u32 %r10, 0;
|
||||
mov.u32 %r12, %r4;
|
||||
mov.u32 %r11, %r10;
|
||||
mov.u32 %r9, %r10;
|
||||
mov.f32 %f7, %f5;
|
||||
mov.f32 %f6, %f4;
|
||||
BB8_14: // %for_loop.i281.us
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// Parent Loop BB8_19 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
// Escape test: re*re + im*im against 4.0f (greater-than-or-unordered).
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p0, %f9, 0f40800000;
|
||||
selp.b32 %r13, %r12, 0, %p0;
|
||||
or.b32 %r11, %r13, %r11;
|
||||
shr.u32 %r13, %r11, 31;
|
||||
shr.u32 %r14, %r12, 31;
|
||||
setp.eq.s32 %p0, %r13, %r14;
|
||||
@%p0 bra BB8_15;
|
||||
bra.uni BB8_16;
|
||||
BB8_15: // in Loop: Header=BB8_14 Depth=3
|
||||
mov.u32 %r12, %r10;
|
||||
bra.uni BB8_17;
|
||||
BB8_16: // %not_all_continued_or_breaked.i295.us
|
||||
// in Loop: Header=BB8_14 Depth=3
|
||||
// z <- z*z + c.
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r13, %r11;
|
||||
and.b32 %r12, %r12, %r13;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f5, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f4;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_17: // %for_step.i264.us
|
||||
// in Loop: Header=BB8_14 Depth=3
|
||||
// Count the step, apply the iteration limit, loop while still running.
setp.ne.s32 %p0, %r12, 0;
|
||||
selp.u32 %r13, 1, 0, %p0;
|
||||
add.s32 %r9, %r9, %r13;
|
||||
setp.lt.s32 %p0, %r9, %r2;
|
||||
selp.b32 %r12, %r12, 0, %p0;
|
||||
setp.lt.s32 %p0, %r12, 0;
|
||||
@%p0 bra BB8_14;
|
||||
// BB#18: // %mandel___vyfvyfvyi.exit296.us
|
||||
// in Loop: Header=BB8_19 Depth=2
|
||||
// Store the count at output + 4 * (column + row * columns), then step the column.
add.s32 %r8, %r8, %r7;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl2, %r8;
|
||||
add.s64 %rl2, %rl2, %rl0;
|
||||
st.u32 [%rl2], %r9;
|
||||
add.s64 %rl1, %rl1, 1;
|
||||
cvt.u32.u64 %r8, %rl1;
|
||||
setp.eq.s32 %p0, %r8, %r0;
|
||||
@%p0 bra BB8_27;
|
||||
bra.uni BB8_19;
|
||||
BB8_20: // %mandel___vyfvyfvyi.exit296
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
// Zero-fill path for this row.
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
mov.u32 %r9, 0;
|
||||
st.u32 [%rl1], %r9;
|
||||
add.s32 %r8, %r8, 4;
|
||||
add.s32 %r7, %r7, -1;
|
||||
setp.eq.s32 %p0, %r7, 0;
|
||||
@%p0 bra BB8_27;
|
||||
bra.uni BB8_20;
|
||||
BB8_28: // %partial_inner_all_outer
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
// %p0 still holds (%r0 < 1) from the only branch that targets BB8_28.
@%p0 bra BB8_27;
|
||||
// BB#29: // %partial_inner_only
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
// Single-point variant for column 0, gated by %p0 = (%r0 > 0).
setp.gt.s32 %p0, %r0, 0;
|
||||
mov.u32 %r8, 0;
|
||||
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
|
||||
cvt.rn.f32.s32 %f5, %r6;
|
||||
fma.rn.f32 %f5, %f2, %f5, %f1;
|
||||
selp.b32 %r7, %r4, 0, %p0;
|
||||
setp.lt.s32 %p1, %r7, 0;
|
||||
mov.u32 %r10, %r4;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.u32 %r7, %r8;
|
||||
mov.f32 %f7, %f4;
|
||||
mov.f32 %f6, %f5;
|
||||
@%p1 bra BB8_23;
|
||||
bra.uni BB8_30;
|
||||
BB8_23: // %for_loop.i332
|
||||
// Parent Loop BB8_11 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p1, %f9, 0f40800000;
|
||||
selp.b32 %r12, %r10, 0, %p1;
|
||||
or.b32 %r9, %r12, %r9;
|
||||
selp.b32 %r12, %r9, 0, %p0;
|
||||
shr.u32 %r12, %r12, 31;
|
||||
shr.u32 %r11, %r11, 31;
|
||||
setp.eq.s32 %p1, %r12, %r11;
|
||||
@%p1 bra BB8_24;
|
||||
bra.uni BB8_21;
|
||||
BB8_24: // in Loop: Header=BB8_23 Depth=2
|
||||
mov.u32 %r10, %r8;
|
||||
bra.uni BB8_22;
|
||||
BB8_21: // %not_all_continued_or_breaked.i346
|
||||
// in Loop: Header=BB8_23 Depth=2
|
||||
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r11, %r9;
|
||||
and.b32 %r10, %r10, %r11;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f4, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f5;
|
||||
mov.f32 %f7, %f8;
|
||||
BB8_22: // %for_step.i313
|
||||
// in Loop: Header=BB8_23 Depth=2
|
||||
setp.ne.s32 %p1, %r10, 0;
|
||||
selp.u32 %r11, 1, 0, %p1;
|
||||
add.s32 %r7, %r7, %r11;
|
||||
setp.lt.s32 %p1, %r7, %r2;
|
||||
selp.b32 %r10, %r10, 0, %p1;
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
setp.gt.s32 %p1, %r11, -1;
|
||||
@%p1 bra BB8_25;
|
||||
bra.uni BB8_23;
|
||||
BB8_30: // in Loop: Header=BB8_11 Depth=1
|
||||
mov.u32 %r7, %r8;
|
||||
BB8_25: // %mandel___vyfvyfvyi.exit347
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB8_27;
|
||||
// BB#26: // %pl_dolane.i452
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
// Store the count for column 0 of this row.
mul.lo.s32 %r8, %r6, %r0;
|
||||
shl.b32 %r8, %r8, 2;
|
||||
cvt.s64.s32 %rl1, %r8;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
st.u32 [%rl1], %r7;
|
||||
BB8_27: // %foreach_reset
|
||||
// in Loop: Header=BB8_11 Depth=1
|
||||
// Advance to the next row.
add.s32 %r6, %r6, 1;
|
||||
add.s32 %r3, %r3, %r5;
|
||||
setp.eq.s32 %p0, %r6, %r1;
|
||||
@%p0 bra BB8_45;
|
||||
bra.uni BB8_11;
|
||||
BB8_45: // %for_exit
|
||||
// Common exit for both copies of the loop nest.
ret;
|
||||
}
|
||||
|
||||
// .globl mandelbrot_ispc
|
||||
// mandelbrot_ispc
// Fills a row-major grid of 32-bit iteration counts, one per (column, row) point.
//   param_0, param_1 : f32 base coordinates of the grid
//   param_2, param_3 : f32 opposite-corner coordinates; only used to derive the step sizes
//   param_4          : s32 number of columns
//   param_5          : s32 number of rows
//   param_6          : s32 iteration limit
//   param_7          : 64-bit base address of the output array
//   NOTE(review): presumably x0, y0, x1, y1, width, height, maxIterations, output[] --
//   confirm against mandelbrot.ispc.
// For each point, c = (param_0 + column*dx, param_1 + row*dy) and z starts at c; the stored
// value counts the z <- z*z + c steps taken before |z|^2 exceeds 4.0, capped at param_6.
.func mandelbrot_ispc(
|
||||
.param .b32 mandelbrot_ispc_param_0,
|
||||
.param .b32 mandelbrot_ispc_param_1,
|
||||
.param .b32 mandelbrot_ispc_param_2,
|
||||
.param .b32 mandelbrot_ispc_param_3,
|
||||
.param .b32 mandelbrot_ispc_param_4,
|
||||
.param .b32 mandelbrot_ispc_param_5,
|
||||
.param .b32 mandelbrot_ispc_param_6,
|
||||
.param .b64 mandelbrot_ispc_param_7
|
||||
) // @mandelbrot_ispc
|
||||
{
|
||||
// Register file declarations (396 of every class).
.reg .pred %p<396>;
|
||||
.reg .s16 %rc<396>;
|
||||
.reg .s16 %rs<396>;
|
||||
.reg .s32 %r<396>;
|
||||
.reg .s64 %rl<396>;
|
||||
.reg .f32 %f<396>;
|
||||
.reg .f64 %fl<396>;
|
||||
|
||||
// BB#0: // %allocas
|
||||
// Only the row count is loaded up front; return immediately when it is below 1.
ld.param.u32 %r0, [mandelbrot_ispc_param_5];
|
||||
setp.lt.s32 %p0, %r0, 1;
|
||||
@%p0 bra BB9_18;
|
||||
// BB#1: // %outer_not_in_extras.preheader.lr.ph
|
||||
// Load the remaining parameters: %f0/%f1 = param_0/1, %f3/%f2 = param_2/3,
// %r1 = columns, %r2 = iteration limit, %rl0 = output base.
ld.param.f32 %f0, [mandelbrot_ispc_param_0];
|
||||
ld.param.f32 %f1, [mandelbrot_ispc_param_1];
|
||||
ld.param.f32 %f3, [mandelbrot_ispc_param_2];
|
||||
ld.param.f32 %f2, [mandelbrot_ispc_param_3];
|
||||
ld.param.u32 %r1, [mandelbrot_ispc_param_4];
|
||||
ld.param.u32 %r2, [mandelbrot_ispc_param_6];
|
||||
ld.param.u64 %rl0, [mandelbrot_ispc_param_7];
|
||||
// Step sizes: %f3 = (param_2 - param_0) / float(columns),
//             %f2 = (param_3 - param_1) / float(rows).
sub.f32 %f3, %f3, %f0;
|
||||
cvt.rn.f32.s32 %f4, %r1;
|
||||
sub.f32 %f2, %f2, %f1;
|
||||
cvt.rn.f32.s32 %f5, %r0;
|
||||
div.rn.f32 %f2, %f2, %f5;
|
||||
div.rn.f32 %f3, %f3, %f4;
|
||||
// %r4 = -1 when the iteration limit is positive, else 0 (initial "running" flag).
// %r3 = row index.
setp.gt.s32 %p0, %r2, 0;
|
||||
mov.u32 %r3, 0;
|
||||
selp.b32 %r4, -1, 0, %p0;
|
||||
BB9_2: // %outer_not_in_extras.preheader
|
||||
// =>This Loop Header: Depth=1
|
||||
// Child Loop BB9_13 Depth 2
|
||||
// Child Loop BB9_4 Depth 2
|
||||
// Child Loop BB9_9 Depth 3
|
||||
// Row loop. A row with no columns (%r1 < 1) is routed to BB9_19.
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB9_19;
|
||||
// BB#3: // %foreach_full_body.lr.ph
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
// %rl1 = column index, %f4 = param_1 + row * %f2 (imaginary part of c),
// %r5 = row * columns (element index of the row start).
mov.u64 %rl1, 0;
|
||||
cvt.rn.f32.s32 %f4, %r3;
|
||||
fma.rn.f32 %f4, %f2, %f4, %f1;
|
||||
mul.lo.s32 %r5, %r3, %r1;
|
||||
BB9_4: // %foreach_full_body
|
||||
// Parent Loop BB9_2 Depth=1
|
||||
// => This Loop Header: Depth=2
|
||||
// Child Loop BB9_9 Depth 3
|
||||
// Column loop. %f5 = param_0 + column * %f3 (real part of c).
// Per-point state: %f7/%f6 = re/im of z (initialised to c), %r7 = count,
// %r10 = running flag (starts as %r4), %r9 = accumulated escape flag, %r8 = constant 0.
// When the iteration limit is not positive (%r4 >= 0) the loop is skipped via BB9_5.
setp.lt.s32 %p0, %r4, 0;
|
||||
cvt.u32.u64 %r6, %rl1;
|
||||
cvt.rn.f32.s32 %f5, %r6;
|
||||
fma.rn.f32 %f5, %f3, %f5, %f0;
|
||||
mov.u32 %r8, 0;
|
||||
mov.u32 %r10, %r4;
|
||||
mov.u32 %r9, %r8;
|
||||
mov.u32 %r7, %r8;
|
||||
mov.f32 %f7, %f5;
|
||||
mov.f32 %f6, %f4;
|
||||
@%p0 bra BB9_9;
|
||||
bra.uni BB9_5;
|
||||
BB9_9: // %for_loop.i281
|
||||
// Parent Loop BB9_2 Depth=1
|
||||
// Parent Loop BB9_4 Depth=2
|
||||
// => This Inner Loop Header: Depth=3
|
||||
// Escape test: %f9 = re*re + im*im compared against 0f40800000 (4.0f).
// setp.gtu is "greater than or unordered", so a NaN magnitude also counts as escaped.
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p0, %f9, 0f40800000;
|
||||
selp.b32 %r11, %r10, 0, %p0;
|
||||
or.b32 %r9, %r11, %r9;
|
||||
// Compare the sign bits of the escape flag and the running flag: when they match,
// %r10 is cleared in BB9_10; otherwise z is advanced in BB9_7.
shr.u32 %r11, %r9, 31;
|
||||
shr.u32 %r12, %r10, 31;
|
||||
setp.eq.s32 %p0, %r11, %r12;
|
||||
@%p0 bra BB9_10;
|
||||
bra.uni BB9_7;
|
||||
BB9_10: // in Loop: Header=BB9_9 Depth=3
|
||||
mov.u32 %r10, %r8;
|
||||
bra.uni BB9_8;
|
||||
BB9_7: // %not_all_continued_or_breaked.i295
|
||||
// in Loop: Header=BB9_9 Depth=3
|
||||
// z <- z*z + c:  re' = (re*re - im*im) + c_re,  im' = im * (2*re) + c_im.
// The running flag is masked with the complement of the escape flag.
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r11, %r9;
|
||||
and.b32 %r10, %r10, %r11;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f5, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f4;
|
||||
mov.f32 %f7, %f8;
|
||||
BB9_8: // %for_step.i264
|
||||
// in Loop: Header=BB9_9 Depth=3
|
||||
// count += (running != 0); running is cleared once count reaches the limit (%r2);
// the loop exits to BB9_6 as soon as the running flag is no longer negative.
setp.ne.s32 %p0, %r10, 0;
|
||||
selp.u32 %r11, 1, 0, %p0;
|
||||
add.s32 %r7, %r7, %r11;
|
||||
setp.lt.s32 %p0, %r7, %r2;
|
||||
selp.b32 %r10, %r10, 0, %p0;
|
||||
setp.gt.s32 %p0, %r10, -1;
|
||||
@%p0 bra BB9_6;
|
||||
bra.uni BB9_9;
|
||||
BB9_5: // in Loop: Header=BB9_4 Depth=2
|
||||
// Loop skipped: the stored count is 0.
mov.u32 %r7, %r8;
|
||||
BB9_6: // %mandel___vyfvyfvyi.exit296
|
||||
// in Loop: Header=BB9_4 Depth=2
|
||||
// Store the count at output + 4 * (column + row * columns); the byte offset is
// computed in 32 bits and then sign-extended to 64 bits.
add.s32 %r6, %r6, %r5;
|
||||
shl.b32 %r6, %r6, 2;
|
||||
cvt.s64.s32 %rl2, %r6;
|
||||
add.s64 %rl2, %rl2, %rl0;
|
||||
st.u32 [%rl2], %r7;
|
||||
// Next column; the row is finished when the column index equals %r1.
add.s64 %rl1, %rl1, 1;
|
||||
cvt.u32.u64 %r6, %rl1;
|
||||
setp.eq.s32 %p0, %r6, %r1;
|
||||
@%p0 bra BB9_17;
|
||||
bra.uni BB9_4;
|
||||
BB9_19: // %partial_inner_all_outer
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
// %p0 still holds (%r1 < 1) from the only branch that targets BB9_19, so this
// branch is taken whenever the label is reached that way.
@%p0 bra BB9_17;
|
||||
// BB#20: // %partial_inner_only
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
// Single-point variant of the computation above for column 0 (the real part is
// param_0 + %f3 * 0.0), with the flags additionally gated by %p0 = (%r1 > 0).
setp.gt.s32 %p0, %r1, 0;
|
||||
mov.u32 %r6, 0;
|
||||
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
|
||||
cvt.rn.f32.s32 %f5, %r3;
|
||||
fma.rn.f32 %f5, %f2, %f5, %f1;
|
||||
selp.b32 %r5, %r4, 0, %p0;
|
||||
setp.lt.s32 %p1, %r5, 0;
|
||||
mov.u32 %r8, %r4;
|
||||
mov.u32 %r7, %r6;
|
||||
mov.u32 %r5, %r6;
|
||||
mov.f32 %f7, %f4;
|
||||
mov.f32 %f6, %f5;
|
||||
@%p1 bra BB9_13;
|
||||
bra.uni BB9_21;
|
||||
BB9_13: // %for_loop.i332
|
||||
// Parent Loop BB9_2 Depth=1
|
||||
// => This Inner Loop Header: Depth=2
|
||||
// Same escape test as BB9_9; here %f7/%f6 = re/im, %r8 = running flag,
// %r7 = escape flag, %r5 = count, and %p1 carries the per-step predicates.
selp.b32 %r9, %r8, 0, %p0;
|
||||
mul.f32 %f8, %f7, %f7;
|
||||
fma.rn.f32 %f9, %f6, %f6, %f8;
|
||||
setp.gtu.f32 %p1, %f9, 0f40800000;
|
||||
selp.b32 %r10, %r8, 0, %p1;
|
||||
or.b32 %r7, %r10, %r7;
|
||||
selp.b32 %r10, %r7, 0, %p0;
|
||||
shr.u32 %r10, %r10, 31;
|
||||
shr.u32 %r9, %r9, 31;
|
||||
setp.eq.s32 %p1, %r10, %r9;
|
||||
@%p1 bra BB9_14;
|
||||
bra.uni BB9_11;
|
||||
BB9_14: // in Loop: Header=BB9_13 Depth=2
|
||||
mov.u32 %r8, %r6;
|
||||
bra.uni BB9_12;
|
||||
BB9_11: // %not_all_continued_or_breaked.i346
|
||||
// in Loop: Header=BB9_13 Depth=2
|
||||
// z <- z*z + c, with c = (%f4, %f5) in this variant.
mul.f32 %f9, %f6, %f6;
|
||||
not.b32 %r9, %r7;
|
||||
and.b32 %r8, %r8, %r9;
|
||||
sub.f32 %f8, %f8, %f9;
|
||||
add.f32 %f8, %f4, %f8;
|
||||
add.f32 %f7, %f7, %f7;
|
||||
fma.rn.f32 %f6, %f6, %f7, %f5;
|
||||
mov.f32 %f7, %f8;
|
||||
BB9_12: // %for_step.i313
|
||||
// in Loop: Header=BB9_13 Depth=2
|
||||
// Bump the count while running, clear the running flag at the iteration limit,
// and leave the loop once the gated flag is no longer negative.
setp.ne.s32 %p1, %r8, 0;
|
||||
selp.u32 %r9, 1, 0, %p1;
|
||||
add.s32 %r5, %r5, %r9;
|
||||
setp.lt.s32 %p1, %r5, %r2;
|
||||
selp.b32 %r8, %r8, 0, %p1;
|
||||
selp.b32 %r9, %r8, 0, %p0;
|
||||
setp.gt.s32 %p1, %r9, -1;
|
||||
@%p1 bra BB9_15;
|
||||
bra.uni BB9_13;
|
||||
BB9_21: // in Loop: Header=BB9_2 Depth=1
|
||||
mov.u32 %r5, %r6;
|
||||
BB9_15: // %mandel___vyfvyfvyi.exit347
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
// Skip the store when there are no columns.
setp.lt.s32 %p0, %r1, 1;
|
||||
@%p0 bra BB9_17;
|
||||
// BB#16: // %pl_dolane.i
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
// Store the count for column 0 of this row: output + 4 * (row * columns).
mul.lo.s32 %r6, %r3, %r1;
|
||||
shl.b32 %r6, %r6, 2;
|
||||
cvt.s64.s32 %rl1, %r6;
|
||||
add.s64 %rl1, %rl1, %rl0;
|
||||
st.u32 [%rl1], %r5;
|
||||
BB9_17: // %foreach_reset
|
||||
// in Loop: Header=BB9_2 Depth=1
|
||||
// Advance to the next row; finish when the row index equals the row count.
add.s32 %r3, %r3, 1;
|
||||
setp.eq.s32 %p0, %r3, %r0;
|
||||
@%p0 bra BB9_18;
|
||||
bra.uni BB9_2;
|
||||
BB9_18: // %for_exit
|
||||
ret;
|
||||
}
|
||||
|
||||
BIN
examples_cuda/mandelbrot/out.s
Normal file
BIN
examples_cuda/mandelbrot/out.s
Normal file
Binary file not shown.
BIN
examples_cuda/mandelbrot/out1.o
Normal file
BIN
examples_cuda/mandelbrot/out1.o
Normal file
Binary file not shown.
Reference in New Issue
Block a user