added cuda examples

This commit is contained in:
Evghenii
2013-11-04 11:44:49 +01:00
parent cb6614da42
commit cb7cbec0d5
226 changed files with 284385 additions and 0 deletions

3
examples_cuda/mandelbrot/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
mandelbrot
*.ppm
objs

View File

@@ -0,0 +1,8 @@
# Build settings for the mandelbrot example.
# NOTE(review): these variables are presumably consumed by ../common.mk
# (included at the bottom, not visible here) - confirm their exact meaning there.
EXAMPLE=mandelbrot
# C++ sources: the benchmark driver and the serial reference implementation.
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
# ISPC source holding the mandelbrot_ispc kernel.
ISPC_SRC=mandelbrot.ispc
# ISPC target lists, one for IA (x86) builds and one for ARM builds.
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
ISPC_ARM_TARGETS=neon
include ../common.mk

BIN
examples_cuda/mandelbrot/avx.out Executable file

Binary file not shown.

BIN
examples_cuda/mandelbrot/avx1.out Executable file

Binary file not shown.

View File

@@ -0,0 +1,118 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#define NOMINMAX
#pragma warning (disable: 4244)
#pragma warning (disable: 4305)
#endif
#include <stdio.h>
#include <algorithm>
#include "../timing.h"
#include "mandelbrot_ispc.h"
using namespace ispc;
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,
int width, int height, int maxIterations,
int output[]);
/* Write a PPM image file with the image of the Mandelbrot set */
/*
 * Write the iteration-count buffer as a binary PPM ("P6") image.
 *
 *   buf    - width*height iteration counts, one per pixel, in the order
 *            they are to be written
 *   width  - image width in pixels
 *   height - image height in pixels
 *   fn     - path of the file to create (opened with mode "wb")
 *
 * Each pixel is rendered as one of two greys depending on the parity of its
 * iteration count.  If the file cannot be opened, an error is reported on
 * stderr and nothing is written.
 */
static void
writePPM(int *buf, int width, int height, const char *fn) {
    FILE *fp = fopen(fn, "wb");
    if (fp == NULL) {
        // Bail out instead of handing a NULL stream to fprintf/fputc/fclose.
        fprintf(stderr, "Unable to open %s for writing\n", fn);
        return;
    }

    // PPM header: magic number, dimensions, maximum channel value.
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");

    for (int i = 0; i < width*height; ++i) {
        // Map the iteration count to colors by just alternating between
        // two greys.  unsigned char holds 240 without relying on the
        // signedness of plain char.
        unsigned char c = (buf[i] & 0x1) ? 240 : 20;
        // Same value for the R, G and B channels.
        for (int j = 0; j < 3; ++j)
            fputc(c, fp);
    }

    fclose(fp);
    printf("Wrote image file %s\n", fn);
}
/*
 * Benchmark driver: renders the same Mandelbrot region with the ispc
 * implementation and with the serial implementation, reports the minimum
 * elapsed time (in millions of cycles) over three runs of each, writes both
 * results as PPM images and prints the resulting speedup.
 */
int main() {
    unsigned int width = 768;
    unsigned int height = 512;
    // Region of the complex plane to render.
    float x0 = -2;
    float x1 = 1;
    float y0 = -1;
    float y1 = 1;

    int maxIterations = 256;
    // One iteration count per pixel; shared by both implementations.
    int *buf = new int[width*height];

    //
    // Compute the image using the ispc implementation; report the minimum
    // time of three runs.
    //
    double minISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
        minISPC = std::min(minISPC, dt);
    }

    printf("[mandelbrot ispc]:\t\t[%.3f] million cycles\n", minISPC);
    writePPM(buf, width, height, "mandelbrot-ispc.ppm");

    // Clear out the buffer so the serial image cannot reuse ispc results.
    std::fill(buf, buf + width * height, 0);

    //
    // And run the serial implementation 3 times, again reporting the
    // minimum time.
    //
    double minSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        reset_and_start_timer();
        mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf);
        double dt = get_elapsed_mcycles();
        minSerial = std::min(minSerial, dt);
    }

    printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial);
    writePPM(buf, width, height, "mandelbrot-serial.ppm");

    printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);

    // Release the pixel buffer (it was previously never freed).
    delete[] buf;

    return 0;
}

View File

@@ -0,0 +1,78 @@
/*
Copyright (c) 2010-2012, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Iterates z = z*z + c starting from z = c = (c_re, c_im) and returns the
// number of iterations completed before z_re^2 + z_im^2 exceeded 4, or
// `count` if that never happened within `count` iterations.
static inline int mandel(float c_re, float c_im, int count) {
float z_re = c_re, z_im = c_im;
int i;
for (i = 0; i < count; ++i) {
// Stop iterating once the squared magnitude of z exceeds 4.
if (z_re * z_re + z_im * z_im > 4.)
break;
// Real and imaginary parts of z*z.
float new_re = z_re*z_re - z_im*z_im;
float new_im = 2.f * z_re * z_im;
// NOTE(review): the update of z is wrapped in `unmasked`, so presumably it is
// applied to every program instance regardless of the current execution mask;
// the returned count `i` is updated outside this block - confirm the intent.
unmasked {
z_re = c_re + new_re;
z_im = c_im + new_im;
}
}
return i;
}
// Exported entry point: for every pixel (i, j) of a width x height grid,
// stores mandel(x, y, maxIterations) in output[j * width + i], where
// x = x0 + i * dx and y = y0 + j * dy sample the rectangle spanned by
// (x0, y0) and (x1, y1).
export void mandelbrot_ispc(uniform float x0, uniform float y0,
uniform float x1, uniform float y1,
uniform int width, uniform int height,
uniform int maxIterations,
uniform int output[])
{
// Spacing between neighbouring sample points along each axis.
// NOTE(review): dx and dy are declared without `uniform` although they are
// computed only from uniform values - confirm whether this is intentional.
float dx = (x1 - x0) / width;
float dy = (y1 - y0) / height;
// Rows are visited one at a time by a uniform loop over j.
for (uniform int j = 0; j < height; j++) {
// The pixels of a row are covered by a foreach over i in [0, width),
// so the loop body is written for a single value of i and no manual
// stepping of i is needed here.
foreach (i = 0 ... width) {
// Figure out the position on the complex plane to compute the
// number of iterations at. x depends on the foreach index i,
// while y depends only on the current row j.
float x = x0 + i * dx;
float y = y0 + j * dy;
// Row-major position of pixel (i, j) in the output array.
int index = j * width + i;
output[index] = mandel(x, y, maxIterations);
}
}
}

View File

@@ -0,0 +1,175 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>mandelbrot</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(TargetDir)</AdditionalIncludeDirectories>
<FloatingPointModel>Fast</FloatingPointModel>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="mandelbrot.cpp" />
<ClCompile Include="mandelbrot_serial.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="mandelbrot.ispc">
<!-- Custom build step that runs the ispc compiler on mandelbrot.ispc. Every
     configuration writes an object file and a generated header
     (mandelbrot_ispc.h) into $(TargetDir) for the sse2, sse4-x2 and avx-x2
     targets. -->
<FileType>Document</FileType>
<!-- Debug commands: the Win32 variant additionally passes the arch=x86 option;
     the x64 variant passes no arch option. -->
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command>
<!-- Files declared as produced by the Debug commands: the main object, one
     object per listed suffix (sse2, sse4, avx) and the generated header. -->
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<!-- Release commands: same command lines as the corresponding Debug ones. -->
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2
</Command>
<!-- Files declared as produced by the Release commands (same list as Debug). -->
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View File

@@ -0,0 +1,68 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * Escape-time count for the point c = (c_re, c_im): starting from z = c,
 * repeatedly replace z with z*z + c and return how many replacements were
 * made before the squared magnitude of z exceeded 4, capped at `count`.
 */
static int mandel(float c_re, float c_im, int count) {
    float zr = c_re;
    float zi = c_im;
    int steps = 0;

    // Keep going while the budget lasts and z has not escaped yet.
    while (steps < count && !(zr * zr + zi * zi > 4.f)) {
        const float next_re = zr*zr - zi*zi;
        const float next_im = 2.f * zr * zi;
        zr = c_re + next_re;
        zi = c_im + next_im;
        ++steps;
    }

    return steps;
}
/*
 * Serial reference renderer.  Samples the rectangle spanned by (x0, y0) and
 * (x1, y1) on a width x height grid and stores the mandel() iteration count
 * of every sample in output[], row by row (index = row * width + column).
 */
void mandelbrot_serial(float x0, float y0, float x1, float y1,
                       int width, int height, int maxIterations,
                       int output[])
{
    // Distance between neighbouring samples along each axis.
    const float dx = (x1 - x0) / width;
    const float dy = (y1 - y0) / height;

    for (int row = 0; row < height; ++row) {
        // The imaginary coordinate is the same for every pixel of a row.
        const float im = y0 + row * dy;

        for (int col = 0; col < width; ++col) {
            const float re = x0 + col * dx;
            output[row * width + col] = mandel(re, im, maxIterations);
        }
    }
}

Binary file not shown.

View File

@@ -0,0 +1,843 @@
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_35, texmode_independent
.address_size 64
// .globl __vselect_i8
// @__vselect_i8
// Selects between two 8-bit values based on a 32-bit selector:
// returns param_0 when param_2 is zero, otherwise param_1.
.func (.param .align 1 .b8 func_retval0[1]) __vselect_i8(
.param .align 1 .b8 __vselect_i8_param_0[1],
.param .align 1 .b8 __vselect_i8_param_1[1],
.param .align 4 .b8 __vselect_i8_param_2[4]
)
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// %p0 = (selector == 0)
ld.param.u32 %r0, [__vselect_i8_param_2];
setp.eq.s32 %p0, %r0, 0;
ld.param.u8 %rc0, [__vselect_i8_param_0];
ld.param.u8 %rc1, [__vselect_i8_param_1];
// result = %p0 ? param_0 : param_1
selp.b16 %rc0, %rc0, %rc1, %p0;
st.param.b8 [func_retval0+0], %rc0;
ret;
}
// .globl __vselect_i16
// Selects between two 16-bit values based on a 32-bit selector:
// returns param_0 when param_2 is zero, otherwise param_1.
.func (.param .align 2 .b8 func_retval0[2]) __vselect_i16(
.param .align 2 .b8 __vselect_i16_param_0[2],
.param .align 2 .b8 __vselect_i16_param_1[2],
.param .align 4 .b8 __vselect_i16_param_2[4]
) // @__vselect_i16
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// %p0 = (selector == 0)
ld.param.u32 %r0, [__vselect_i16_param_2];
setp.eq.s32 %p0, %r0, 0;
ld.param.u16 %rs0, [__vselect_i16_param_0];
ld.param.u16 %rs1, [__vselect_i16_param_1];
// result = %p0 ? param_0 : param_1
selp.b16 %rs0, %rs0, %rs1, %p0;
st.param.b16 [func_retval0+0], %rs0;
ret;
}
// .globl __vselect_i64
// Selects between two 64-bit values based on a 32-bit selector:
// returns param_0 when param_2 is zero, otherwise param_1.
.func (.param .align 8 .b8 func_retval0[8]) __vselect_i64(
.param .align 8 .b8 __vselect_i64_param_0[8],
.param .align 8 .b8 __vselect_i64_param_1[8],
.param .align 4 .b8 __vselect_i64_param_2[4]
) // @__vselect_i64
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// %p0 = (selector == 0)
ld.param.u32 %r0, [__vselect_i64_param_2];
setp.eq.s32 %p0, %r0, 0;
ld.param.u64 %rl0, [__vselect_i64_param_0];
ld.param.u64 %rl1, [__vselect_i64_param_1];
// result = %p0 ? param_0 : param_1
selp.b64 %rl0, %rl0, %rl1, %p0;
st.param.b64 [func_retval0+0], %rl0;
ret;
}
// .globl __aos_to_soa4_float1
// Takes four f32 values (param_0..param_3) and four 64-bit addresses
// (param_4..param_7) and stores value k to address k, for k = 0..3.
.func __aos_to_soa4_float1(
.param .align 4 .b8 __aos_to_soa4_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_2[4],
.param .align 4 .b8 __aos_to_soa4_float1_param_3[4],
.param .b64 __aos_to_soa4_float1_param_4,
.param .b64 __aos_to_soa4_float1_param_5,
.param .b64 __aos_to_soa4_float1_param_6,
.param .b64 __aos_to_soa4_float1_param_7
) // @__aos_to_soa4_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// Destination addresses.
ld.param.u64 %rl0, [__aos_to_soa4_float1_param_4];
ld.param.u64 %rl1, [__aos_to_soa4_float1_param_5];
ld.param.u64 %rl2, [__aos_to_soa4_float1_param_6];
ld.param.u64 %rl3, [__aos_to_soa4_float1_param_7];
// Input values.
ld.param.f32 %f0, [__aos_to_soa4_float1_param_0];
ld.param.f32 %f1, [__aos_to_soa4_float1_param_1];
ld.param.f32 %f2, [__aos_to_soa4_float1_param_2];
ld.param.f32 %f3, [__aos_to_soa4_float1_param_3];
// Each value goes to its matching address unchanged.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __soa_to_aos4_float1
// Takes four f32 values (param_0..param_3) and four 64-bit addresses
// (param_4..param_7) and stores value k to address k, for k = 0..3.
// The instruction sequence is the same as in __aos_to_soa4_float1.
.func __soa_to_aos4_float1(
.param .align 4 .b8 __soa_to_aos4_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_2[4],
.param .align 4 .b8 __soa_to_aos4_float1_param_3[4],
.param .b64 __soa_to_aos4_float1_param_4,
.param .b64 __soa_to_aos4_float1_param_5,
.param .b64 __soa_to_aos4_float1_param_6,
.param .b64 __soa_to_aos4_float1_param_7
) // @__soa_to_aos4_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// Destination addresses.
ld.param.u64 %rl0, [__soa_to_aos4_float1_param_4];
ld.param.u64 %rl1, [__soa_to_aos4_float1_param_5];
ld.param.u64 %rl2, [__soa_to_aos4_float1_param_6];
ld.param.u64 %rl3, [__soa_to_aos4_float1_param_7];
// Input values.
ld.param.f32 %f0, [__soa_to_aos4_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos4_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos4_float1_param_2];
ld.param.f32 %f3, [__soa_to_aos4_float1_param_3];
// Each value goes to its matching address unchanged.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
st.f32 [%rl3], %f3;
ret;
}
// .globl __aos_to_soa3_float1
// Takes three f32 values (param_0..param_2) and three 64-bit addresses
// (param_3..param_5) and stores value k to address k, for k = 0..2.
.func __aos_to_soa3_float1(
.param .align 4 .b8 __aos_to_soa3_float1_param_0[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_1[4],
.param .align 4 .b8 __aos_to_soa3_float1_param_2[4],
.param .b64 __aos_to_soa3_float1_param_3,
.param .b64 __aos_to_soa3_float1_param_4,
.param .b64 __aos_to_soa3_float1_param_5
) // @__aos_to_soa3_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// Destination addresses.
ld.param.u64 %rl0, [__aos_to_soa3_float1_param_3];
ld.param.u64 %rl1, [__aos_to_soa3_float1_param_4];
ld.param.u64 %rl2, [__aos_to_soa3_float1_param_5];
// Input values.
ld.param.f32 %f0, [__aos_to_soa3_float1_param_0];
ld.param.f32 %f1, [__aos_to_soa3_float1_param_1];
ld.param.f32 %f2, [__aos_to_soa3_float1_param_2];
// Each value goes to its matching address unchanged.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __soa_to_aos3_float1
// Takes three f32 values (param_0..param_2) and three 64-bit addresses
// (param_3..param_5) and stores value k to address k, for k = 0..2.
// The instruction sequence is the same as in __aos_to_soa3_float1.
.func __soa_to_aos3_float1(
.param .align 4 .b8 __soa_to_aos3_float1_param_0[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_1[4],
.param .align 4 .b8 __soa_to_aos3_float1_param_2[4],
.param .b64 __soa_to_aos3_float1_param_3,
.param .b64 __soa_to_aos3_float1_param_4,
.param .b64 __soa_to_aos3_float1_param_5
) // @__soa_to_aos3_float1
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
// Destination addresses.
ld.param.u64 %rl0, [__soa_to_aos3_float1_param_3];
ld.param.u64 %rl1, [__soa_to_aos3_float1_param_4];
ld.param.u64 %rl2, [__soa_to_aos3_float1_param_5];
// Input values.
ld.param.f32 %f0, [__soa_to_aos3_float1_param_0];
ld.param.f32 %f1, [__soa_to_aos3_float1_param_1];
ld.param.f32 %f2, [__soa_to_aos3_float1_param_2];
// Each value goes to its matching address unchanged.
st.f32 [%rl0], %f0;
st.f32 [%rl1], %f1;
st.f32 [%rl2], %f2;
ret;
}
// .globl __rsqrt_varying_double
// Returns the reciprocal square root of the f64 argument, computed with the
// approximate rsqrt.approx.f64 instruction.
.func (.param .align 8 .b8 func_retval0[8]) __rsqrt_varying_double(
.param .align 8 .b8 __rsqrt_varying_double_param_0[8]
) // @__rsqrt_varying_double
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0:
ld.param.f64 %fl0, [__rsqrt_varying_double_param_0];
// Approximate 1/sqrt(x), computed in place.
rsqrt.approx.f64 %fl0, %fl0;
st.param.f64 [func_retval0+0], %fl0;
ret;
}
// .globl mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
.func mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_(
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5,
.param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6,
.param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7,
.param .align 4 .b8 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8[4]
) // @mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
ld.param.f32 %f0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0];
ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1];
ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2];
ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3];
ld.param.u32 %r0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4];
ld.param.u32 %r1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5];
ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6];
ld.param.u64 %rl0, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7];
ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8];
setp.lt.s32 %p0, %r3, 0;
sub.f32 %f3, %f3, %f0;
cvt.rn.f32.s32 %f4, %r0;
sub.f32 %f2, %f2, %f1;
cvt.rn.f32.s32 %f5, %r1;
div.rn.f32 %f2, %f2, %f5;
div.rn.f32 %f3, %f3, %f4;
@%p0 bra BB8_9;
// BB#1: // %for_test110.preheader
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB8_45;
// BB#2: // %outer_not_in_extras140.preheader.lr.ph
setp.gt.s32 %p0, %r2, 0;
mov.u32 %r3, 0;
selp.b32 %r4, -1, 0, %p0;
shl.b32 %r5, %r0, 2;
mov.u32 %r6, %r3;
BB8_3: // %outer_not_in_extras140.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_41 Depth 2
// Child Loop BB8_43 Depth 2
// Child Loop BB8_38 Depth 2
// Child Loop BB8_33 Depth 3
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_4;
// BB#31: // %foreach_full_body120.lr.ph
// in Loop: Header=BB8_3 Depth=1
setp.lt.s32 %p0, %r4, 0;
mov.u32 %r7, %r0;
mov.u32 %r8, %r3;
@%p0 bra BB8_32;
bra.uni BB8_43;
BB8_32: // in Loop: Header=BB8_3 Depth=1
mov.u64 %rl1, 0;
cvt.rn.f32.s32 %f4, %r6;
fma.rn.f32 %f4, %f2, %f4, %f1;
mul.lo.s32 %r7, %r6, %r0;
BB8_38: // %for_loop.i380.lr.ph.us
// Parent Loop BB8_3 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_33 Depth 3
cvt.u32.u64 %r8, %rl1;
cvt.rn.f32.s32 %f5, %r8;
fma.rn.f32 %f5, %f3, %f5, %f0;
mov.u32 %r10, 0;
mov.u32 %r12, %r4;
mov.u32 %r11, %r10;
mov.u32 %r9, %r10;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_33: // %for_loop.i380.us
// Parent Loop BB8_3 Depth=1
// Parent Loop BB8_38 Depth=2
// => This Inner Loop Header: Depth=3
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r13, %r12, 0, %p0;
or.b32 %r11, %r13, %r11;
shr.u32 %r13, %r11, 31;
shr.u32 %r14, %r12, 31;
setp.eq.s32 %p0, %r13, %r14;
@%p0 bra BB8_34;
bra.uni BB8_35;
BB8_34: // in Loop: Header=BB8_33 Depth=3
mov.u32 %r12, %r10;
bra.uni BB8_36;
BB8_35: // %not_all_continued_or_breaked.i394.us
// in Loop: Header=BB8_33 Depth=3
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r11;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_36: // %for_step.i363.us
// in Loop: Header=BB8_33 Depth=3
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r9, %r9, %r13;
setp.lt.s32 %p0, %r9, %r2;
selp.b32 %r12, %r12, 0, %p0;
setp.lt.s32 %p0, %r12, 0;
@%p0 bra BB8_33;
// BB#37: // %mandel___vyfvyfvyi.exit395.us
// in Loop: Header=BB8_38 Depth=2
add.s32 %r8, %r8, %r7;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl2, %r8;
add.s64 %rl2, %rl2, %rl0;
st.u32 [%rl2], %r9;
add.s64 %rl1, %rl1, 1;
cvt.u32.u64 %r8, %rl1;
setp.eq.s32 %p0, %r8, %r0;
@%p0 bra BB8_44;
bra.uni BB8_38;
BB8_43: // %mandel___vyfvyfvyi.exit395
// Parent Loop BB8_3 Depth=1
// => This Inner Loop Header: Depth=2
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
add.s32 %r8, %r8, 4;
add.s32 %r7, %r7, -1;
setp.eq.s32 %p0, %r7, 0;
@%p0 bra BB8_44;
bra.uni BB8_43;
BB8_4: // %partial_inner_all_outer156
// in Loop: Header=BB8_3 Depth=1
@%p0 bra BB8_44;
// BB#5: // %partial_inner_only197
// in Loop: Header=BB8_3 Depth=1
setp.gt.s32 %p0, %r0, 0;
mov.u32 %r8, 0;
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
cvt.rn.f32.s32 %f5, %r6;
fma.rn.f32 %f5, %f2, %f5, %f1;
selp.b32 %r7, %r4, 0, %p0;
setp.lt.s32 %p1, %r7, 0;
mov.u32 %r10, %r4;
mov.u32 %r9, %r8;
mov.u32 %r7, %r8;
mov.f32 %f7, %f4;
mov.f32 %f6, %f5;
@%p1 bra BB8_41;
bra.uni BB8_6;
BB8_41: // %for_loop.i
// Parent Loop BB8_3 Depth=1
// => This Inner Loop Header: Depth=2
selp.b32 %r11, %r10, 0, %p0;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p1, %f9, 0f40800000;
selp.b32 %r12, %r10, 0, %p1;
or.b32 %r9, %r12, %r9;
selp.b32 %r12, %r9, 0, %p0;
shr.u32 %r12, %r12, 31;
shr.u32 %r11, %r11, 31;
setp.eq.s32 %p1, %r12, %r11;
@%p1 bra BB8_42;
bra.uni BB8_39;
BB8_42: // in Loop: Header=BB8_41 Depth=2
mov.u32 %r10, %r8;
bra.uni BB8_40;
BB8_39: // %not_all_continued_or_breaked.i
// in Loop: Header=BB8_41 Depth=2
mul.f32 %f9, %f6, %f6;
not.b32 %r11, %r9;
and.b32 %r10, %r10, %r11;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f4, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f5;
mov.f32 %f7, %f8;
BB8_40: // %for_step.i
// in Loop: Header=BB8_41 Depth=2
setp.ne.s32 %p1, %r10, 0;
selp.u32 %r11, 1, 0, %p1;
add.s32 %r7, %r7, %r11;
setp.lt.s32 %p1, %r7, %r2;
selp.b32 %r10, %r10, 0, %p1;
selp.b32 %r11, %r10, 0, %p0;
setp.gt.s32 %p1, %r11, -1;
@%p1 bra BB8_7;
bra.uni BB8_41;
BB8_6: // in Loop: Header=BB8_3 Depth=1
mov.u32 %r7, %r8;
BB8_7: // %mandel___vyfvyfvyi.exit
// in Loop: Header=BB8_3 Depth=1
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_44;
// BB#8: // %pl_dolane.i
// in Loop: Header=BB8_3 Depth=1
mul.lo.s32 %r8, %r6, %r0;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r7;
BB8_44: // %foreach_reset128
// in Loop: Header=BB8_3 Depth=1
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, %r5;
setp.eq.s32 %p0, %r6, %r1;
@%p0 bra BB8_45;
bra.uni BB8_3;
BB8_9: // %for_test.preheader
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB8_45;
// BB#10: // %outer_not_in_extras.preheader.lr.ph
setp.gt.s32 %p0, %r2, 0;
mov.u32 %r3, 0;
selp.b32 %r4, -1, 0, %p0;
shl.b32 %r5, %r0, 2;
mov.u32 %r6, %r3;
BB8_11: // %outer_not_in_extras.preheader
// =>This Loop Header: Depth=1
// Child Loop BB8_23 Depth 2
// Child Loop BB8_20 Depth 2
// Child Loop BB8_19 Depth 2
// Child Loop BB8_14 Depth 3
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_28;
// BB#12: // %foreach_full_body.lr.ph
// in Loop: Header=BB8_11 Depth=1
setp.lt.s32 %p0, %r4, 0;
mov.u32 %r7, %r0;
mov.u32 %r8, %r3;
@%p0 bra BB8_13;
bra.uni BB8_20;
BB8_13: // in Loop: Header=BB8_11 Depth=1
mov.u64 %rl1, 0;
cvt.rn.f32.s32 %f4, %r6;
fma.rn.f32 %f4, %f2, %f4, %f1;
mul.lo.s32 %r7, %r6, %r0;
BB8_19: // %for_loop.i281.lr.ph.us
// Parent Loop BB8_11 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB8_14 Depth 3
cvt.u32.u64 %r8, %rl1;
cvt.rn.f32.s32 %f5, %r8;
fma.rn.f32 %f5, %f3, %f5, %f0;
mov.u32 %r10, 0;
mov.u32 %r12, %r4;
mov.u32 %r11, %r10;
mov.u32 %r9, %r10;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
BB8_14: // %for_loop.i281.us
// Parent Loop BB8_11 Depth=1
// Parent Loop BB8_19 Depth=2
// => This Inner Loop Header: Depth=3
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r13, %r12, 0, %p0;
or.b32 %r11, %r13, %r11;
shr.u32 %r13, %r11, 31;
shr.u32 %r14, %r12, 31;
setp.eq.s32 %p0, %r13, %r14;
@%p0 bra BB8_15;
bra.uni BB8_16;
BB8_15: // in Loop: Header=BB8_14 Depth=3
mov.u32 %r12, %r10;
bra.uni BB8_17;
BB8_16: // %not_all_continued_or_breaked.i295.us
// in Loop: Header=BB8_14 Depth=3
mul.f32 %f9, %f6, %f6;
not.b32 %r13, %r11;
and.b32 %r12, %r12, %r13;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB8_17: // %for_step.i264.us
// in Loop: Header=BB8_14 Depth=3
setp.ne.s32 %p0, %r12, 0;
selp.u32 %r13, 1, 0, %p0;
add.s32 %r9, %r9, %r13;
setp.lt.s32 %p0, %r9, %r2;
selp.b32 %r12, %r12, 0, %p0;
setp.lt.s32 %p0, %r12, 0;
@%p0 bra BB8_14;
// BB#18: // %mandel___vyfvyfvyi.exit296.us
// in Loop: Header=BB8_19 Depth=2
add.s32 %r8, %r8, %r7;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl2, %r8;
add.s64 %rl2, %rl2, %rl0;
st.u32 [%rl2], %r9;
add.s64 %rl1, %rl1, 1;
cvt.u32.u64 %r8, %rl1;
setp.eq.s32 %p0, %r8, %r0;
@%p0 bra BB8_27;
bra.uni BB8_19;
BB8_20: // %mandel___vyfvyfvyi.exit296
// Parent Loop BB8_11 Depth=1
// => This Inner Loop Header: Depth=2
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
mov.u32 %r9, 0;
st.u32 [%rl1], %r9;
add.s32 %r8, %r8, 4;
add.s32 %r7, %r7, -1;
setp.eq.s32 %p0, %r7, 0;
@%p0 bra BB8_27;
bra.uni BB8_20;
BB8_28: // %partial_inner_all_outer
// in Loop: Header=BB8_11 Depth=1
@%p0 bra BB8_27;
// BB#29: // %partial_inner_only
// in Loop: Header=BB8_11 Depth=1
setp.gt.s32 %p0, %r0, 0;
mov.u32 %r8, 0;
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
cvt.rn.f32.s32 %f5, %r6;
fma.rn.f32 %f5, %f2, %f5, %f1;
selp.b32 %r7, %r4, 0, %p0;
setp.lt.s32 %p1, %r7, 0;
mov.u32 %r10, %r4;
mov.u32 %r9, %r8;
mov.u32 %r7, %r8;
mov.f32 %f7, %f4;
mov.f32 %f6, %f5;
@%p1 bra BB8_23;
bra.uni BB8_30;
BB8_23: // %for_loop.i332
// Parent Loop BB8_11 Depth=1
// => This Inner Loop Header: Depth=2
selp.b32 %r11, %r10, 0, %p0;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p1, %f9, 0f40800000;
selp.b32 %r12, %r10, 0, %p1;
or.b32 %r9, %r12, %r9;
selp.b32 %r12, %r9, 0, %p0;
shr.u32 %r12, %r12, 31;
shr.u32 %r11, %r11, 31;
setp.eq.s32 %p1, %r12, %r11;
@%p1 bra BB8_24;
bra.uni BB8_21;
BB8_24: // in Loop: Header=BB8_23 Depth=2
mov.u32 %r10, %r8;
bra.uni BB8_22;
BB8_21: // %not_all_continued_or_breaked.i346
// in Loop: Header=BB8_23 Depth=2
mul.f32 %f9, %f6, %f6;
not.b32 %r11, %r9;
and.b32 %r10, %r10, %r11;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f4, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f5;
mov.f32 %f7, %f8;
BB8_22: // %for_step.i313
// in Loop: Header=BB8_23 Depth=2
setp.ne.s32 %p1, %r10, 0;
selp.u32 %r11, 1, 0, %p1;
add.s32 %r7, %r7, %r11;
setp.lt.s32 %p1, %r7, %r2;
selp.b32 %r10, %r10, 0, %p1;
selp.b32 %r11, %r10, 0, %p0;
setp.gt.s32 %p1, %r11, -1;
@%p1 bra BB8_25;
bra.uni BB8_23;
BB8_30: // in Loop: Header=BB8_11 Depth=1
mov.u32 %r7, %r8;
BB8_25: // %mandel___vyfvyfvyi.exit347
// in Loop: Header=BB8_11 Depth=1
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB8_27;
// BB#26: // %pl_dolane.i452
// in Loop: Header=BB8_11 Depth=1
mul.lo.s32 %r8, %r6, %r0;
shl.b32 %r8, %r8, 2;
cvt.s64.s32 %rl1, %r8;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r7;
BB8_27: // %foreach_reset
// in Loop: Header=BB8_11 Depth=1
add.s32 %r6, %r6, 1;
add.s32 %r3, %r3, %r5;
setp.eq.s32 %p0, %r6, %r1;
@%p0 bra BB8_45;
bra.uni BB8_11;
BB8_45: // %for_exit
ret;
}
// .globl mandelbrot_ispc
// mandelbrot_ispc: device function (.func, not a kernel .entry) that runs two
// nested counted loops over a 2-D grid and, for each cell, stores a 32-bit
// iteration count of the recurrence z <- z*z + c, stopping once |z|^2 > 4.0
// or the count reaches an iteration cap.
//
// Parameters as they are consumed below:
//   param_0 (f32) -> %f0 : real-axis value for inner index 0
//   param_1 (f32) -> %f1 : imaginary-axis value for outer index 0
//   param_2 (f32) -> %f3 : real-axis end; only used as (param_2 - param_0) / param_4
//   param_3 (f32) -> %f2 : imaginary-axis end; only used as (param_3 - param_1) / param_5
//   param_4 (s32) -> %r1 : inner-loop trip count (cells per row)
//   param_5 (s32) -> %r0 : outer-loop trip count (number of rows)
//   param_6 (s32) -> %r2 : iteration cap
//   param_7 (b64) -> %rl0: base address of the 32-bit output array
// NOTE(review): presumably these are x0, y0, x1, y1, width, height,
// maxIterations, output[] of the ispc mandelbrot example - confirm against
// mandelbrot.ispc.
//
// The function returns nothing; its only visible side effect is the st.u32
// stores into the array addressed through param_7.
.func mandelbrot_ispc(
.param .b32 mandelbrot_ispc_param_0,
.param .b32 mandelbrot_ispc_param_1,
.param .b32 mandelbrot_ispc_param_2,
.param .b32 mandelbrot_ispc_param_3,
.param .b32 mandelbrot_ispc_param_4,
.param .b32 mandelbrot_ispc_param_5,
.param .b32 mandelbrot_ispc_param_6,
.param .b64 mandelbrot_ispc_param_7
) // @mandelbrot_ispc
{
// Parameterized register declarations (%p0..%p395 and so on). The body only
// touches %p0-%p1, %r0-%r12, %rl0-%rl2 and %f0-%f9; %rc, %rs and %fl are
// declared but never referenced.
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %allocas
// Early out: nothing to do when the row count (param_5) is < 1.
ld.param.u32 %r0, [mandelbrot_ispc_param_5];
setp.lt.s32 %p0, %r0, 1;
@%p0 bra BB9_18;
// BB#1: // %outer_not_in_extras.preheader.lr.ph
// Load the remaining parameters and precompute the per-cell steps:
//   %f3 = (param_2 - param_0) / float(param_4)   step along the inner index
//   %f2 = (param_3 - param_1) / float(param_5)   step along the outer index
ld.param.f32 %f0, [mandelbrot_ispc_param_0];
ld.param.f32 %f1, [mandelbrot_ispc_param_1];
ld.param.f32 %f3, [mandelbrot_ispc_param_2];
ld.param.f32 %f2, [mandelbrot_ispc_param_3];
ld.param.u32 %r1, [mandelbrot_ispc_param_4];
ld.param.u32 %r2, [mandelbrot_ispc_param_6];
ld.param.u64 %rl0, [mandelbrot_ispc_param_7];
sub.f32 %f3, %f3, %f0;
cvt.rn.f32.s32 %f4, %r1;
sub.f32 %f2, %f2, %f1;
cvt.rn.f32.s32 %f5, %r0;
div.rn.f32 %f2, %f2, %f5;
div.rn.f32 %f3, %f3, %f4;
// %r3 = outer (row) counter, starting at 0.
// %r4 = initial "active" mask word: all ones when the iteration cap is > 0,
// otherwise 0. Only its sign bit is tested by the loops below.
setp.gt.s32 %p0, %r2, 0;
mov.u32 %r3, 0;
selp.b32 %r4, -1, 0, %p0;
BB9_2: // %outer_not_in_extras.preheader
// =>This Loop Header: Depth=1
// Child Loop BB9_13 Depth 2
// Child Loop BB9_4 Depth 2
// Child Loop BB9_9 Depth 3
// Row loop header. If the inner trip count is < 1 there are no cells in the
// row; control goes to BB9_19 with %p0 still true.
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB9_19;
// BB#3: // %foreach_full_body.lr.ph
// in Loop: Header=BB9_2 Depth=1
// Per-row setup:
//   %rl1 = inner (column) counter, kept as a 64-bit value
//   %f4  = %f1 + row * %f2   (imaginary part of c for this row)
//   %r5  = row * inner_count (element index of the first cell in the row)
mov.u64 %rl1, 0;
cvt.rn.f32.s32 %f4, %r3;
fma.rn.f32 %f4, %f2, %f4, %f1;
mul.lo.s32 %r5, %r3, %r1;
BB9_4: // %foreach_full_body
// Parent Loop BB9_2 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB9_9 Depth 3
// Per-cell setup:
//   %p0  = sign bit of %r4 set, i.e. the iteration cap is positive
//   %r6  = column index (low 32 bits of %rl1)
//   %f5  = %f0 + column * %f3 (real part of c)
//   %r8  = constant 0, %r10 = active mask, %r9 = accumulated "escaped" mask
//   %r7  = iteration count, z = (%f7, %f6) starts at c = (%f5, %f4)
setp.lt.s32 %p0, %r4, 0;
cvt.u32.u64 %r6, %rl1;
cvt.rn.f32.s32 %f5, %r6;
fma.rn.f32 %f5, %f3, %f5, %f0;
mov.u32 %r8, 0;
mov.u32 %r10, %r4;
mov.u32 %r9, %r8;
mov.u32 %r7, %r8;
mov.f32 %f7, %f5;
mov.f32 %f6, %f4;
// Enter the iteration loop only when the mask is live; otherwise BB9_5
// stores a count of 0.
@%p0 bra BB9_9;
bra.uni BB9_5;
BB9_9: // %for_loop.i281
// Parent Loop BB9_2 Depth=1
// Parent Loop BB9_4 Depth=2
// => This Inner Loop Header: Depth=3
// Escape test: %f9 = re^2 + im^2, compared against 0f40800000 (4.0f).
// setp.gtu is "greater than or unordered", so a NaN magnitude also counts
// as escaped. The active mask is OR-ed into %r9 when the test fires.
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p0, %f9, 0f40800000;
selp.b32 %r11, %r10, 0, %p0;
or.b32 %r9, %r11, %r9;
// Compare the sign bits of the escaped mask and the active mask. Equal
// means every active lane has escaped, so BB9_10 clears the active mask;
// otherwise BB9_7 performs the z update.
shr.u32 %r11, %r9, 31;
shr.u32 %r12, %r10, 31;
setp.eq.s32 %p0, %r11, %r12;
@%p0 bra BB9_10;
bra.uni BB9_7;
BB9_10: // in Loop: Header=BB9_9 Depth=3
// All active lanes escaped: active mask = 0 (%r8 holds 0).
mov.u32 %r10, %r8;
bra.uni BB9_8;
BB9_7: // %not_all_continued_or_breaked.i295
// in Loop: Header=BB9_9 Depth=3
// Remove escaped bits from the active mask, then advance z:
//   new_re = c_re + (re^2 - im^2)
//   new_im = im * (2 * re) + c_im
// %f8 still holds re^2 from the escape test above.
mul.f32 %f9, %f6, %f6;
not.b32 %r11, %r9;
and.b32 %r10, %r10, %r11;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f5, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f4;
mov.f32 %f7, %f8;
BB9_8: // %for_step.i264
// in Loop: Header=BB9_9 Depth=3
// Loop step: the count goes up by 1 only while the active mask is non-zero;
// the mask is cleared once the count reaches the cap (%r2). The loop exits
// to BB9_6 when the mask's sign bit is clear (%r10 > -1).
setp.ne.s32 %p0, %r10, 0;
selp.u32 %r11, 1, 0, %p0;
add.s32 %r7, %r7, %r11;
setp.lt.s32 %p0, %r7, %r2;
selp.b32 %r10, %r10, 0, %p0;
setp.gt.s32 %p0, %r10, -1;
@%p0 bra BB9_6;
bra.uni BB9_9;
BB9_5: // in Loop: Header=BB9_4 Depth=2
// Iteration loop skipped: result is 0.
mov.u32 %r7, %r8;
BB9_6: // %mandel___vyfvyfvyi.exit296
// in Loop: Header=BB9_4 Depth=2
// Store the count at base + 4 * (column + row * inner_count).
// NOTE(review): the element index and the shift by 2 are computed in 32
// bits and only then sign-extended, so the byte offset wraps for grids
// whose byte size exceeds the signed 32-bit range.
// The store carries no state-space qualifier (generic addressing).
add.s32 %r6, %r6, %r5;
shl.b32 %r6, %r6, 2;
cvt.s64.s32 %rl2, %r6;
add.s64 %rl2, %rl2, %rl0;
st.u32 [%rl2], %r7;
// Next column; leave the row once the counter equals the inner trip count.
add.s64 %rl1, %rl1, 1;
cvt.u32.u64 %r6, %rl1;
setp.eq.s32 %p0, %r6, %r1;
@%p0 bra BB9_17;
bra.uni BB9_4;
BB9_19: // %partial_inner_all_outer
// in Loop: Header=BB9_2 Depth=1
// Only reached from the branch at the top of BB9_2, where %p0 is true, so
// this branch is always taken. NOTE(review): as a consequence none of the
// code from BB#20 through BB#16 is reachable via any branch visible in this
// function; it is left exactly as the compiler emitted it.
@%p0 bra BB9_17;
// BB#20: // %partial_inner_only
// in Loop: Header=BB9_2 Depth=1
// Single-cell variant of the per-cell setup for inner index 0:
//   %p0 = inner_count > 0 (gates the masks below)
//   %f4 = %f0 + 0 * %f3 (real part of c), %f5 = %f1 + row * %f2 (imaginary)
//   %r8 = active mask, %r7 = escaped mask, %r5 = iteration count, %r6 = 0
// Note that %f4/%f5 and %r5/%r6 have different roles here than in BB9_4.
setp.gt.s32 %p0, %r1, 0;
mov.u32 %r6, 0;
fma.rn.f32 %f4, %f3, 0f00000000, %f0;
cvt.rn.f32.s32 %f5, %r3;
fma.rn.f32 %f5, %f2, %f5, %f1;
selp.b32 %r5, %r4, 0, %p0;
setp.lt.s32 %p1, %r5, 0;
mov.u32 %r8, %r4;
mov.u32 %r7, %r6;
mov.u32 %r5, %r6;
mov.f32 %f7, %f4;
mov.f32 %f6, %f5;
@%p1 bra BB9_13;
bra.uni BB9_21;
BB9_13: // %for_loop.i332
// Parent Loop BB9_2 Depth=1
// => This Inner Loop Header: Depth=2
// Same escape test as BB9_9, except that both mask words are additionally
// gated by %p0 before their sign bits are compared.
selp.b32 %r9, %r8, 0, %p0;
mul.f32 %f8, %f7, %f7;
fma.rn.f32 %f9, %f6, %f6, %f8;
setp.gtu.f32 %p1, %f9, 0f40800000;
selp.b32 %r10, %r8, 0, %p1;
or.b32 %r7, %r10, %r7;
selp.b32 %r10, %r7, 0, %p0;
shr.u32 %r10, %r10, 31;
shr.u32 %r9, %r9, 31;
setp.eq.s32 %p1, %r10, %r9;
@%p1 bra BB9_14;
bra.uni BB9_11;
BB9_14: // in Loop: Header=BB9_13 Depth=2
// All active lanes escaped: active mask = 0 (%r6 holds 0).
mov.u32 %r8, %r6;
bra.uni BB9_12;
BB9_11: // %not_all_continued_or_breaked.i346
// in Loop: Header=BB9_13 Depth=2
// Same z update as BB9_7, with c = (%f4, %f5) in this region.
mul.f32 %f9, %f6, %f6;
not.b32 %r9, %r7;
and.b32 %r8, %r8, %r9;
sub.f32 %f8, %f8, %f9;
add.f32 %f8, %f4, %f8;
add.f32 %f7, %f7, %f7;
fma.rn.f32 %f6, %f6, %f7, %f5;
mov.f32 %f7, %f8;
BB9_12: // %for_step.i313
// in Loop: Header=BB9_13 Depth=2
// Same step as BB9_8: count in %r5, cap in %r2, exit test on the %p0-gated
// active mask.
setp.ne.s32 %p1, %r8, 0;
selp.u32 %r9, 1, 0, %p1;
add.s32 %r5, %r5, %r9;
setp.lt.s32 %p1, %r5, %r2;
selp.b32 %r8, %r8, 0, %p1;
selp.b32 %r9, %r8, 0, %p0;
setp.gt.s32 %p1, %r9, -1;
@%p1 bra BB9_15;
bra.uni BB9_13;
BB9_21: // in Loop: Header=BB9_2 Depth=1
// Iteration loop skipped: result is 0.
mov.u32 %r5, %r6;
BB9_15: // %mandel___vyfvyfvyi.exit347
// in Loop: Header=BB9_2 Depth=1
// Skip the store when there is no cell in the row.
setp.lt.s32 %p0, %r1, 1;
@%p0 bra BB9_17;
// BB#16: // %pl_dolane.i
// in Loop: Header=BB9_2 Depth=1
// Store the count for inner index 0 of this row at
// base + 4 * (row * inner_count); same 32-bit offset arithmetic as BB9_6.
mul.lo.s32 %r6, %r3, %r1;
shl.b32 %r6, %r6, 2;
cvt.s64.s32 %rl1, %r6;
add.s64 %rl1, %rl1, %rl0;
st.u32 [%rl1], %r5;
BB9_17: // %foreach_reset
// in Loop: Header=BB9_2 Depth=1
// Next row; finish once the row counter equals the outer trip count.
add.s32 %r3, %r3, 1;
setp.eq.s32 %p0, %r3, %r0;
@%p0 bra BB9_18;
bra.uni BB9_2;
BB9_18: // %for_exit
ret;
}

Binary file not shown.

Binary file not shown.