Add AVX-512 (KNL / SKX) host ISA detection.

* builtins/dispatch.ll, check_isa.cpp, ispc.cpp: query cpuid leaf 7 (ecx=0)
  up front and report KNL or SKX when AVX2, AVX512F, the matching AVX-512
  feature bits and OS support for the AVX-512 register state (xgetbv,
  mask 0xE6) are all present; otherwise fall through to the existing
  AVX/SSE detection.
* ispc.cpp: accept "knl" and "skx" as ISA names; both are still mapped to
  the AVX2 target for code generation (see the TODO comments).
* ispc.h: add Target::KNL = 5 and Target::SKX = 6; GENERIC moves to 7.

The integers returned by __get_system_isa() in dispatch.ll must stay in
sync with the Target::ISA enumerants in ispc.h.

The abort() call in the IR keeps its explicit "noreturn nounwind"
attributes, so it does not depend on an attribute group being defined
elsewhere in dispatch.ll.

NOTE(review): cpuid leaf 7 is now read unconditionally instead of only
after AVX + F16C + RDRAND were confirmed. Presumably this should be guarded
by a maximum-leaf check (cpuid eax=0) for very old CPUs -- confirm.

diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll
index 799fb3eb..d4cd4e1d 100644
--- a/builtins/dispatch.ll
+++ b/builtins/dispatch.ll
@@ -74,20 +74,54 @@
 ;;     return (rEAX & 6) == 6;
 ;; }
 ;;
+;; static int __os_has_avx512_support() {
+;;     // Check if the OS saves the XMM, YMM, opmask and ZMM state (XCR0 bits 1, 2, 5, 6, 7 = 0xE6), i.e. it supports AVX2 and AVX512.
+;;     // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
+;;     // Check xgetbv; this uses a .byte sequence instead of the instruction
+;;     // directly because older assemblers do not include support for xgetbv and
+;;     // there is no easy way to conditionally compile based on the assembler used.
+;;     int rEAX, rEDX;
+;;     __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+;;     return (rEAX & 0xE6) == 0xE6;
+;; }
+;;
 ;; int32_t __get_system_isa() {
 ;;     int info[4];
 ;;     __cpuid(info, 1);
 ;;
+;;     // Call cpuid with eax=7, ecx=0
+;;     int info2[4];
+;;     __cpuid_count(info2, 7, 0);
+;;
 ;;     // NOTE: the values returned below must be the same as the
 ;;     // corresponding enumerant values in Target::ISA.
+;;     if ((info2[1] & (1 << 5)) != 0 && // AVX2
+;;         (info2[1] & (1 << 16)) != 0 && // AVX512 F
+;;         __os_has_avx512_support()) {
+;;         // We need to verify that AVX2 is also available,
+;;         // as well as AVX512, because our targets are supposed
+;;         // to use both.
+;;
+;;         if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
+;;             (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
+;;             (info2[1] & (1 << 30)) != 0 && // AVX512 BW
+;;             (info2[1] & (1u << 31)) != 0) { // AVX512 VL
+;;             return 6; // SKX
+;;         }
+;;         else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
+;;                  (info2[1] & (1 << 27)) != 0 && // AVX512 ER
+;;                  (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
+;;             return 5; // KNL
+;;         }
+;;         // If it's unknown AVX512 target, fall through and use AVX2
+;;         // or whatever is available in the machine.
+;;     }
+;;
 ;;     if ((info[2] & (1 << 28)) != 0 &&
 ;;         __os_has_avx_support()) {
 ;;         if ((info[2] & (1 << 29)) != 0 && // F16C
 ;;             (info[2] & (1 << 30)) != 0) { // RDRAND
 ;;             // So far, so good. AVX2?
-;;             // Call cpuid with eax=7, ecx=0
-;;             int info2[4];
-;;             __cpuid_count(info2, 7, 0);
 ;;             if ((info2[1] & (1 << 5)) != 0)
 ;;                 return 4;
 ;;             else
@@ -104,6 +138,7 @@
 ;;     abort();
 ;; }
 
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; LLVM has different IR for different versions since 3.7
 
@@ -122,46 +157,68 @@ entry:
   %0 = tail call { i32, i32, i32, i32 } asm sideeffect "cpuid", "={ax},={bx},={cx},={dx},0,~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
   %asmresult5.i = extractvalue { i32, i32, i32, i32 } %0, 2
   %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
-  %and = and i32 %asmresult5.i, 268435456
-  %cmp = icmp eq i32 %and, 0
-  br i1 %cmp, label %if.else14, label %land.lhs.true
+  %1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
+  %asmresult4.i78 = extractvalue { i32, i32, i32, i32 } %1, 1
+  %2 = and i32 %asmresult4.i78, 65568
+  %3 = icmp eq i32 %2, 65568
+  br i1 %3, label %land.lhs.true5, label %if.end35
 
-land.lhs.true:                                    ; preds = %entry
-  %1 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
-  %asmresult.i25 = extractvalue { i32, i32 } %1, 0
-  %and.i = and i32 %asmresult.i25, 6
-  %cmp.i = icmp eq i32 %and.i, 6
-  br i1 %cmp.i, label %if.then, label %if.else14
+land.lhs.true5:                                   ; preds = %entry
+  %4 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
+  %asmresult.i81 = extractvalue { i32, i32 } %4, 0
+  %and.i = and i32 %asmresult.i81, 230
+  %cmp.i = icmp eq i32 %and.i, 230
+  br i1 %cmp.i, label %if.then, label %if.end35
 
-if.then:                                          ; preds = %land.lhs.true
-  %2 = and i32 %asmresult5.i, 1610612736
-  %3 = icmp eq i32 %2, 1610612736
-  br i1 %3, label %if.then8, label %return
+if.then:                                          ; preds = %land.lhs.true5
+  %5 = and i32 %asmresult4.i78, -805175296
+  %6 = icmp eq i32 %5, -805175296
+  br i1 %6, label %return, label %if.else
 
-if.then8:                                         ; preds = %if.then
-  %4 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
-  %asmresult4.i30 = extractvalue { i32, i32, i32, i32 } %4, 1
-  %and11 = lshr i32 %asmresult4.i30, 5
-  %5 = and i32 %and11, 1
-  %6 = add i32 %5, 3
+if.else:                                          ; preds = %if.then
+  %7 = and i32 %asmresult4.i78, 469762048
+  %8 = icmp eq i32 %7, 469762048
+  br i1 %8, label %return, label %if.end35
+
+if.end35:                                         ; preds = %if.else, %land.lhs.true5, %entry
+  %and37 = and i32 %asmresult5.i, 268435456
+  %cmp38 = icmp eq i32 %and37, 0
+  br i1 %cmp38, label %if.else57, label %land.lhs.true39
+
+land.lhs.true39:                                  ; preds = %if.end35
+  %9 = tail call { i32, i32 } asm sideeffect ".byte 0x0f, 0x01, 0xd0", "={ax},={dx},{cx},~{dirflag},~{fpsr},~{flags}"(i32 0) nounwind
+  %asmresult.i82 = extractvalue { i32, i32 } %9, 0
+  %and.i83 = and i32 %asmresult.i82, 6
+  %cmp.i84 = icmp eq i32 %and.i83, 6
+  br i1 %cmp.i84, label %if.then42, label %if.else57
+
+if.then42:                                        ; preds = %land.lhs.true39
+  %10 = and i32 %asmresult5.i, 1610612736
+  %11 = icmp eq i32 %10, 1610612736
+  br i1 %11, label %if.then50, label %return
+
+if.then50:                                        ; preds = %if.then42
+  %and = lshr i32 %asmresult4.i78, 5
+  %12 = and i32 %and, 1
+  %13 = add nuw nsw i32 %12, 3
   br label %return
 
-if.else14:                                        ; preds = %land.lhs.true, %entry
-  %and16 = and i32 %asmresult5.i, 524288
-  %cmp17 = icmp eq i32 %and16, 0
-  br i1 %cmp17, label %if.else19, label %return
+if.else57:                                        ; preds = %land.lhs.true39, %if.end35
+  %and59 = and i32 %asmresult5.i, 524288
+  %cmp60 = icmp eq i32 %and59, 0
+  br i1 %cmp60, label %if.else62, label %return
 
-if.else19:                                        ; preds = %if.else14
-  %and21 = and i32 %asmresult6.i, 67108864
-  %cmp22 = icmp eq i32 %and21, 0
-  br i1 %cmp22, label %if.else24, label %return
+if.else62:                                        ; preds = %if.else57
+  %and64 = and i32 %asmresult6.i, 67108864
+  %cmp65 = icmp eq i32 %and64, 0
+  br i1 %cmp65, label %if.else67, label %return
 
-if.else24:                                        ; preds = %if.else19
+if.else67:                                        ; preds = %if.else62
   tail call void @abort() noreturn nounwind
   unreachable
 
-return:                                           ; preds = %if.else19, %if.else14, %if.then8, %if.then
-  %retval.0 = phi i32 [ %6, %if.then8 ], [ 2, %if.then ], [ 1, %if.else14 ], [ 0, %if.else19 ]
+return:                                           ; preds = %if.else62, %if.else57, %if.then50, %if.then42, %if.else, %if.then
+  %retval.0 = phi i32 [ 6, %if.then ], [ 5, %if.else ], [ %13, %if.then50 ], [ 2, %if.then42 ], [ 1, %if.else57 ], [ 0, %if.else62 ]
   ret i32 %retval.0
 }
 
diff --git a/check_isa.cpp b/check_isa.cpp
index a4d10606..991f467b 100644
--- a/check_isa.cpp
+++ b/check_isa.cpp
@@ -78,6 +78,22 @@ static bool __os_has_avx_support() {
     return (rEAX & 6) == 6;
 #endif // !defined(ISPC_IS_WINDOWS)
 }
+
+static bool __os_has_avx512_support() {
+#if defined(ISPC_IS_WINDOWS)
+    // Check if the OS saves the XMM, YMM, opmask and ZMM state (XCR0 bits 1, 2, 5, 6, 7 = 0xE6), i.e. it supports AVX2 and AVX512.
+    // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
+    unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+    return (xcrFeatureMask & 0xE6) == 0xE6;
+#else // !defined(ISPC_IS_WINDOWS)
+    // Check xgetbv; this uses a .byte sequence instead of the instruction
+    // directly because older assemblers do not include support for xgetbv and
+    // there is no easy way to conditionally compile based on the assembler used.
+    int rEAX, rEDX;
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+    return (rEAX & 0xE6) == 0xE6;
+#endif // !defined(ISPC_IS_WINDOWS)
+}
 #endif // !__arm__
 
 
@@ -89,6 +105,32 @@ lGetSystemISA() {
     int info[4];
     __cpuid(info, 1);
 
+    int info2[4];
+    // Call cpuid with eax=7, ecx=0
+    __cpuidex(info2, 7, 0);
+
+    if ((info2[1] & (1 << 5)) != 0 && // AVX2
+        (info2[1] & (1 << 16)) != 0 && // AVX512 F
+        __os_has_avx512_support()) {
+        // We need to verify that AVX2 is also available,
+        // as well as AVX512, because our targets are supposed
+        // to use both.
+
+        if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
+            (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
+            (info2[1] & (1 << 30)) != 0 && // AVX512 BW
+            (info2[1] & (1u << 31)) != 0) { // AVX512 VL
+            return "SKX";
+        }
+        else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
+                 (info2[1] & (1 << 27)) != 0 && // AVX512 ER
+                 (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
+            return "KNL";
+        }
+        // If it's unknown AVX512 target, fall through and use AVX2
+        // or whatever is available in the machine.
+    }
+
     if ((info[2] & (1 << 28)) != 0 &&
         __os_has_avx_support()) { // AVX
         // AVX1 for sure....
@@ -96,9 +138,6 @@ lGetSystemISA() {
         if ((info[2] & (1 << 29)) != 0 && // F16C
             (info[2] & (1 << 30)) != 0) { // RDRAND
             // So far, so good. AVX2?
-            // Call cpuid with eax=7, ecx=0
-            int info2[4];
-            __cpuidex(info2, 7, 0);
             if ((info2[1] & (1 << 5)) != 0) {
                 return "AVX2 (codename Haswell)";
             }
diff --git a/ispc.cpp b/ispc.cpp
index ae9816ef..f5a5ec59 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -123,6 +123,22 @@ static bool __os_has_avx_support() {
     return (rEAX & 6) == 6;
 #endif // !defined(ISPC_IS_WINDOWS)
 }
+
+static bool __os_has_avx512_support() {
+#if defined(ISPC_IS_WINDOWS)
+    // Check if the OS saves the XMM, YMM, opmask and ZMM state (XCR0 bits 1, 2, 5, 6, 7 = 0xE6), i.e. it supports AVX2 and AVX512.
+    // See section 2.1 of software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf
+    unsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+    return (xcrFeatureMask & 0xE6) == 0xE6;
+#else // !defined(ISPC_IS_WINDOWS)
+    // Check xgetbv; this uses a .byte sequence instead of the instruction
+    // directly because older assemblers do not include support for xgetbv and
+    // there is no easy way to conditionally compile based on the assembler used.
+    int rEAX, rEDX;
+    __asm__ __volatile__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
+    return (rEAX & 0xE6) == 0xE6;
+#endif // !defined(ISPC_IS_WINDOWS)
+}
 #endif // !__arm__
 
 static const char *
@@ -133,6 +149,32 @@ lGetSystemISA() {
     int info[4];
     __cpuid(info, 1);
 
+    int info2[4];
+    // Call cpuid with eax=7, ecx=0
+    __cpuidex(info2, 7, 0);
+
+    if ((info2[1] & (1 << 5)) != 0 && // AVX2
+        (info2[1] & (1 << 16)) != 0 && // AVX512 F
+        __os_has_avx512_support()) {
+        // We need to verify that AVX2 is also available,
+        // as well as AVX512, because our targets are supposed
+        // to use both.
+
+        if ((info2[1] & (1 << 17)) != 0 && // AVX512 DQ
+            (info2[1] & (1 << 28)) != 0 && // AVX512 CDI
+            (info2[1] & (1 << 30)) != 0 && // AVX512 BW
+            (info2[1] & (1u << 31)) != 0) { // AVX512 VL
+            return "skx";
+        }
+        else if ((info2[1] & (1 << 26)) != 0 && // AVX512 PF
+                 (info2[1] & (1 << 27)) != 0 && // AVX512 ER
+                 (info2[1] & (1 << 28)) != 0) { // AVX512 CDI
+            return "knl";
+        }
+        // If it's unknown AVX512 target, fall through and use AVX2
+        // or whatever is available in the machine.
+    }
+
     if ((info[2] & (1 << 28)) != 0 &&
         __os_has_avx_support()) { // AVX
         // AVX1 for sure....
@@ -140,9 +182,6 @@ lGetSystemISA() {
         if ((info[2] & (1 << 29)) != 0 && // F16C
             (info[2] & (1 << 30)) != 0) { // RDRAND
             // So far, so good. AVX2?
-            // Call cpuid with eax=7, ecx=0
-            int info2[4];
-            __cpuidex(info2, 7, 0);
             if ((info2[1] & (1 << 5)) != 0)
                 return "avx2-i32x8";
             else
@@ -764,7 +803,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         CPUfromISA = CPU_IvyBridge;
     }
     else if (!strcasecmp(isa, "avx2") ||
-             !strcasecmp(isa, "avx2-i32x8")) {
+             !strcasecmp(isa, "avx2-i32x8") ||
+             // TODO: enable knl and skx support
+             // They are downconverted to avx2 for code generation.
+             !strcasecmp(isa, "skx") ||
+             !strcasecmp(isa, "knl")) {
         this->m_isa = Target::AVX2;
         this->m_nativeVectorWidth = 8;
         this->m_nativeVectorAlignment = 32;
@@ -1091,6 +1134,10 @@ Target::ISAToString(ISA isa) {
         return "avx11";
     case Target::AVX2:
         return "avx2";
+    case Target::KNL:
+        return "knl";
+    case Target::SKX:
+        return "skx";
     case Target::GENERIC:
         return "generic";
 #ifdef ISPC_NVPTX_ENABLED
@@ -1133,6 +1180,12 @@ Target::ISAToTargetString(ISA isa) {
         return "avx1.1-i32x8";
     case Target::AVX2:
         return "avx2-i32x8";
+    // TODO: enable knl and skx support.
+    // They are downconverted to avx2 for code generation.
+    case Target::KNL:
+        return "avx2";
+    case Target::SKX:
+        return "avx2";
     case Target::GENERIC:
         return "generic-4";
 #ifdef ISPC_NVPTX_ENABLED
diff --git a/ispc.h b/ispc.h
index cec56056..5af265fe 100644
--- a/ispc.h
+++ b/ispc.h
@@ -181,7 +181,9 @@ public:
         AVX = 2,
         AVX11 = 3,
         AVX2 = 4,
-        GENERIC = 5,
+        KNL = 5,
+        SKX = 6,
+        GENERIC = 7,
 #ifdef ISPC_NVPTX_ENABLED
         NVPTX,
 #endif