
Add clang include

Gogs, 4 years ago
Parent
Current commit
196b5b52c3
100 files changed, with 165155 insertions and 0 deletions
  1. demo/include/__clang_cuda_builtin_vars.h (+126, -0)
  2. demo/include/__clang_cuda_cmath.h (+472, -0)
  3. demo/include/__clang_cuda_complex_builtins.h (+203, -0)
  4. demo/include/__clang_cuda_intrinsics.h (+489, -0)
  5. demo/include/__clang_cuda_math_forward_declares.h (+286, -0)
  6. demo/include/__clang_cuda_runtime_wrapper.h (+381, -0)
  7. demo/include/__stddef_max_align_t.h (+43, -0)
  8. demo/include/__wmmintrin_aes.h (+151, -0)
  9. demo/include/__wmmintrin_pclmul.h (+57, -0)
  10. demo/include/adxintrin.h (+86, -0)
  11. demo/include/altivec.h (+16736, -0)
  12. demo/include/ammintrin.h (+193, -0)
  13. demo/include/arm64intr.h (+49, -0)
  14. demo/include/arm_acle.h (+626, -0)
  15. demo/include/arm_neon.h (+72599, -0)
  16. demo/include/armintr.h (+45, -0)
  17. demo/include/avx2intrin.h (+1308, -0)
  18. demo/include/avx512bitalgintrin.h (+97, -0)
  19. demo/include/avx512bwintrin.h (+2140, -0)
  20. demo/include/avx512cdintrin.h (+145, -0)
  21. demo/include/avx512dqintrin.h (+1331, -0)
  22. demo/include/avx512erintrin.h (+285, -0)
  23. demo/include/avx512fintrin.h (+10233, -0)
  24. demo/include/avx512ifmaintrin.h (+92, -0)
  25. demo/include/avx512ifmavlintrin.h (+149, -0)
  26. demo/include/avx512pfintrin.h (+111, -0)
  27. demo/include/avx512vbmi2intrin.h (+391, -0)
  28. demo/include/avx512vbmiintrin.h (+137, -0)
  29. demo/include/avx512vbmivlintrin.h (+247, -0)
  30. demo/include/avx512vlbitalgintrin.h (+157, -0)
  31. demo/include/avx512vlbwintrin.h (+2781, -0)
  32. demo/include/avx512vlcdintrin.h (+263, -0)
  33. demo/include/avx512vldqintrin.h (+1198, -0)
  34. demo/include/avx512vlintrin.h (+8547, -0)
  35. demo/include/avx512vlvbmi2intrin.h (+748, -0)
  36. demo/include/avx512vlvnniintrin.h (+254, -0)
  37. demo/include/avx512vnniintrin.h (+146, -0)
  38. demo/include/avx512vpopcntdqintrin.h (+70, -0)
  39. demo/include/avx512vpopcntdqvlintrin.h (+99, -0)
  40. demo/include/avxintrin.h (+5162, -0)
  41. demo/include/bmi2intrin.h (+95, -0)
  42. demo/include/bmiintrin.h (+382, -0)
  43. demo/include/cetintrin.h (+93, -0)
  44. demo/include/clflushoptintrin.h (+41, -0)
  45. demo/include/clwbintrin.h (+52, -0)
  46. demo/include/clzerointrin.h (+50, -0)
  47. demo/include/cpuid.h (+302, -0)
  48. demo/include/cuda_wrappers/algorithm (+96, -0)
  49. demo/include/cuda_wrappers/complex (+82, -0)
  50. demo/include/cuda_wrappers/new (+96, -0)
  51. demo/include/emmintrin.h (+4951, -0)
  52. demo/include/f16cintrin.h (+124, -0)
  53. demo/include/float.h (+160, -0)
  54. demo/include/fma4intrin.h (+230, -0)
  55. demo/include/fmaintrin.h (+228, -0)
  56. demo/include/fxsrintrin.h (+105, -0)
  57. demo/include/gfniintrin.h (+202, -0)
  58. demo/include/htmintrin.h (+226, -0)
  59. demo/include/htmxlintrin.h (+359, -0)
  60. demo/include/ia32intrin.h (+73, -0)
  61. demo/include/immintrin.h (+374, -0)
  62. demo/include/intrin.h (+969, -0)
  63. demo/include/inttypes.h (+106, -0)
  64. demo/include/iso646.h (+43, -0)
  65. demo/include/limits.h (+118, -0)
  66. demo/include/lwpintrin.h (+150, -0)
  67. demo/include/lzcntintrin.h (+118, -0)
  68. demo/include/mm3dnow.h (+171, -0)
  69. demo/include/mm_malloc.h (+75, -0)
  70. demo/include/mmintrin.h (+1570, -0)
  71. demo/include/module.modulemap (+167, -0)
  72. demo/include/msa.h (+583, -0)
  73. demo/include/mwaitxintrin.h (+47, -0)
  74. demo/include/nmmintrin.h (+30, -0)
  75. demo/include/opencl-c.h (+16391, -0)
  76. demo/include/pkuintrin.h (+48, -0)
  77. demo/include/pmmintrin.h (+304, -0)
  78. demo/include/popcntintrin.h (+98, -0)
  79. demo/include/prfchwintrin.h (+71, -0)
  80. demo/include/rdseedintrin.h (+56, -0)
  81. demo/include/rtmintrin.h (+59, -0)
  82. demo/include/s390intrin.h (+39, -0)
  83. demo/include/sanitizer/allocator_interface.h (+90, -0)
  84. demo/include/sanitizer/asan_interface.h (+155, -0)
  85. demo/include/sanitizer/common_interface_defs.h (+198, -0)
  86. demo/include/sanitizer/coverage_interface.h (+36, -0)
  87. demo/include/sanitizer/dfsan_interface.h (+116, -0)
  88. demo/include/sanitizer/esan_interface.h (+50, -0)
  89. demo/include/sanitizer/hwasan_interface.h (+33, -0)
  90. demo/include/sanitizer/linux_syscall_hooks.h (+3083, -0)
  91. demo/include/sanitizer/lsan_interface.h (+90, -0)
  92. demo/include/sanitizer/msan_interface.h (+111, -0)
  93. demo/include/sanitizer/scudo_interface.h (+34, -0)
  94. demo/include/sanitizer/tsan_interface.h (+144, -0)
  95. demo/include/sanitizer/tsan_interface_atomic.h (+222, -0)
  96. demo/include/shaintrin.h (+75, -0)
  97. demo/include/smmintrin.h (+2465, -0)
  98. demo/include/stdalign.h (+35, -0)
  99. demo/include/stdarg.h (+51, -0)
  100. demo/include/stdatomic.h (+0, -0)

demo/include/__clang_cuda_builtin_vars.h (+126, -0)

@@ -0,0 +1,126 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// Forward declares from vector_types.h.
+struct uint3;
+struct dim3;
+
+// The file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields get converted into calls to a
+// getter function which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+//    int x = threadIdx.x;
+// IR output:
+//  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+// PTX output:
+//  mov.u32     %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC)                                \
+  __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD;      \
+  static inline __attribute__((always_inline))                                 \
+      __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) {     \
+    return INTRINSIC;                                                          \
+  }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special variable types.  nvcc
+// also disallows taking address of special variables, so we disable address-of
+// operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
+  __attribute__((device)) TypeName() __DELETE;                                 \
+  __attribute__((device)) TypeName(const TypeName &) __DELETE;                 \
+  __attribute__((device)) void operator=(const TypeName &) const __DELETE;     \
+  __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
+  // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
+  // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
+  // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
+  // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR                                                     \
+  extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to read of %WARP_SZ but there's currently no
+// builtin to do so. According to PTX v4.2 docs 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
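
For orientation, a minimal usage sketch (not part of this commit; kernel and parameter names are hypothetical): a kernel that derives its global index from the built-in variables declared above. Each read goes through the __declspec(property) getter and lowers to the corresponding __nvvm_read_ptx_sreg_* intrinsic.

// Hypothetical kernel, assuming the usual CUDA toolchain setup.
__global__ void scale(float *out, const float *in, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x; // getters -> %ctaid.x, %ntid.x, %tid.x
  if (i < n)
    out[i] = in[i] * factor;
}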

demo/include/__clang_cuda_cmath.h (+472, -0)

@@ -0,0 +1,472 @@
+/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_CMATH_H__
+#define __CLANG_CUDA_CMATH_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+#include <limits>
+
+// CUDA lets us use various std math functions on the device side.  This file
+// works in concert with __clang_cuda_math_forward_declares.h to make this work.
+//
+// Specifically, the forward-declares header declares __device__ overloads for
+// these functions in the global namespace, then pulls them into namespace std
+// with 'using' statements.  Then this file implements those functions, after
+// their implementations have been pulled in.
+//
+// It's important that we declare the functions in the global namespace and pull
+// them into namespace std with using statements, as opposed to simply declaring
+// these functions in namespace std, because our device functions need to
+// overload the standard library functions, which may be declared in the global
+// namespace or in std, depending on the degree of conformance of the stdlib
+// implementation.  Declaring in the global namespace and pulling into namespace
+// std covers all of the known knowns.
+
+#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
+
+__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
+__DEVICE__ float cos(float __x) { return ::cosf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float exp(float __x) { return ::expf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ float floor(float __x) { return ::floorf(__x); }
+__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
+__DEVICE__ int fpclassify(float __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ int fpclassify(double __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ float frexp(float __arg, int *__exp) {
+  return ::frexpf(__arg, __exp);
+}
+
+// For inscrutable reasons, the CUDA headers define these functions for us on
+// Windows.
+#ifndef _MSC_VER
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for MacOS.  __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+#endif
+
+__DEVICE__ bool isgreater(float __x, float __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreater(double __x, double __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(float __x, float __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(double __x, double __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isless(float __x, float __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool isless(double __x, double __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool islessequal(float __x, float __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessequal(double __x, double __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessgreater(float __x, float __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool islessgreater(double __x, double __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isunordered(float __x, float __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ bool isunordered(double __x, double __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float log(float __x) { return ::logf(__x); }
+__DEVICE__ float log10(float __x) { return ::log10f(__x); }
+__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
+__DEVICE__ float pow(float __base, float __exp) {
+  return ::powf(__base, __exp);
+}
+__DEVICE__ float pow(float __base, int __iexp) {
+  return ::powif(__base, __iexp);
+}
+__DEVICE__ double pow(double __base, int __iexp) {
+  return ::powi(__base, __iexp);
+}
+__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
+__DEVICE__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+
+// Notably missing above is nexttoward.  We omit it because
+// libdevice doesn't provide an implementation, and we don't want to be in the
+// business of implementing tricky libm functions in this header.
+
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h.  We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int).  The CUDA headers define
+//    only sin(float) and sin(double), which means that e.g. sin(0) is
+//    ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+//    std.  These are defined in the CUDA headers in the global namespace,
+//    independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible.  But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+  typedef __T type;
+};
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn)                      \
+  template <typename __T>                                                      \
+  __DEVICE__                                                                   \
+      typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+      __fn(__T __x) {                                                          \
+    return ::__fn((double)__x);                                                \
+  }
+
+// Defines an overload of __fn that accepts two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns a double.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn)                      \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__ typename __clang_cuda_enable_if<                                  \
+      std::numeric_limits<__T1>::is_specialized &&                             \
+          std::numeric_limits<__T2>::is_specialized,                           \
+      __retty>::type                                                           \
+  __fn(__T1 __x, __T2 __y) {                                                   \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized &&
+        std::numeric_limits<__T3>::is_specialized,
+    double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+  return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+frexp(__T __x, int *__exp) {
+  return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+ldexp(__T __x, int __exp) {
+  return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized,
+    double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+  return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbln(__T __x, long __exp) {
+  return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbn(__T __x, int __exp) {
+  return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#undef __DEVICE__
+
+#endif
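
As a rough illustration (not part of this commit; function and variable names are hypothetical), device code can now mix integral and floating arguments with the std math names:

// Hypothetical device function: sin(int) resolves via the integer overloads
// generated above (argument promoted to double), and sqrtf is reachable as
// std::sqrtf because of the using-declarations that pull it into std.
__device__ float blend(int k, float x) {
  double s = std::sin(k);  // without the overloads above, sin(k) would be ambiguous
  return std::sqrtf(x) + static_cast<float>(s);
}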

demo/include/__clang_cuda_complex_builtins.h (+203, -0)

@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3.  These are
+// libgcc functions that clang assumes are available when compiling c99 complex
+// operations.  (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+                                                      double __c, double __d) {
+  double __ac = __a * __c;
+  double __bd = __b * __d;
+  double __ad = __a * __d;
+  double __bc = __b * __c;
+  double _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+      // a device overload (and isn't constexpr before C++11, naturally).
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  float __ac = __a * __c;
+  float __bd = __b * __d;
+  float __ad = __a * __d;
+  float __bc = __b * __c;
+  float _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+                                                      double __c, double __d) {
+  int __ilogbw = 0;
+  // Can't use std::max, because that's defined in <algorithm>, and we don't
+  // want to pull that in for every compile.  The CUDA headers define
+  // ::max(float, float) and ::max(double, double), which is sufficient for us.
+  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  double __denom = __c * __c + __d * __d;
+  double _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+      __real__(z) = 0.0 * (__a * __c + __b * __d);
+      __imag__(z) = 0.0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  int __ilogbw = 0;
+  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  float __denom = __c * __c + __d * __d;
+  float _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      __real__(z) = 0 * (__a * __c + __b * __d);
+      __imag__(z) = 0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
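
For context, a small sketch (not part of this commit; names are hypothetical) of the device code these helpers exist to support; whether clang inlines the arithmetic or emits calls to the helpers depends on how the complex operation is lowered:

// Hypothetical device function: _Complex multiplication may fall back to
// __muldc3 for the NaN/infinity fix-up path, and division calls __divdc3.
__device__ double _Complex rotate_and_divide(double _Complex a,
                                             double _Complex b) {
  double _Complex p = a * b;
  return p / b;
}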

demo/include/__clang_cuda_intrinsics.h (+489, -0)

@@ -0,0 +1,489 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
+                        __Type)                                                \
+  inline __device__ int __FnName(int __val, __Type __offset,                   \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__val, __offset,                                     \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(float __val, __Type __offset,               \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__val, __offset,                                   \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__val), __offset, __width));               \
+  }                                                                            \
+  inline __device__ long long __FnName(long long __val, __Type __offset,       \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ long __FnName(long __val, __Type __offset,                 \
+                                  int __width = warpSize) {                    \
+    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
+                   sizeof(long) == sizeof(int));                               \
+    if (sizeof(long) == sizeof(long long)) {                                   \
+      return static_cast<long>(                                                \
+          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
+    } else if (sizeof(long) == sizeof(int)) {                                  \
+      return static_cast<long>(                                                \
+          ::__FnName(static_cast<int>(__val), __offset, __width));             \
+    }                                                                          \
+  }                                                                            \
+  inline __device__ unsigned long __FnName(                                    \
+      unsigned long __val, __Type __offset, int __width = warpSize) {          \
+    return static_cast<unsigned long>(                                         \
+        ::__FnName(static_cast<long>(__val), __offset, __width));              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        static_cast<unsigned long long>(__val), __offset, __width));           \
+  }                                                                            \
+  inline __device__ double __FnName(double __val, __Type __offset,             \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
+                int);
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync_* variants available in CUDA-9
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
+                             __Mask, __Type)                                   \
+  inline __device__ int __FnName(unsigned int __mask, int __val,               \
+                                 __Type __offset, int __width = warpSize) {    \
+    return __IntIntrinsic(__mask, __val, __offset,                             \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(unsigned int __mask, float __val,           \
+                                   __Type __offset, int __width = warpSize) {  \
+    return __FloatIntrinsic(__mask, __val, __offset,                           \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
+                                          unsigned int __val, __Type __offset, \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
+  }                                                                            \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
+                                       __Type __offset,                        \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned int __mask, unsigned long long __val, __Type __offset,          \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        __mask, static_cast<unsigned long long>(__val), __offset, __width));   \
+  }                                                                            \
+  inline __device__ long __FnName(unsigned int __mask, long __val,             \
+                                  __Type __offset, int __width = warpSize) {   \
+    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
+                   sizeof(long) == sizeof(int));                               \
+    if (sizeof(long) == sizeof(long long)) {                                   \
+      return static_cast<long>(::__FnName(                                     \
+          __mask, static_cast<long long>(__val), __offset, __width));          \
+    } else if (sizeof(long) == sizeof(int)) {                                  \
+      return static_cast<long>(                                                \
+          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
+    }                                                                          \
+  }                                                                            \
+  inline __device__ unsigned long __FnName(                                    \
+      unsigned int __mask, unsigned long __val, __Type __offset,               \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long>(                                         \
+        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
+  }                                                                            \
+  inline __device__ double __FnName(unsigned int __mask, double __val,         \
+                                    __Type __offset, int __width = warpSize) { \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+  return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+  __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+                                            unsigned int count) {
+  __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
+  return __nvvm_fns(mask, base, offset);
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// Define __match* builtins CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+inline __device__ unsigned int __match32_any_sync(unsigned int mask,
+                                                  unsigned int value) {
+  return __nvvm_match_any_sync_i32(mask, value);
+}
+
+inline __device__ unsigned long long
+__match64_any_sync(unsigned int mask, unsigned long long value) {
+  return __nvvm_match_any_sync_i64(mask, value);
+}
+
+inline __device__ unsigned int
+__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
+  return __nvvm_match_all_sync_i32p(mask, value, pred);
+}
+
+inline __device__ unsigned long long
+__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+  return __nvvm_match_all_sync_i64p(mask, value, pred);
+}
+#include "crt/sm_70_rt.hpp"
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#endif // CUDA_VERSION >= 9000
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to c2's alignment.  This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
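+
+// Illustrative usage sketch (an addition for this demo, not part of the
+// upstream header): __ldg loads through the read-only data cache, so it is
+// only safe for memory that no thread writes during the kernel's lifetime.
+// A typical pattern (hypothetical kernel) looks like:
+//
+//   __global__ void scale(const float *__restrict__ in, float *out,
+//                         float s, int n) {
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n)
+//       out[i] = __ldg(&in[i]) * s;
+//   }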
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these.  Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
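+
+// Usage sketch (illustrative, not part of the upstream header): each funnel
+// shift operates on the 64-bit value high32:low32.  __funnelshift_l returns
+// the upper 32 bits of (high32:low32) << (shiftWidth & 31); the *_lc/*_rc
+// variants clamp the shift amount to 32 instead of wrapping it.  For example:
+//
+//   unsigned hi = 0x12345678u, lo = 0x9abcdef0u;
+//   unsigned r  = __funnelshift_l(lo, hi, 8);  // r == 0x3456789au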
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // defined(__CLANG_CUDA_INTRINSICS_H__)

+ 286 - 0
demo/include/__clang_cuda_math_forward_declares.h

@@ -0,0 +1,286 @@
+/*===- __clang_cuda_math_forward_declares.h - Prototypes of __device__ math fns -===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// This file forward-declares some math functions we (or the CUDA headers)
+// will define later.  We need to do this, and do it before cmath is included,
+// because the standard library may have constexpr math functions.  In the
+// absence of a prior __device__ decl, those constexpr functions may become
+// implicitly host+device.  host+device functions can't be overloaded, so that
+// would preclude the use of our own __device__ overloads for these functions.
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__                                                             \
+  static __inline__ __attribute__((always_inline)) __attribute__((device))
+
+__DEVICE__ double abs(double);
+__DEVICE__ float abs(float);
+__DEVICE__ int abs(int);
+__DEVICE__ long abs(long);
+__DEVICE__ long long abs(long long);
+__DEVICE__ double acos(double);
+__DEVICE__ float acos(float);
+__DEVICE__ double acosh(double);
+__DEVICE__ float acosh(float);
+__DEVICE__ double asin(double);
+__DEVICE__ float asin(float);
+__DEVICE__ double asinh(double);
+__DEVICE__ float asinh(float);
+__DEVICE__ double atan2(double, double);
+__DEVICE__ float atan2(float, float);
+__DEVICE__ double atan(double);
+__DEVICE__ float atan(float);
+__DEVICE__ double atanh(double);
+__DEVICE__ float atanh(float);
+__DEVICE__ double cbrt(double);
+__DEVICE__ float cbrt(float);
+__DEVICE__ double ceil(double);
+__DEVICE__ float ceil(float);
+__DEVICE__ double copysign(double, double);
+__DEVICE__ float copysign(float, float);
+__DEVICE__ double cos(double);
+__DEVICE__ float cos(float);
+__DEVICE__ double cosh(double);
+__DEVICE__ float cosh(float);
+__DEVICE__ double erfc(double);
+__DEVICE__ float erfc(float);
+__DEVICE__ double erf(double);
+__DEVICE__ float erf(float);
+__DEVICE__ double exp2(double);
+__DEVICE__ float exp2(float);
+__DEVICE__ double exp(double);
+__DEVICE__ float exp(float);
+__DEVICE__ double expm1(double);
+__DEVICE__ float expm1(float);
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
+__DEVICE__ double fdim(double, double);
+__DEVICE__ float fdim(float, float);
+__DEVICE__ double floor(double);
+__DEVICE__ float floor(float);
+__DEVICE__ double fma(double, double, double);
+__DEVICE__ float fma(float, float, float);
+__DEVICE__ double fmax(double, double);
+__DEVICE__ float fmax(float, float);
+__DEVICE__ double fmin(double, double);
+__DEVICE__ float fmin(float, float);
+__DEVICE__ double fmod(double, double);
+__DEVICE__ float fmod(float, float);
+__DEVICE__ int fpclassify(double);
+__DEVICE__ int fpclassify(float);
+__DEVICE__ double frexp(double, int *);
+__DEVICE__ float frexp(float, int *);
+__DEVICE__ double hypot(double, double);
+__DEVICE__ float hypot(float, float);
+__DEVICE__ int ilogb(double);
+__DEVICE__ int ilogb(float);
+__DEVICE__ bool isfinite(double);
+__DEVICE__ bool isfinite(float);
+__DEVICE__ bool isgreater(double, double);
+__DEVICE__ bool isgreaterequal(double, double);
+__DEVICE__ bool isgreaterequal(float, float);
+__DEVICE__ bool isgreater(float, float);
+__DEVICE__ bool isinf(double);
+__DEVICE__ bool isinf(float);
+__DEVICE__ bool isless(double, double);
+__DEVICE__ bool islessequal(double, double);
+__DEVICE__ bool islessequal(float, float);
+__DEVICE__ bool isless(float, float);
+__DEVICE__ bool islessgreater(double, double);
+__DEVICE__ bool islessgreater(float, float);
+__DEVICE__ bool isnan(double);
+__DEVICE__ bool isnan(float);
+__DEVICE__ bool isnormal(double);
+__DEVICE__ bool isnormal(float);
+__DEVICE__ bool isunordered(double, double);
+__DEVICE__ bool isunordered(float, float);
+__DEVICE__ long labs(long);
+__DEVICE__ double ldexp(double, int);
+__DEVICE__ float ldexp(float, int);
+__DEVICE__ double lgamma(double);
+__DEVICE__ float lgamma(float);
+__DEVICE__ long long llabs(long long);
+__DEVICE__ long long llrint(double);
+__DEVICE__ long long llrint(float);
+__DEVICE__ double log10(double);
+__DEVICE__ float log10(float);
+__DEVICE__ double log1p(double);
+__DEVICE__ float log1p(float);
+__DEVICE__ double log2(double);
+__DEVICE__ float log2(float);
+__DEVICE__ double logb(double);
+__DEVICE__ float logb(float);
+__DEVICE__ double log(double);
+__DEVICE__ float log(float);
+__DEVICE__ long lrint(double);
+__DEVICE__ long lrint(float);
+__DEVICE__ long lround(double);
+__DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
+__DEVICE__ double modf(double, double *);
+__DEVICE__ float modf(float, float *);
+__DEVICE__ double nan(const char *);
+__DEVICE__ float nanf(const char *);
+__DEVICE__ double nearbyint(double);
+__DEVICE__ float nearbyint(float);
+__DEVICE__ double nextafter(double, double);
+__DEVICE__ float nextafter(float, float);
+__DEVICE__ double pow(double, double);
+__DEVICE__ double pow(double, int);
+__DEVICE__ float pow(float, float);
+__DEVICE__ float pow(float, int);
+__DEVICE__ double remainder(double, double);
+__DEVICE__ float remainder(float, float);
+__DEVICE__ double remquo(double, double, int *);
+__DEVICE__ float remquo(float, float, int *);
+__DEVICE__ double rint(double);
+__DEVICE__ float rint(float);
+__DEVICE__ double round(double);
+__DEVICE__ float round(float);
+__DEVICE__ double scalbln(double, long);
+__DEVICE__ float scalbln(float, long);
+__DEVICE__ double scalbn(double, int);
+__DEVICE__ float scalbn(float, int);
+__DEVICE__ bool signbit(double);
+__DEVICE__ bool signbit(float);
+__DEVICE__ double sin(double);
+__DEVICE__ float sin(float);
+__DEVICE__ double sinh(double);
+__DEVICE__ float sinh(float);
+__DEVICE__ double sqrt(double);
+__DEVICE__ float sqrt(float);
+__DEVICE__ double tan(double);
+__DEVICE__ float tan(float);
+__DEVICE__ double tanh(double);
+__DEVICE__ float tanh(float);
+__DEVICE__ double tgamma(double);
+__DEVICE__ float tgamma(float);
+__DEVICE__ double trunc(double);
+__DEVICE__ float trunc(float);
+
+// Notably missing above is nexttoward, which we don't define on
+// the device side because libdevice doesn't give us an implementation, and we
+// don't want to be in the business of writing one ourselves.
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+using ::abs;
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::labs;
+using ::ldexp;
+using ::lgamma;
+using ::llabs;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::llround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#pragma pop_macro("__DEVICE__")
+
+#endif

+ 381 - 0
demo/include/__clang_cuda_runtime_wrapper.h

@@ -0,0 +1,381 @@
+/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ * CUDA headers are implemented in a way that currently makes it
+ * impossible for user code to #include them directly when compiling
+ * with Clang. They present a different view of CUDA-supplied functions
+ * depending on where in NVCC's compilation pipeline the headers are
+ * included. Neither of these modes provides function definitions with
+ * correct attributes, so we use the preprocessor to force the headers
+ * into a form that Clang can use.
+ *
+ * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
+ * this file during every CUDA compilation.
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
+#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
+
+#if defined(__CUDA__) && defined(__clang__)
+
+// Include some forward declares that must come before cmath.
+#include <__clang_cuda_math_forward_declares.h>
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific details of
+// CUDA-7.x headers and are not expected to work with any other
+// version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "__clang_cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+#define __COMMON_FUNCTIONS_H__
+
+#undef __CUDACC__
+#if CUDA_VERSION < 9000
+#define __CUDABE__
+#else
+#define __CUDA_LIBDEVICE__
+#endif
+// Disables definitions of device-side runtime support stubs in
+// cuda_device_runtime_api.h
+#include "driver_types.h"
+#include "host_config.h"
+#include "host_defines.h"
+
+#undef __CUDABE__
+#undef __CUDA_LIBDEVICE__
+#define __CUDACC__
+#include "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+
+// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
+// not have at the moment. Emulate them with a builtin memcpy/memset.
+#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
+#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
+
+#if CUDA_VERSION < 9000
+#include "crt/device_runtime.h"
+#endif
+#include "crt/host_runtime.h"
+// device_runtime.h defines __cxa_* macros that will conflict with
+// cxxabi.h.
+// FIXME: redefine these as __device__ functions.
+#undef __cxa_vec_ctor
+#undef __cxa_vec_cctor
+#undef __cxa_vec_dtor
+#undef __cxa_vec_new
+#undef __cxa_vec_new2
+#undef __cxa_vec_new3
+#undef __cxa_vec_delete2
+#undef __cxa_vec_delete
+#undef __cxa_vec_delete3
+#undef __cxa_pure_virtual
+
+// math_functions.hpp expects this host function to be defined on MacOS, but it
+// ends up not being there because of the games we play here.  Just define it
+// ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+  return std::signbit(x);
+}
+#endif
+
+// We need decls for functions in CUDA's libdevice with the __device__
+// attribute only. Alas, they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDACC_RTC__,
+// which produces the HD variant, and undef __host__, which gives us the
+// desired decls with the __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
+// Previous versions used to check whether they are defined or not.
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+
+#pragma push_macro("__float2half_rn")
+#if CUDA_VERSION >= 9000
+// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
+// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
+// device_functions.hpp out of the way.
+#define __float2half_rn  __float2half_rn_disabled
+#endif
+
+#include "device_functions.hpp"
+#pragma pop_macro("__float2half_rn")
+
+
+// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__ 1
+#endif
+#include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// Pull in host-only functions that are only available when neither
+// __CUDACC__ nor __CUDABE__ are defined.
+#undef __MATH_FUNCTIONS_HPP__
+#undef __CUDABE__
+#include "math_functions.hpp"
+// Alas, additional overloads for these functions are hard to get to.
+// Considering that we only need these overloads for a few functions,
+// we can provide them here.
+static inline float rsqrt(float __a) { return rsqrtf(__a); }
+static inline float rcbrt(float __a) { return rcbrtf(__a); }
+static inline float sinpi(float __a) { return sinpif(__a); }
+static inline float cospi(float __a) { return cospif(__a); }
+static inline void sincospi(float __a, float *__b, float *__c) {
+  return sincospif(__a, __b, __c);
+}
+static inline float erfcinv(float __a) { return erfcinvf(__a); }
+static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
+static inline float normcdf(float __a) { return normcdff(__a); }
+static inline float erfcx(float __a) { return erfcxf(__a); }
+
+// For some reason the single-argument variant is not always declared by
+// the CUDA headers. Alas, device_functions.hpp, included below, needs it.
+static inline __device__ void __brkpt(int __c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of things get declared/defined with the __host__ attribute, which
+// we don't want, so we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_atomic_functions.hpp"
+#include "device_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+#include "sm_32_atomic_functions.hpp"
+
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions.  In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
+
+#if CUDA_VERSION >= 8000
+#include "sm_60_atomic_functions.hpp"
+#include "sm_61_intrinsics.hpp"
+#endif
+
+#undef __MATH_FUNCTIONS_HPP__
+
+// math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
+// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
+// math_functions.hpp's ::signbit.  It's guarded by #undef signbit, but that's
+// conditional on __GNUC__.  :)
+#pragma push_macro("signbit")
+#pragma push_macro("__GNUC__")
+#undef __GNUC__
+#define signbit __ignored_cuda_signbit
+
+// CUDA-9 omits device-side definitions of some math functions if it sees
+// the include guard from libstdc++'s math.h wrapper. We have to undo the header
+// guard temporarily to get the definitions we need.
+#pragma push_macro("_GLIBCXX_MATH_H")
+#pragma push_macro("_LIBCPP_VERSION")
+#if CUDA_VERSION >= 9000
+#undef _GLIBCXX_MATH_H
+// We also need to undo another guard that checks for libc++ 3.8+
+#ifdef _LIBCPP_VERSION
+#define _LIBCPP_VERSION 3700
+#endif
+#endif
+
+#include "math_functions.hpp"
+#pragma pop_macro("_GLIBCXX_MATH_H")
+#pragma pop_macro("_LIBCPP_VERSION")
+#pragma pop_macro("__GNUC__")
+#pragma pop_macro("signbit")
+
+#pragma pop_macro("__host__")
+
+#include "texture_indirect_functions.h"
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+
+extern "C" {
+// Device-side CUDA system calls.
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
+// We need these declarations and wrappers for device-side
+// malloc/free/printf calls to work without relying on
+// -fcuda-disable-target-call-checks option.
+__device__ int vprintf(const char *, const char *);
+__device__ void free(void *) __attribute((nothrow));
+__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
+__device__ void __assertfail(const char *__message, const char *__file,
+                             unsigned __line, const char *__function,
+                             size_t __charSize) __attribute__((noreturn));
+
+// In order for the standard assert() macro on Linux to work, we need to
+// provide a device-side __assert_fail().
+__device__ static inline void __assert_fail(const char *__message,
+                                            const char *__file, unsigned __line,
+                                            const char *__function) {
+  __assertfail(__message, __file, __line, __function, sizeof(char));
+}
+
+// Clang will convert printf into vprintf, but we still need
+// device-side declaration for it.
+__device__ int printf(const char *, ...);
+} // extern "C"
+
+// We also need device-side std::malloc and std::free.
+namespace std {
+__device__ static inline void free(void *__ptr) { ::free(__ptr); }
+__device__ static inline void *malloc(size_t __size) {
+  return ::malloc(__size);
+}
+} // namespace std
+
+// Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
+// come after we've pulled in the definition of uint3 and dim3.
+
+__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+#include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
+
+// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
+// mode, giving them their "proper" types of dim3 and uint3.  This is
+// incompatible with the types we give in __clang_cuda_builtin_vars.h.  As a
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
+#pragma push_macro("dim3")
+#pragma push_macro("uint3")
+#define dim3 __cuda_builtin_blockDim_t
+#define uint3 __cuda_builtin_threadIdx_t
+#include "curand_mtgp32_kernel.h"
+#pragma pop_macro("dim3")
+#pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#endif // __CUDA__
+#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__

+ 43 - 0
demo/include/__stddef_max_align_t.h

@@ -0,0 +1,43 @@
+/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+ *
+ * Copyright (c) 2014 Chandler Carruth
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_MAX_ALIGN_T_DEFINED
+#define __CLANG_MAX_ALIGN_T_DEFINED
+
+#if defined(_MSC_VER)
+typedef double max_align_t;
+#elif defined(__APPLE__)
+typedef long double max_align_t;
+#else
+// Define 'max_align_t' to match the GCC definition.
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#endif
+
+#endif

+ 151 - 0
demo/include/__wmmintrin_aes.h

@@ -0,0 +1,151 @@
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_AES_H
+#define _WMMINTRIN_AES_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+
+/// \brief Performs a single round of AES encryption, transforming the state
+///    value from the first source operand using a 128-bit round key value
+///    contained in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES encryption, transforming the state
+///    value from the first source operand using a 128-bit round key value
+///    contained in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
+}
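+
+/* Usage sketch (illustrative, not part of the upstream header): a full
+ * AES-128 block encryption XORs the state with round key 0, applies
+ * _mm_aesenc_si128 with round keys 1..9, and finishes with
+ * _mm_aesenclast_si128 and round key 10 (rk[] holds the expanded key
+ * schedule):
+ *
+ *   __m128i s = _mm_xor_si128(block, rk[0]);
+ *   for (int i = 1; i < 10; ++i)
+ *     s = _mm_aesenc_si128(s, rk[i]);
+ *   s = _mm_aesenclast_si128(s, rk[10]);
+ */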
+
+/// \brief Performs a single round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdec_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdeclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+///    contained in the source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesimc_si128(__m128i __V)
+{
+  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
+}
+
+/// \brief Generates a round key for AES encryption, operating on 128-bit data
+///    specified in the first source operand and using an 8-bit round constant
+///    specified by the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
+///
+/// \param C
+///    A 128-bit integer vector that is used to generate the AES encryption key.
+/// \param R
+///    An 8-bit round constant used to generate the AES encryption key.
+/// \returns A 128-bit round key for AES encryption.
+#define _mm_aeskeygenassist_si128(C, R) \
+  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif  /* _WMMINTRIN_AES_H */

+ 57 - 0
demo/include/__wmmintrin_pclmul.h

@@ -0,0 +1,57 @@
+/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_PCLMUL_H
+#define _WMMINTRIN_PCLMUL_H
+
+/// \brief Multiplies two 64-bit integer values, which are selected from source
+///    operands using the immediate-value operand. The multiplication is a
+///    carry-less multiplication, and the 128-bit integer product is stored in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __Y
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __I
+///    An immediate value specifying which 64-bit values to select from the
+///    operands. Bit 0 is used to select a value from operand \a __X, and bit
+///    4 is used to select a value from operand \a __Y: \n
+///    Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+///    Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+///    Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+///    multiplication of the selected 64-bit values.
+#define _mm_clmulepi64_si128(__X, __Y, __I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
+                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+
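+/* Usage sketch (illustrative, not part of the upstream header): an immediate
+ * of 0x00 multiplies the low 64-bit halves of both operands, while 0x11
+ * multiplies the high halves:
+ *
+ *   __m128i prod = _mm_clmulepi64_si128(a, b, 0x00);
+ */
+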
+#endif /* _WMMINTRIN_PCLMUL_H */

+ 86 - 0
demo/include/adxintrin.h

@@ -0,0 +1,86 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are also available if __ADX__ undefined */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
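+
+/* Usage sketch (illustrative, not part of the upstream header): the returned
+ * carry can be chained into the next call to add multi-word integers, e.g.
+ * two 64-bit values held as 32-bit limbs a_hi:a_lo and b_hi:b_lo:
+ *
+ *   unsigned int lo, hi;
+ *   unsigned char c = _addcarry_u32(0, a_lo, b_lo, &lo);
+ *   (void)_addcarry_u32(c, a_hi, b_hi, &hi);
+ */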
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */

File diff is too large to display
+ 16736 - 0
demo/include/altivec.h


+ 193 - 0
demo/include/ammintrin.h

@@ -0,0 +1,193 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a x are extracted. If the length is zero but the
+///    index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+///    extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
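+/* Usage sketch (illustrative, not part of the upstream header): extract the
+ * 8-bit field that starts at bit 16 of the low 64 bits of x:
+ *
+ *   __m128i field = _mm_extracti_si64(x, 8, 16);
+ */
+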
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by
+///    \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Specifies the index of the least significant bit at [13:8] and the
+///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+///    length is interpreted as 64. If the sum of the index and length is
+///    greater than 64, the result is undefined. If the length and index are
+///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a y into the lower 64 bits of the destination integer vector \a x at
+///    the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length \a len and by the index \a idx specifying the
+///    least significant bit.
+/// \param y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a y of length \a len.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a x with the specified bitfields replaced by the
+///    lower bits of source operand \a y. The upper 64 bits of the return value
+///    are undefined.
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a __y into the lower 64 bits of the destination integer vector \a __x
+///    at the index and of the length specified by \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param __x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
+///    specified by operand \a __y.
+/// \param __y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a __y with length
+///    specified by bits [69:64]. These are inserted into the destination at the
+///    index specified by bits [77:72]; all other bits are ignored. If bits
+///    [69:64] are zero, the length is interpreted as 64. If the sum of the
+///    index and length is greater than 64, the result is undefined. If the
+///    length and index are both zero, bits [63:0] of parameter \a __y are
+///    inserted into parameter \a __x. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a __x with the specified bitfields replaced by the
+///    lower bits of source operand \a __y. The upper 64 bits of the return
+///    value are undefined.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    The 64-bit double-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    The 32-bit single-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */

+ 49 - 0
demo/include/arm64intr.h

@@ -0,0 +1,49 @@
+/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <arm64intr.h>
+#else
+
+#ifndef __ARM64INTR_H
+#define __ARM64INTR_H
+
+typedef enum
+{
+  _ARM64_BARRIER_SY    = 0xF,
+  _ARM64_BARRIER_ST    = 0xE,
+  _ARM64_BARRIER_LD    = 0xD,
+  _ARM64_BARRIER_ISH   = 0xB,
+  _ARM64_BARRIER_ISHST = 0xA,
+  _ARM64_BARRIER_ISHLD = 0x9,
+  _ARM64_BARRIER_NSH   = 0x7,
+  _ARM64_BARRIER_NSHST = 0x6,
+  _ARM64_BARRIER_NSHLD = 0x5,
+  _ARM64_BARRIER_OSH   = 0x3,
+  _ARM64_BARRIER_OSHST = 0x2,
+  _ARM64_BARRIER_OSHLD = 0x1
+} _ARM64INTR_BARRIER_TYPE;
+
+#endif /* __ARM64INTR_H */
+#endif /* _MSC_VER */

+ 626 - 0
demo/include/arm_acle.h

@@ -0,0 +1,626 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
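+/* __swp atomically stores __x to *__p and returns the previous value,
+   retrying the LDREX/STREX sequence until the exclusive store succeeds. */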
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__swp(uint32_t __x, volatile uint32_t *__p) {
+  uint32_t v;
+  do
+    v = __builtin_arm_ldrex(__p);
+  while (__builtin_arm_strex(__x, __p));
+  return v;
+}
+
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
+
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__ror(uint32_t __x, uint32_t __y) {
+  __y %= 32;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (32 - __y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rorll(uint64_t __x, uint32_t __y) {
+  __y %= 64;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (64 - __y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rorl(unsigned long __x, uint32_t __y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(__x, __y);
+#else
+  return __rorll(__x, __y);
+#endif
+}
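+
+/*
+ * Illustrative example (operand values are assumptions): the rotate count is
+ * reduced modulo the operand width, so rotating by 0 or by the full width
+ * returns the input unchanged:
+ *
+ *   uint32_t r1 = __ror(0x12345678U, 8);    // 0x78123456
+ *   uint32_t r2 = __ror(0x12345678U, 32);   // 0x12345678
+ */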
+
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clz(uint32_t __t) {
+  return __builtin_clz(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__clzl(unsigned long __t) {
+  return __builtin_clzl(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__clzll(uint64_t __t) {
+  return __builtin_clzll(__t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev(uint32_t __t) {
+  return __builtin_bswap32(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__revl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(__t);
+#else
+  return __builtin_bswap64(__t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__revll(uint64_t __t) {
+  return __builtin_bswap64(__t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev16(uint32_t __t) {
+  return __ror(__rev(__t), 16);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rev16ll(uint64_t __t) {
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rev16l(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(__t);
+#else
+    return __rev16ll(__t);
+#endif
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+__revsh(int16_t __t) {
+  return __builtin_bswap16(__t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rbit(uint32_t __t) {
+  return __builtin_arm_rbit(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rbitll(uint64_t __t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
+         __builtin_arm_rbit(__t >> 32);
+#else
+  return __builtin_arm_rbit64(__t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rbitl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(__t);
+#else
+  return __rbitll(__t);
+#endif
+}
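+
+/*
+ * Worked example (illustrative values, not from the ACLE text):
+ *
+ *   __rev(0x12345678U)   == 0x78563412   // whole-word byte reverse
+ *   __rev16(0x12345678U) == 0x34127856   // byte reverse within each halfword
+ *   __rbit(0x00000001U)  == 0x80000000   // bit reverse
+ */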
+
+/*
+ * 9.3 16-bit multiplications
+ */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwt(__a, __b);
+}
+#endif
+
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_FEATURE_SAT
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qadd(int32_t __t, int32_t __v) {
+  return __builtin_arm_qadd(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qsub(int32_t __t, int32_t __v) {
+  return __builtin_arm_qsub(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t __t) {
+  return __builtin_arm_qadd(__t, __t);
+}
+#endif
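+
+/*
+ * Illustrative example (operand values are assumptions): the Q-flag
+ * arithmetic clamps to the signed 32-bit range instead of wrapping:
+ *
+ *   __qadd(INT32_MAX, 1)   // saturates to INT32_MAX
+ *   __qsub(INT32_MIN, 1)   // saturates to INT32_MIN
+ *   __qdbl(0x60000000)     // 2 * 0x60000000 saturates to INT32_MAX
+ */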
+
+/* 9.4.3 Accumulating multiplications */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawt(__a, __b, __c);
+}
+#endif
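+
+/*
+ * Illustrative example (packed operand values are assumptions): the b/t
+ * suffixes select the bottom/top 16-bit halves of each operand, so with
+ * __a = 0x00030002 and __b = 0x00050004,
+ *
+ *   __smlabb(__a, __b, 10) == 2*4 + 10 == 18
+ *   __smlatt(__a, __b, 10) == 3*5 + 10 == 25
+ */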
+
+
+/* 9.5.4 Parallel 16-bit saturation */
+#if __ARM_FEATURE_SIMD32
+#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
+#define __usat16(x, y) __builtin_arm_usat16(x, y)
+#endif
+
+/* 9.5.5 Packing and unpacking */
+#if __ARM_FEATURE_SIMD32
+typedef int32_t int8x4_t;
+typedef int32_t int16x2_t;
+typedef uint32_t uint8x4_t;
+typedef uint32_t uint16x2_t;
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_sxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtb16(int8x4_t __a) {
+  return __builtin_arm_sxtb16(__a);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_uxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtb16(int8x4_t __a) {
+  return __builtin_arm_uxtb16(__a);
+}
+#endif
+
+/* 9.5.6 Parallel selection */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__sel(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_sel(__a, __b);
+}
+#endif
+
+/* 9.5.7 Parallel 8-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__sadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_sadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__ssub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_ssub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__usub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usub8(__a, __b);
+}
+#endif
+
+/* 9.5.8 Sum of 8-bit absolute differences */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usad8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usad8(__a, __b);
+}
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
+  return __builtin_arm_usada8(__a, __b, __c);
+}
+#endif
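+
+/*
+ * Illustrative example (lane values are assumptions): __usad8 sums the
+ * absolute differences of the four byte lanes; __usada8 also accumulates:
+ *
+ *   uint8x4_t a = 0x0A050301;           // lanes {1, 3, 5, 10}
+ *   uint8x4_t b = 0x01020304;           // lanes {4, 3, 2, 1}
+ *   uint32_t  s = __usad8(a, b);        // 3 + 0 + 3 + 9 == 15
+ *   uint32_t  t = __usada8(a, b, 100);  // 115
+ */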
+
+/* 9.5.9 Parallel 16-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usub16(__a, __b);
+}
+#endif
+
+/* 9.5.10 Parallel 16-bit multiplications */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlad(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smladx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlald(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlaldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsd(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsdx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsld(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuad(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuad(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuadx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuadx(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusd(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusd(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusdx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusdx(__a, __b);
+}
+#endif
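+
+/*
+ * Illustrative example (packed lane values are assumptions): __smlad forms a
+ * two-element dot product and adds the accumulator, the usual building block
+ * for fixed-point filters:
+ *
+ *   int16x2_t x = (int16x2_t)0x00020001;   // lanes {1, 2}
+ *   int16x2_t c = (int16x2_t)0x00040003;   // lanes {3, 4}
+ *   int32_t acc = __smlad(x, c, 0);        // 1*3 + 2*4 == 11
+ */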
+
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32b(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32b(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32h(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32h(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32w(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32w(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32d(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32d(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cb(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32cb(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32ch(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32ch(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cw(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32cw(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cd(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32cd(__a, __b);
+}
+#endif
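+
+/*
+ * Illustrative sketch (the helper below is an assumption, not ACLE text):
+ * a byte-at-a-time CRC32-C over a buffer, using the customary all-ones
+ * initial value and final inversion:
+ *
+ *   static uint32_t __crc32c_bytes(const uint8_t *p, unsigned n) {
+ *     uint32_t crc = 0xFFFFFFFFU;
+ *     while (n--)
+ *       crc = __crc32cb(crc, *p++);
+ *     return ~crc;
+ *   }
+ */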
+
+/* 10.1 Special register intrinsics */
+#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
+#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
+#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
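+
+/*
+ * Illustrative example (the register name is an assumption and is target
+ * specific; TPIDR_EL0 is the AArch64 user thread-pointer register):
+ *
+ *   uint64_t __tp = __arm_rsr64("TPIDR_EL0");
+ *   __arm_wsr64("TPIDR_EL0", __tp);
+ */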
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */

File diff too large to display
+ 72599 - 0
demo/include/arm_neon.h


+ 45 - 0
demo/include/armintr.h

@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+  _ARM_BARRIER_SY    = 0xF,
+  _ARM_BARRIER_ST    = 0xE,
+  _ARM_BARRIER_ISH   = 0xB,
+  _ARM_BARRIER_ISHST = 0xA,
+  _ARM_BARRIER_NSH   = 0x7,
+  _ARM_BARRIER_NSHST = 0x6,
+  _ARM_BARRIER_OSH   = 0x3,
+  _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
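+
+/*
+ * Illustrative example (assumes the MSVC-mode __dmb intrinsic from
+ * <intrin.h> is available): these enumerators are passed as the barrier
+ * type, e.g. __dmb(_ARM_BARRIER_ISH) for an inner-shareable data barrier.
+ */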
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */

File diff too large to display
+ 1308 - 0
demo/include/avx2intrin.h


+ 97 - 0
demo/include/avx512bitalgintrin.h

@@ -0,0 +1,97 @@
+/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BITALGINTRIN_H
+#define __AVX512BITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi16(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
+              (__v32hi) _mm512_popcnt_epi16(__B),
+              (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
+{
+  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
+              __U,
+              __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi8(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
+              (__v64qi) _mm512_popcnt_epi8(__B),
+              (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
+{
+  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
+              __U,
+              __B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
+              (__v64qi) __B,
+              __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
+{
+  return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
+              __A,
+              __B);
+}
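+
+/*
+ * Illustrative usage sketch (variable names are assumptions; requires an
+ * AVX512BITALG target):
+ *
+ *   __m512i v   = _mm512_set1_epi16(0x0F0F);                 // 8 set bits per lane
+ *   __m512i all = _mm512_popcnt_epi16(v);                    // every lane == 8
+ *   __m512i low = _mm512_maskz_popcnt_epi16(0x0000FFFF, v);  // low 16 lanes only
+ */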
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

File diff too large to display
+ 2140 - 0
demo/include/avx512bwintrin.h


+ 145 - 0
demo/include/avx512cdintrin.h

@@ -0,0 +1,145 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) _mm512_set1_epi64((long long) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) _mm512_set1_epi32((int) __A);
+}
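+
+/*
+ * Illustrative example (lane values are assumptions): _mm512_lzcnt_epi32
+ * counts leading zero bits per lane; _mm512_conflict_epi32 reports, per
+ * lane, a bitmask of the earlier lanes holding the same value:
+ *
+ *   __m512i v  = _mm512_set1_epi32(1);
+ *   __m512i lz = _mm512_lzcnt_epi32(v);     // every lane == 31
+ *   __m512i cf = _mm512_conflict_epi32(v);  // lane i == (1 << i) - 1
+ */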
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

File diff too large to display
+ 1331 - 0
demo/include/avx512dqintrin.h


+ 285 - 0
demo/include/avx512erintrin.h

@@ -0,0 +1,285 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm512_exp2a23_pd(A) \
+  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                     (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (int)(R)); })
+
+#define _mm512_exp2a23_ps(A) \
+  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)(__m128)(S), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)(__m128d)(S), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)(__m128)(S), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)(__m128d)(S), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
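+
+/*
+ * Illustrative usage sketch (assumes an AVX512ER-capable target): the *28
+ * forms return reciprocal estimates accurate to roughly 28 bits, so no
+ * Newton-Raphson refinement step is normally needed:
+ *
+ *   __m512 x  = _mm512_set1_ps(4.0f);
+ *   __m512 rs = _mm512_rsqrt28_ps(x);   // ~0.5f in every lane
+ *   __m512 rc = _mm512_rcp28_ps(x);     // ~0.25f in every lane
+ */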
+
+#endif // __AVX512ERINTRIN_H

File diff too large to display
+ 10233 - 0
demo/include/avx512fintrin.h


+ 92 - 0
demo/include/avx512ifmaintrin.h

@@ -0,0 +1,92 @@
+/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAINTRIN_H
+#define __IFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
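+
+/*
+ * Illustrative example (operand values are assumptions): the first operand
+ * is the accumulator; the low 52 bits of the other two operands are
+ * multiplied and the low (lo) or high (hi) 52 bits of the 104-bit product
+ * are added to it:
+ *
+ *   __m512i x   = _mm512_set1_epi64(3);
+ *   __m512i y   = _mm512_set1_epi64(5);
+ *   __m512i acc = _mm512_set1_epi64(1);
+ *   __m512i lo  = _mm512_madd52lo_epu64(acc, x, y);  // every lane == 1 + 15 == 16
+ */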
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 149 - 0
demo/include/avx512ifmavlintrin.h

@@ -0,0 +1,149 @@
+/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAVLINTRIN_H
+#define __IFMAVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 111 - 0
demo/include/avx512pfintrin.h

@@ -0,0 +1,111 @@
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16) -1, \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
+                              (__v16si)(__m512i)(index), (int *)(addr), \
+                              (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
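+
+/*
+ * Illustrative sketch (the table, index vector and hint are assumptions):
+ * prefetching eight gather targets into the cache ahead of a real gather:
+ *
+ *   double table[1024];
+ *   __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+ *   _mm512_prefetch_i32gather_pd(idx, table, 8, _MM_HINT_T0);
+ */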
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 391 - 0
demo/include/avx512vbmi2intrin.h

@@ -0,0 +1,391 @@
+/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VBMI2INTRIN_H
+#define __AVX512VBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+              (__v32hi) _mm512_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+              (__v64qi) _mm512_setzero_qi(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
+{
+  __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
+{
+  __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+              (__v32hi) _mm512_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+              (__v64qi) _mm512_setzero_qi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+              (__v32hi) _mm512_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+              (__v64qi) _mm512_setzero_qi(),
+              __U);
+}
+
+#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
+                                          (__v8di)(B), \
+                                          (int)(I), \
+                                          (__v8di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shldi_epi64(U, A, B, I) \
+  _mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shldi_epi64(A, B, I) \
+  _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
+                                          (__v16si)(B), \
+                                          (int)(I), \
+                                          (__v16si)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shldi_epi32(U, A, B, I) \
+  _mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shldi_epi32(A, B, I) \
+  _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+
+#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
+                                          (__v32hi)(B), \
+                                          (int)(I), \
+                                          (__v32hi)(S), \
+                                          (__mmask32)(U)); })
+
+#define _mm512_maskz_shldi_epi16(U, A, B, I) \
+  _mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shldi_epi16(A, B, I) \
+  _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+
+#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
+                                          (__v8di)(B), \
+                                          (int)(I), \
+                                          (__v8di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
+  _mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shrdi_epi64(A, B, I) \
+  _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
+                                          (__v16si)(B), \
+                                          (int)(I), \
+                                          (__v16si)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
+  _mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shrdi_epi32(A, B, I) \
+  _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+
+#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
+                                          (__v32hi)(B), \
+                                          (int)(I), \
+                                          (__v32hi)(S), \
+                                          (__mmask32)(U)); })
+
+#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
+  _mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shrdi_epi16(A, B, I) \
+  _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
+

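As a usage sketch for the VBMI2 compress-store intrinsic defined above, the helper below packs the selected bytes of a vector into contiguous memory. The function name and buffer handling are illustrative assumptions; -mavx512vbmi2 is required:

    #include <immintrin.h>

    /* Hypothetical helper: store only the bytes of v selected by keep, in order,
     * at dst, and return how many bytes were written. */
    static inline int pack_selected_bytes(unsigned char *dst, __m512i v, __mmask64 keep)
    {
      _mm512_mask_compressstoreu_epi8(dst, keep, v);  /* VPCOMPRESSB to memory */
      return __builtin_popcountll(keep);              /* one byte per set mask bit */
    }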
+ 137 - 0
demo/include/avx512vbmiintrin.h

@@ -0,0 +1,137 @@
+/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIINTRIN_H
+#define __VBMIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+         __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+              (__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+               /* idx */ ,
+               (__v64qi) __A,
+               (__v64qi) __B,
+               (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_undefined_epi32 (),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_setzero_si512(),
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) __W,
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) __W,
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_setzero_si512 (),
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_undefined_epi32 (),
+                (__mmask64) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

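A minimal sketch of the full-width byte permute (_mm512_permutexvar_epi8, VPERMB) declared above, here used to reverse the 64 bytes of a register. The helper name and index pattern are illustrative; -mavx512vbmi (plus -mavx512f for the load) is assumed:

    #include <immintrin.h>

    /* Hypothetical example: output byte i is taken from input byte 63-i. */
    static inline __m512i reverse_bytes_512(__m512i v)
    {
      __attribute__((aligned(64))) unsigned char rev[64];
      for (int i = 0; i < 64; ++i)
        rev[i] = (unsigned char)(63 - i);
      __m512i idx = _mm512_load_si512((const void *)rev);
      return _mm512_permutexvar_epi8(idx, v);   /* idx selects source bytes */
    }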
+ 247 - 0
demo/include/avx512vbmivlintrin.h

@@ -0,0 +1,247 @@
+/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIVLINTRIN_H
+#define __VBMIVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+              (__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+         __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+              (__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+               /* idx */ ,
+               (__v16qi) __A,
+               (__v16qi) __B,
+               (__mmask16)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+               /* idx */ ,
+               (__v32qi) __A,
+               (__v32qi) __B,
+               (__mmask32)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_undefined_si128 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_setzero_si128 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) __W,
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_undefined_si256 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_setzero_si256 (),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) __W,
+                 (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi) __W,
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_setzero_si128 (),
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi) __W,
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_setzero_si256 (),
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_undefined_si256 (),
+                (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

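For the 128/256-bit VBMI variants above, a common use of the byte permute is an in-register lookup table. A hedged sketch, assuming -mavx512vbmi -mavx512vl, with an illustrative helper name:

    #include <immintrin.h>

    /* Hypothetical example: treat table32 as a 32-entry substitution table and
     * apply it to every byte of `bytes` with 256-bit VPERMB.  Only the low five
     * bits of each source byte are used at this width. */
    static inline __m256i substitute_bytes_256(__m256i bytes, __m256i table32)
    {
      return _mm256_permutexvar_epi8(bytes, table32);
    }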
+ 157 - 0
demo/include/avx512vlbitalgintrin.h

@@ -0,0 +1,157 @@
+/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBITALGINTRIN_H
+#define __AVX512VLBITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_popcnt_epi16(__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
+              (__v16hi) _mm256_popcnt_epi16(__B),
+              (__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
+{
+  return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
+              __U,
+              __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_popcnt_epi16(__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
+              (__v8hi) _mm128_popcnt_epi16(__B),
+              (__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
+{
+  return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
+              __U,
+              __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_popcnt_epi8(__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
+              (__v32qi) _mm256_popcnt_epi8(__B),
+              (__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
+{
+  return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
+              __U,
+              __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_popcnt_epi8(__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
+              (__v16qi) _mm128_popcnt_epi8(__B),
+              (__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
+{
+  return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
+              __U,
+              __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
+              (__v32qi) __B,
+              __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
+{
+  return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
+              __A,
+              __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
+              (__v16qi) __B,
+              __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
+{
+  return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
+              __A,
+              __B);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

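The BITALG population-count intrinsics above work per lane; combining them with an AVX2 byte-sum gives a whole-register popcount. A sketch under that assumption (helper name illustrative; -mavx512bitalg -mavx512vl and AVX2 assumed):

    #include <immintrin.h>

    /* Hypothetical helper: total number of set bits in a 256-bit value. */
    static inline unsigned popcount_256(__m256i v)
    {
      __m256i per_byte = _mm256_popcnt_epi8(v);                       /* 0..8 per byte */
      __m256i sums = _mm256_sad_epu8(per_byte, _mm256_setzero_si256());
      __attribute__((aligned(32))) long long part[4];                 /* four 64-bit partial sums */
      _mm256_store_si256((__m256i *)part, sums);
      return (unsigned)(part[0] + part[1] + part[2] + part[3]);
    }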
File diff is too large to display
+ 2781 - 0
demo/include/avx512vlbwintrin.h


+ 263 - 0
demo/include/avx512vlcdintrin.h

@@ -0,0 +1,263 @@
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{ 
+  return (__m128i) _mm_set1_epi64x((long long) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) _mm256_set1_epi64x((long long)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) _mm_set1_epi32((int)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) _mm256_set1_epi32((int)__A);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di)
+               _mm_setzero_di (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di)  _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLCDINTRIN_H */

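The conflict-detection intrinsics above report, per lane, which earlier lanes hold an equal value; this is typically used to find duplicate indices before a scatter or a gather-update loop. A minimal sketch, assuming -mavx512cd -mavx512vl, with an illustrative helper name:

    #include <immintrin.h>

    /* Hypothetical helper: mask of lanes whose 32-bit index repeats an
     * earlier (lower-numbered) lane. */
    static inline __mmask8 duplicate_index_lanes(__m256i idx)
    {
      __m256i conflicts = _mm256_conflict_epi32(idx);      /* VPCONFLICTD */
      return _mm256_test_epi32_mask(conflicts, conflicts); /* nonzero lane => duplicate */
    }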
File diff is too large to display
+ 1198 - 0
demo/include/avx512vldqintrin.h


File diff is too large to display
+ 8547 - 0
demo/include/avx512vlintrin.h


+ 748 - 0
demo/include/avx512vlvbmi2intrin.h

@@ -0,0 +1,748 @@
+/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVBMI2INTRIN_H
+#define __AVX512VLVBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
+
+static  __inline __m128i __DEFAULT_FN_ATTRS
+_mm128_setzero_hi(void) {
+  return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+              (__v8hi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+              (__v16qi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
+{
+  __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
+{
+  __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+              (__v8hi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+              (__v16qi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+              (__v8hi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+              (__v16qi) _mm128_setzero_hi(),
+              __U);
+}
+
+static  __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_hi(void) {
+  return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+              (__v16hi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+              (__v32qi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
+{
+  __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
+{
+  __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+              (__v16hi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+              (__v32qi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+              (__v16hi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+              (__v32qi) _mm256_setzero_hi(),
+              __U);
+}
+
+#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
+                                          (__v4di)(B), \
+                                          (int)(I), \
+                                          (__v4di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shldi_epi64(U, A, B, I) \
+  _mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shldi_epi64(A, B, I) \
+  _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
+                                          (__v2di)(B), \
+                                          (int)(I), \
+                                          (__v2di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shldi_epi64(U, A, B, I) \
+  _mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shldi_epi64(A, B, I) \
+  _mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
+                                          (__v8si)(B), \
+                                          (int)(I), \
+                                          (__v8si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shldi_epi32(U, A, B, I) \
+  _mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shldi_epi32(A, B, I) \
+  _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
+                                          (__v4si)(B), \
+                                          (int)(I), \
+                                          (__v4si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shldi_epi32(U, A, B, I) \
+  _mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shldi_epi32(A, B, I) \
+  _mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
+                                          (__v16hi)(B), \
+                                          (int)(I), \
+                                          (__v16hi)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm256_maskz_shldi_epi16(U, A, B, I) \
+  _mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shldi_epi16(A, B, I) \
+  _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
+                                          (__v8hi)(B), \
+                                          (int)(I), \
+                                          (__v8hi)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shldi_epi16(U, A, B, I) \
+  _mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shldi_epi16(A, B, I) \
+  _mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
+                                          (__v4di)(B), \
+                                          (int)(I), \
+                                          (__v4di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
+  _mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shrdi_epi64(A, B, I) \
+  _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
+                                          (__v2di)(B), \
+                                          (int)(I), \
+                                          (__v2di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
+  _mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shrdi_epi64(A, B, I) \
+  _mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
+                                          (__v8si)(B), \
+                                          (int)(I), \
+                                          (__v8si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
+  _mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shrdi_epi32(A, B, I) \
+  _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
+                                          (__v4si)(B), \
+                                          (int)(I), \
+                                          (__v4si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
+  _mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shrdi_epi32(A, B, I) \
+  _mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
+                                          (__v16hi)(B), \
+                                          (int)(I), \
+                                          (__v16hi)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
+  _mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shrdi_epi16(A, B, I) \
+  _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
+                                          (__v8hi)(B), \
+                                          (int)(I), \
+                                          (__v8hi)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
+  _mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shrdi_epi16(A, B, I) \
+  _mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              (__mmask8) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
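
The shrdi/shldi macros and the shldv/shrdv functions above are per-lane funnel shifts: each destination lane comes from a 128-bit concatenation of the corresponding lanes of the two sources. A minimal scalar sketch of one 64-bit lane of the right form, assuming the operand order described in the Intel intrinsics guide (the second source supplies the high half):

#include <stdint.h>

/* Scalar model of one 64-bit lane of the right funnel shift (VPSHRDQ).
 * Assumption: b is the high half and a the low half of the concatenation;
 * the result is the low 64 bits after shifting right by imm. The vector
 * intrinsics above apply this to every lane, optionally under a write mask
 * (mask) or with zeroing of unselected lanes (maskz). */
static uint64_t shrd64_lane(uint64_t a, uint64_t b, unsigned imm)
{
    unsigned __int128 cat = ((unsigned __int128)b << 64) | a;
    return (uint64_t)(cat >> (imm & 63));
}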

+ 254 - 0
demo/include/avx512vlvnniintrin.h

@@ -0,0 +1,254 @@
+/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVNNIINTRIN_H
+#define __AVX512VLVNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
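
The dpbusd/dpwssd family above computes per-lane dot products that accumulate into 32-bit lanes. A scalar sketch of one lane of the unsigned-byte-by-signed-byte form (dpbusd), for illustration only:

#include <stdint.h>

/* Scalar model of one 32-bit lane of VPDPBUSD: four unsigned bytes of A are
 * multiplied with the corresponding signed bytes of B, the four products are
 * summed, and the sum is added to the accumulator lane S. The dpwssd forms
 * use pairs of signed 16-bit elements instead; the dpbusds/dpwssds variants
 * additionally saturate the accumulation. */
static int32_t dpbusd_lane(int32_t s, const uint8_t a[4], const int8_t b[4])
{
    int32_t sum = 0;
    for (int i = 0; i < 4; ++i)
        sum += (int32_t)a[i] * (int32_t)b[i];
    return s + sum;
}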

+ 146 - 0
demo/include/avx512vnniintrin.h

@@ -0,0 +1,146 @@
+/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VNNIINTRIN_H
+#define __AVX512VNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 70 - 0
demo/include/avx512vpopcntdqintrin.h

@@ -0,0 +1,70 @@
+/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
+ *------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQINTRIN_H
+#define __AVX512VPOPCNTDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd"   \
+                                                            "q")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 99 - 0
demo/include/avx512vpopcntdqvlintrin.h

@@ -0,0 +1,99 @@
+/*===------------- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics
+ *------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQVLINTRIN_H
+#define __AVX512VPOPCNTDQVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
+  return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
+  return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectq_256(
+      (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
+  return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
+  return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
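
The mask and maskz wrappers above follow the usual AVX-512 pattern: lanes whose mask bit is set receive the population count, while the remaining lanes keep the pass-through value or are zeroed. A scalar sketch of that per-lane selection:

#include <stdint.h>

/* Scalar model of the per-lane select used by the mask/maskz popcount
 * wrappers: a set mask bit selects the popcount result, a clear bit keeps
 * w (mask form) or yields zero (maskz form). */
static uint32_t mask_popcnt32_lane(uint32_t w, int mask_bit, uint32_t a)
{
    return mask_bit ? (uint32_t)__builtin_popcount(a) : w;
}

static uint32_t maskz_popcnt32_lane(int mask_bit, uint32_t a)
{
    return mask_popcnt32_lane(0u, mask_bit, a);   /* maskz = mask with w == 0 */
}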

File diff is too large to display
+ 5162 - 0
demo/include/avxintrin.h


+ 95 - 0
demo/include/bmi2intrin.h

@@ -0,0 +1,95 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
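
A hypothetical usage sketch for the BMI2 deposit/extract pair defined above (building with -mbmi2 is assumed): _pext_u32 compresses the bits selected by a mask into the low bits, _pdep_u32 scatters them back, and the round trip recovers x & mask.

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    unsigned x    = 0xDEADBEEFu;
    unsigned mask = 0x0F0F0F0Fu;

    unsigned packed   = _pext_u32(x, mask);      /* gather the masked bits   */
    unsigned restored = _pdep_u32(packed, mask); /* scatter them back again  */

    printf("%08X -> %08X -> %08X\n", x, packed, restored);
    return restored == (x & mask) ? 0 : 1;       /* round trip keeps x & mask */
}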

+ 382 - 0
demo/include/bmiintrin.h

@@ -0,0 +1,382 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+
+#define _blsr_u32(a)      (__blsr_u32((a)))
+
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
+///    specify the index of the least significant bit. Bits [15:8] specify the
+///    number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns A 32-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+
+#define _blsr_u64(a)      (__blsr_u64((a)))
+
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
+///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
+///    the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///     in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns A 64-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
+#endif /* __BMIINTRIN_H */
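
A hypothetical usage sketch for the BMI intrinsics above (building with -mbmi is assumed): _bextr_u32 extracts a bit field by start index and length, and __blsr_u32 strips the lowest set bit, which gives a simple walk over the set bits of a word.

#include <stdio.h>
#include <x86intrin.h>

int main(void)
{
    unsigned v = 0x12345678u;
    unsigned field = _bextr_u32(v, 4, 8);   /* bits [11:4] of v -> 0x67  */

    unsigned bits = 0xB1u;                  /* binary 1011 0001          */
    int count = 0;
    while (bits) {
        bits = __blsr_u32(bits);            /* clear the lowest set bit  */
        ++count;                            /* ends at 4                 */
    }

    printf("field=%02X set bits=%d\n", field, count);
    return 0;
}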

+ 93 - 0
demo/include/cetintrin.h

@@ -0,0 +1,93 @@
+/*===---- cetintrin.h - CET intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CETINTRIN_H
+#define __CETINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("shstk")))
+
+static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
+  __builtin_ia32_incsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
+  __builtin_ia32_incsspq(__a);
+}
+#endif /* __x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
+  return __builtin_ia32_rdsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
+  return __builtin_ia32_rdsspq(__a);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
+  __builtin_ia32_saveprevssp();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
+  __builtin_ia32_rstorssp(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
+  __builtin_ia32_wrssd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
+  __builtin_ia32_wrssq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
+  __builtin_ia32_wrussd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
+  __builtin_ia32_wrussq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
+  __builtin_ia32_setssbsy();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
+  __builtin_ia32_clrssbsy(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CETINTRIN_H */
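
A hypothetical probe sketch for the shadow-stack reads above (building with -mshstk is assumed): RDSSP executes as a no-op when shadow stacks are inactive, so seeding the argument with 0 and getting 0 back is a common way to check whether CET shadow stacks are in use.

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
#ifdef __x86_64__
    /* Returns the shadow stack pointer, or the seed value (0) unchanged
     * when the instruction executes as a no-op because CET is inactive. */
    unsigned long long ssp = _rdsspq(0);
    if (ssp)
        printf("shadow stack pointer: %#llx\n", ssp);
    else
        printf("shadow stacks inactive\n");
#endif
    return 0;
}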

+ 41 - 0
demo/include/clflushoptintrin.h

@@ -0,0 +1,41 @@
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLFLUSHOPTINTRIN_H
+#define __CLFLUSHOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflushopt(void const * __m) {
+  __builtin_ia32_clflushopt(__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 52 - 0
demo/include/clwbintrin.h

@@ -0,0 +1,52 @@
+/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLWBINTRIN_H
+#define __CLWBINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clwb")))
+
+/// \brief Writes back to memory the cache line (if modified) that contains the
+/// linear address specified in \a __p from any level of the cache hierarchy in
+/// the cache coherence domain.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLWB </c> instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    written back.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clwb(void const *__p) {
+  __builtin_ia32_clwb(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
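
A hypothetical write-back sketch using _mm_clwb (building with -mclwb is assumed, as is the 64-byte cache-line size): every line covering the range is written back, then an SFENCE orders the write-backs, the usual pattern in persistent-memory code.

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

static void flush_range(const void *addr, size_t len)
{
    const char *p   = (const char *)((uintptr_t)addr & ~(uintptr_t)63);
    const char *end = (const char *)addr + len;
    for (; p < end; p += 64)
        _mm_clwb(p);    /* write back (but keep) each touched cache line */
    _mm_sfence();       /* order the write-backs                         */
}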

+ 50 - 0
demo/include/clzerointrin.h

@@ -0,0 +1,50 @@
+/*===----------------------- clzerointrin.h - CLZERO ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86INTRIN_H
+#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _CLZEROINTRIN_H
+#define _CLZEROINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("clzero")))
+
+/// \brief Loads the cache line address and zeros out the cache line.
+///
+/// \headerfile <clzerointrin.h>
+///
+/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
+///
+/// \param __line
+///    A pointer to a cacheline which needs to be zeroed out.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clzero (void * __line)
+{
+  __builtin_ia32_clzero ((void *)__line);
+}
+
+#undef __DEFAULT_FN_ATTRS 
+
+#endif /* _CLZEROINTRIN_H */
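A short sketch of how the helper above might be used, assuming an AMD CPU with CLZERO and compilation with -mclzero; the 64-byte line size and the name zero_containing_line are illustrative:

#include <x86intrin.h>
#include <stdint.h>

/* CLZERO zeroes the whole cache line containing the address, so round the
 * pointer down to the (assumed 64-byte) line boundary before issuing it. */
static void zero_containing_line(void *p) {
  _mm_clzero((void *)((uintptr_t)p & ~(uintptr_t)63));
}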

+ 302 - 0
demo/include/cpuid.h

@@ -0,0 +1,302 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses to identification request with %eax 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for leaf 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE4_1      bit_SSE41       /* for gcc compat */
+#define bit_SSE42       0x00100000
+#define bit_SSE4_2      bit_SSE42       /* for gcc compat */
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_AES         bit_AESNI       /* for gcc compat */
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_F16C        0x20000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for leaf 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_CMPXCHG8B   bit_CX8         /* for gcc compat */
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR        /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for leaf 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SGX         0x00000004
+#define bit_BMI         0x00000008
+#define bit_HLE         0x00000010
+#define bit_AVX2        0x00000020
+#define bit_SMEP        0x00000080
+#define bit_BMI2        0x00000100
+#define bit_ENH_MOVSB   0x00000200
+#define bit_RTM         0x00000800
+#define bit_MPX         0x00004000
+#define bit_AVX512F     0x00010000
+#define bit_AVX512DQ    0x00020000
+#define bit_RDSEED      0x00040000
+#define bit_ADX         0x00080000
+#define bit_AVX512IFMA  0x00200000
+#define bit_CLFLUSHOPT  0x00800000
+#define bit_CLWB        0x01000000
+#define bit_AVX512PF    0x04000000
+#define bit_AVX512ER    0x08000000
+#define bit_AVX512CD    0x10000000
+#define bit_SHA         0x20000000
+#define bit_AVX512BW    0x40000000
+#define bit_AVX512VL    0x80000000
+
+/* Features in %ecx for leaf 7 sub-leaf 0 */
+#define bit_PREFTCHWT1       0x00000001
+#define bit_AVX512VBMI       0x00000002
+#define bit_PKU              0x00000004
+#define bit_OSPKE            0x00000010
+#define bit_AVX512VBMI2      0x00000040
+#define bit_SHSTK            0x00000080
+#define bit_GFNI             0x00000100
+#define bit_VAES             0x00000200
+#define bit_VPCLMULQDQ       0x00000400
+#define bit_AVX512VNNI       0x00000800
+#define bit_AVX512BITALG     0x00001000
+#define bit_AVX512VPOPCNTDQ  0x00004000
+#define bit_RDPID            0x00400000
+
+/* Features in %edx for leaf 7 sub-leaf 0 */
+#define bit_AVX5124VNNIW  0x00000004
+#define bit_AVX5124FMAPS  0x00000008
+#define bit_IBT           0x00100000
+
+/* Features in %eax for leaf 13 sub-leaf 1 */
+#define bit_XSAVEOPT    0x00000001
+#define bit_XSAVEC      0x00000002
+#define bit_XSAVES      0x00000008
+
+/* Features in %ecx for leaf 0x80000001 */
+#define bit_LAHF_LM     0x00000001
+#define bit_ABM         0x00000020
+#define bit_LZCNT       bit_ABM        /* for gcc compat */
+#define bit_SSE4a       0x00000040
+#define bit_PRFCHW      0x00000100
+#define bit_XOP         0x00000800
+#define bit_LWP         0x00008000
+#define bit_FMA4        0x00010000
+#define bit_TBM         0x00200000
+#define bit_MWAITX      0x20000000
+
+/* Features in %edx for leaf 0x80000001 */
+#define bit_MMXEXT      0x00400000
+#define bit_LM          0x20000000
+#define bit_3DNOWP      0x40000000
+#define bit_3DNOW       0x80000000
+
+/* Features in %ebx for leaf 0x80000001 */
+#define bit_CLZERO      0x00000001
+
+
+#if __i386__
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__leaf))
+
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__leaf), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__leaf))
+
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__leaf), "2"(__count))
+#endif
+
+static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__leaf, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
+
+static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_count (unsigned int __leaf,
+                                       unsigned int __subleaf,
+                                       unsigned int *__eax, unsigned int *__ebx,
+                                       unsigned int *__ecx, unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
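As a usage sketch (not part of the header), the helpers above can gate a runtime feature check; here AVX2 via leaf 7 sub-leaf 0, with cpu_has_avx2 being an illustrative name:

#include <cpuid.h>
#include <stdio.h>

/* Leaf 7, sub-leaf 0 reports AVX2 in %ebx; __get_cpuid_count returns 0 if
 * the CPU does not implement that leaf. */
static int cpu_has_avx2(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ebx & bit_AVX2) != 0;
}

int main(void) {
  printf("AVX2 supported: %s\n", cpu_has_avx2() ? "yes" : "no");
  return 0;
}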

+ 96 - 0
demo/include/cuda_wrappers/algorithm

@@ -0,0 +1,96 @@
+/*===---- algorithm - CUDA wrapper for <algorithm> --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max, but only if we're
+// <= C++11.  In C++14, these functions are constexpr, and so are implicitly
+// __host__ __device__.
+//
+// We don't support the initializer_list overloads because
+// initializer_list::begin() and end() are not __host__ __device__ functions.
+//
+// When compiling in C++14 mode, we could force std::min/max to have different
+// implementations for host and device, by declaring the device overloads
+// before the constexpr overloads appear.  We choose not to do this because
+//
+//  a) why write our own implementation when we can use one from the standard
+//     library? and
+//  b) libstdc++ is evil and declares min/max inside a header that is included
+//     *before* we include <algorithm>.  So we'd have to unconditionally
+//     declare our __device__ overloads of min/max, but that would pollute
+//     things for people who choose not to include <algorithm>.
+
+#include_next <algorithm>
+
+#if __cplusplus <= 201103L
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b) {
+  return __a < __b ? __a : __b;
+}
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __cplusplus <= 201103L
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
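A sketch of what this wrapper enables: device code calling std::min/std::max directly, assuming the file is compiled as CUDA in C++11 mode with clang; the kernel itself is illustrative:

#include <algorithm>

// std::min/std::max resolve to the __device__ overloads supplied above.
__global__ void clamp_kernel(const float *in, float *out, int n,
                             float lo, float hi) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = std::max(lo, std::min(hi, in[i]));
}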

+ 82 - 0
demo/include/cuda_wrappers/complex

@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implementations of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+//   // Present only when compiling for host.
+//   __host__ __device__ complex<float> sin(const complex<float>& x) {
+//     return __builtin_csinf(x);
+//   }
+//
+//   // Present when compiling for host and for device.
+//   template <typename T>
+//   __host__ __device__ complex<T> sin(const complex<T>& x) {
+//     return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                       cos(x.real()) * sinh(x.imag()));
+//   }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host.  We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
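A sketch of the effect, assuming compilation as CUDA with clang against libstdc++: ordinary std::complex arithmetic inside a kernel, with rotate_points being an illustrative name:

#include <complex>

// Multiplying by a unit complex number rotates each point; operator*= comes
// from the standard <complex>, made __host__ __device__ by the pragma above.
__global__ void rotate_points(std::complex<float> *pts, int n, float angle) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    pts[i] *= std::complex<float>(cosf(angle), sinf(angle));
}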

+ 96 - 0
demo/include/cuda_wrappers/new

@@ -0,0 +1,96 @@
+/*===---- new - CUDA wrapper for <new> --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+// Device overrides for non-placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__ size) {
+  if (size == 0) {
+    size = 1;
+  }
+  return ::malloc(size);
+}
+__device__ inline void *operator new(__SIZE_TYPE__ size,
+                                     const std::nothrow_t &) CUDA_NOEXCEPT {
+  return ::operator new(size);
+}
+
+__device__ inline void *operator new[](__SIZE_TYPE__ size) {
+  return ::operator new(size);
+}
+__device__ inline void *operator new[](__SIZE_TYPE__ size,
+                                       const std::nothrow_t &) {
+  return ::operator new(size);
+}
+
+__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
+  if (ptr) {
+    ::free(ptr);
+  }
+}
+__device__ inline void operator delete(void *ptr,
+                                       const std::nothrow_t &) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+__device__ inline void operator delete[](void *ptr,
+                                         const std::nothrow_t &) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+// Sized delete, C++14 only.
+#if __cplusplus >= 201402L
+__device__ inline void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+__device__ inline void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+#endif
+
+// Device overrides for placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
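A sketch (assuming clang CUDA compilation) of the overrides above in action; the kernel and buffer names are illustrative:

#include <new>

// Non-placement new/delete resolve to the ::malloc/::free based __device__
// overrides above; placement new resolves to the no-op form and simply
// constructs the object at the given address.
__global__ void new_demo(int *out) {
  int *boxed = new int(7);
  alignas(int) unsigned char slot[sizeof(int)];
  int *in_place = new (slot) int(35);
  *out = *boxed + *in_place;
  delete boxed;
}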

File diff is too large to display
+ 4951 - 0
demo/include/emmintrin.h


+ 124 - 0
demo/include/f16cintrin.h

@@ -0,0 +1,124 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+
+/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 16-bit half-precision float value.
+/// \returns The converted 32-bit float value.
+static __inline float __DEFAULT_FN_ATTRS
+_cvtsh_ss(unsigned short __a)
+{
+  __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
+  __v4sf r = __builtin_ia32_vcvtph2ps(v);
+  return r[0];
+}
+
+/// \brief Converts a 32-bit single-precision float value to a 16-bit
+///    half-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _cvtss_sh(float a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 32-bit single-precision float value to be converted to a 16-bit
+///    half-precision float value.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns The converted 16-bit half-precision float value.
+#define _cvtss_sh(a, imm) __extension__ ({ \
+  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                     (imm)))[0]); })
+
+/// \brief Converts a 128-bit vector containing 32-bit float values into a
+///    128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 128-bit vector containing 32-bit float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+///    values. The lower 64 bits are used to store the converted 16-bit
+///    half-precision floating-point values.
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values. The lower
+///    64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __F16CINTRIN_H */
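A host-side round-trip sketch using the scalar conversions above, assuming -mf16c; the immediate 0 (round to nearest) and the printed format are illustrative:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float x = 3.14159f;
  unsigned short h = _cvtss_sh(x, 0);   /* imm 0 selects round-to-nearest */
  float roundtrip = _cvtsh_ss(h);
  printf("%f -> 0x%04x -> %f\n", x, (unsigned)h, roundtrip);
  return 0;
}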

+ 160 - 0
demo/include/float.h

@@ -0,0 +1,160 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
+ */
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+    __STDC_HOSTED__ && __has_include_next(<float.h>)
+
+/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
+ * of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
+ * extra indirection.
+ */
+#ifdef __APPLE__
+#define _FLOAT_H_
+#endif
+
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#    undef DECIMAL_DIG
+#  endif
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#    undef FLT_DECIMAL_DIG
+#    undef DBL_DECIMAL_DIG
+#    undef LDBL_DECIMAL_DIG
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#  define DECIMAL_DIG __DECIMAL_DIG__
+#endif
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#  define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
+#  define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
+#  define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
+#endif
+
+#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
+#  define FLT16_MANT_DIG    __FLT16_MANT_DIG__
+#  define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__
+#  define FLT16_DIG         __FLT16_DIG__
+#  define FLT16_MIN_EXP     __FLT16_MIN_EXP__
+#  define FLT16_MIN_10_EXP  __FLT16_MIN_10_EXP__
+#  define FLT16_MAX_EXP     __FLT16_MAX_EXP__
+#  define FLT16_MAX_10_EXP  __FLT16_MAX_10_EXP__
+#  define FLT16_MAX         __FLT16_MAX__
+#  define FLT16_EPSILON     __FLT16_EPSILON__
+#  define FLT16_MIN         __FLT16_MIN__
+#  define FLT16_TRUE_MIN    __FLT16_TRUE_MIN__
+#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */
+
+#endif /* __FLOAT_H */
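A small sketch that simply prints a few of the macros defined above for whatever host toolchain compiles it; the selection of macros is illustrative:

#include <float.h>
#include <stdio.h>

int main(void) {
  printf("FLT_RADIX     = %d\n", FLT_RADIX);
  printf("FLT_MANT_DIG  = %d\n", FLT_MANT_DIG);
  printf("FLT_EPSILON   = %g\n", FLT_EPSILON);
  printf("DBL_MAX       = %g\n", DBL_MAX);
  printf("FLT_TRUE_MIN  = %g\n", FLT_TRUE_MIN); /* C11 / non-strict-ANSI only */
  return 0;
}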

+ 230 - 0
demo/include/fma4intrin.h

@@ -0,0 +1,230 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMA4INTRIN_H */

+ 228 - 0
demo/include/fmaintrin.h

@@ -0,0 +1,228 @@
+/*===---- fmaintrin.h - FMA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMAINTRIN_H */
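For orientation (not part of the header), the naming scheme above maps to fused operations as follows, assuming -mfma; the wrapper names axpy4 and neg_mul_sub are illustrative:

#include <immintrin.h>

/* _mm_fmadd_ps computes a*x + y with a single rounding step per lane. */
static __m128 axpy4(__m128 a, __m128 x, __m128 y) {
  return _mm_fmadd_ps(a, x, y);
}

/* _mm_fnmsub_ps computes -(a*b) - c, matching the fnmsub naming above. */
static __m128 neg_mul_sub(__m128 a, __m128 b, __m128 c) {
  return _mm_fnmsub_ps(a, b, c);
}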

+ 105 - 0
demo/include/fxsrintrin.h

@@ -0,0 +1,105 @@
+/*===---- fxsrintrin.h - FXSR intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p)
+{
+  return __builtin_ia32_fxsave(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p)
+{
+  return __builtin_ia32_fxrstor(__p);
+}
+
+#ifdef __x86_64__
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p)
+{
+  return __builtin_ia32_fxsave64(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p)
+{
+  return __builtin_ia32_fxrstor64(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
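A sketch of the save/restore pairing described in the comments above, assuming -mfxsr and a C++11 (or later) compiler for alignas; the helper name is illustrative:

#include <immintrin.h>

/* The FXSAVE area is 512 bytes and must be 16-byte aligned. */
static void with_saved_fpu_state(void (*body)(void)) {
  alignas(16) unsigned char area[512];
  _fxsave(area);     /* capture XMM, MMX, MXCSR and x87 state */
  body();            /* run code that may clobber that state */
  _fxrstor(area);    /* restore the snapshot */
}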

+ 202 - 0
demo/include/gfniintrin.h

@@ -0,0 +1,202 @@
+/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __GFNIINTRIN_H
+#define __GFNIINTRIN_H
+
+
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                   \
+  (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A),          \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                          \
+        (__v16qi)(__m128i)(S)); })
+
+
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({          \
+  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),       \
+        U, A, B, I); })
+
+
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                \
+  (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A),          \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({     \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v32qi)(__m256i)(S)); })
+
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({       \
+  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+        U, A, B, I); })
+
+
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                \
+  (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A),          \
+                                                  (__v64qi)(__m512i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({     \
+   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
+        (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v64qi)(__m512i)(S)); })
+
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({       \
+  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(),    \
+        U, A, B, I); })
+
+#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                      \
+  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),             \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({           \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                             \
+        (__v16qi)(__m128i)(S)); })
+
+
+#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({             \
+  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(),          \
+        U, A, B, I); })
+
+
+#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                   \
+  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),             \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v32qi)(__m256i)(S)); })
+
+#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({          \
+  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),    \
+        U, A, B, I); })
+
+
+#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                   \
+  (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A),             \
+                                                  (__v64qi)(__m512i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
+        (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v64qi)(__m512i)(S)); })
+
+#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({          \
+  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(),       \
+        U, A, B, I); })
+
+/* Default attributes for simple form (no masking). */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
+
+/* Default attributes for VLX forms. */
+#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+              (__v16qi) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
+_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128(__U,
+              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
+              (__v16qi) __S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
+_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
+              __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
+              (__v32qi) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
+_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256(__U,
+              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
+              (__v32qi) __S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
+_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
+              __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
+              (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectb_512(__U,
+              (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
+              (__v64qi) __S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
+              __U, __A, __B);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_F
+#undef __DEFAULT_FN_ATTRS_VL
+
+#endif // __GFNIINTRIN_H
+
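For reference, a minimal usage sketch of the unmasked GFNI intrinsics added above (not part of the header itself): it assumes a CPU with GFNI, compilation with something like -mgfni, and uses purely illustrative values.

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      /* 16 copies of the classic AES-field example bytes 0x57 and 0x83. */
      __m128i a = _mm_set1_epi8((char)0x57);
      __m128i b = _mm_set1_epi8((char)0x83);

      /* Byte-wise multiply in GF(2^8), reduced by the AES polynomial. */
      __m128i prod = _mm_gf2p8mul_epi8(a, b);

      unsigned char out[16];
      _mm_storeu_si128((__m128i *)out, prod);
      printf("GF(2^8) product of 0x57 and 0x83: 0x%02x\n", out[0]); /* 0xc1 */
      return 0;
    }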

+ 226 - 0
demo/include/htmintrin.h

@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block as it is defined in the Principles
+   of Operation, chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
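A minimal sketch of how the s390 retry helpers above might be used (illustrative names; requires building with -mhtm on a machine with transactional-execution support):

    #include <htmintrin.h>

    static long counter;

    void demo_increment(void)   /* demo_increment is a made-up name */
    {
      /* Retry up to 5 times on transient failures before giving up. */
      if (__builtin_tbegin_retry(0, 5) == _HTM_TBEGIN_STARTED) {
        counter++;                            /* transactional body */
        __builtin_tend();
      } else {
        __sync_fetch_and_add(&counter, 1);    /* non-transactional fallback */
      }
    }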

+ 359 - 0
demo/include/htmxlintrin.h

@@ -0,0 +1,359 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
+#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started from the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const __TM_buff)
+{
+  *_TEXASRL_PTR (__TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const __code)
+{
+  __builtin_tabort (__code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+
+  *__code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const __TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (__TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const __TM_buff)
+{
+  return *_TFIAR_PTR (__TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const __TM_buff)
+{
+  return *_TEXASR_PTR (__TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide", publicly available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const __tdb)
+{
+  return __builtin_tbegin_nofloat (__tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const __code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const __addr, long long const __value)
+{
+  __builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const __tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const __result)
+{
+  return __result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
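A minimal PowerPC-side sketch of the XL-style interface above (illustrative function name; requires -mhtm; on failure, the TEXASR state saved by __TM_begin decides whether a retry is worthwhile):

    #include <htmxlintrin.h>

    long demo_transactional_add(long *p, long v)   /* made-up helper */
    {
      TM_buff_type tm_buf;

      if (__TM_begin(&tm_buf) == _HTM_TBEGIN_STARTED) {
        *p += v;                     /* transactional body */
        __TM_end();
        return 0;
      }

      /* Persistent failures are not worth retrying. */
      if (__TM_is_failure_persistent(&tm_buf))
        return -1;
      return 1;                      /* transient: caller may retry */
    }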

+ 73 - 0
demo/include/ia32intrin.h

@@ -0,0 +1,73 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __builtin_ia32_writeeflags_u64(__f);
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u32();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __builtin_ia32_writeeflags_u32(__f);
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#define _rdpmc(A) __rdpmc(A)
+
+#endif /* __IA32INTRIN_H */
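A small sketch of the __rdtscp wrapper declared above (include <x86intrin.h> rather than this header directly; assumes the CPU supports RDTSCP, and the loop is purely illustrative):

    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
      unsigned int aux;
      unsigned long long start = __rdtscp(&aux);

      volatile unsigned long long sink = 0;
      for (int i = 0; i < 1000; ++i)
        sink += (unsigned long long)i;        /* work being timed */

      unsigned long long stop = __rdtscp(&aux);
      printf("elapsed TSC ticks (approx): %llu\n", stop - start);
      return 0;
    }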

+ 374 - 0
demo/include/immintrin.h

@@ -0,0 +1,374 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
+#include <mmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
+#include <pmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__SSE4_2__) || defined(__SSE4_1__))
+#include <smmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AES__) || defined(__PCLMUL__))
+#include <wmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
+#include <clflushoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__)
+#include <clwbintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
+#include <avxintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
+#include <avx2intrin.h>
+
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float values to be
+///    converted to 16-bit half-precision float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+///    float values.
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values to be
+///    converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+#endif /* __AVX2__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
+#include <vpclmulqdqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#include <bmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
+#include <fmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
+#include <avx512fintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
+#include <avx512vlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
+#include <avx512bwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
+#include <avx512bitalgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
+#include <avx512cdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
+#include <avx512vpopcntdqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
+#include <avx512vpopcntdqvlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
+#include <avx512vnniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
+#include <avx512vlvnniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
+#include <avx512dqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
+#include <avx512vlbitalgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BW__))
+#include <avx512vlbwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512CD__))
+#include <avx512vlcdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512DQ__))
+#include <avx512vldqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
+#include <avx512erintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
+#include <avx512ifmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
+#include <avx512ifmavlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
+#include <avx512vbmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
+#include <avx512vbmivlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
+#include <avx512vbmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
+#include <avx512vlvbmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
+#include <avx512pfintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
+#include <pkuintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
+#include <vaesintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
+#include <gfniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+/* __bit_scan_forward */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_forward(int __A) {
+  return __builtin_ctz(__A);
+}
+
+/* __bit_scan_reverse */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_reverse(int __A) {
+  return 31 - __builtin_clz(__A);
+}
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+
+#endif
+#endif /* __FSGSBASE__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
+#include <rtmintrin.h>
+#include <xtestintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
+#include <shaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
+#include <fxsrintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
+#include <xsaveintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
+#include <xsaveoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
+#include <xsavecintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
+#include <xsavesintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
+#include <cetintrin.h>
+#endif
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are also available at all times. */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
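A short sketch of the F16C conversions documented above (assumes compilation with -mavx2 -mf16c, since the 256-bit forms sit under the AVX2 block in this header; values are illustrative):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m256 v = _mm256_set1_ps(1.5f);

      /* 8 floats -> 8 half-precision values; immediate 0 selects round-to-nearest. */
      __m128i h = _mm256_cvtps_ph(v, 0);

      /* ...and back to 8 single-precision floats. */
      __m256 back = _mm256_cvtph_ps(h);

      float out[8];
      _mm256_storeu_ps(out, back);
      printf("round-tripped value: %f\n", out[0]);
      return 0;
    }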

+ 969 - 0
demo/include/intrin.h

@@ -0,0 +1,969 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
+#if defined(_M_ARM64)
+#include <arm64intr.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+static __inline__
+__int64 __emul(int, int);
+static __inline__
+unsigned __int64 __emulu(unsigned int, unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+unsigned char _interlockedbittestandset(long volatile *, long);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
+unsigned __int32 xbegin(void);
+void _xend(void);
+static __inline__
+#define _XCR_XFEATURE_ENABLED_MASK 0
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+                          unsigned __int64,
+                          unsigned __int64*);
+
+#endif /* __x86_64__ */
+
+#if defined(__x86_64__) || defined(__arm__)
+
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
+}
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
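A quick, illustrative sketch of the bit-test helpers defined just above (demo_flags and demo_bits are made-up names; the helpers mirror the MSVC intrinsics of the same names and assume a clang-cl / MSVC-compatible build):

    #include <intrin.h>

    static long demo_flags;

    void demo_bits(void)
    {
      _bittestandset(&demo_flags, 3);           /* set bit 3, old value ignored */
      if (_bittest(&demo_flags, 3))
        _bittestandcomplement(&demo_flags, 3);  /* toggle it back off */
      _bittestandreset(&demo_flags, 0);         /* ensure bit 0 is clear */
    }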
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_acq(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_acq(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+  __asm__ volatile ("nop");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
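As a rough usage sketch (not part of the header itself): the _acq/_nf/_rel interlocked variants above are only defined when targeting ARM/AArch64, where they map directly onto the __atomic builtins. A hypothetical caller might look like this:

    #include <intrin.h>
    #include <stdio.h>

    int main(void) {
      volatile long counter = 0;

      /* Fetch-and-add with acquire ordering; returns the previous value (0). */
      long before = _InterlockedExchangeAdd_acq(&counter, 5);

      /* Compare-and-swap with release ordering: store 6 only if counter is still 5. */
      long seen = _InterlockedCompareExchange_rel(&counter, 6, 5);

      printf("before=%ld seen=%ld counter=%ld\n", before, seen, counter);
      return 0;
    }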

+ 106 - 0
demo/include/inttypes.h

@@ -0,0 +1,106 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#error MSVC does not have inttypes.h prior to Visual Studio 2013
+#endif
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
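A minimal sketch of these format macros in use (plain ISO C; the redefinitions above only matter on pre-2015 MSVC, where int32_t is int but PRIx32 expanded to "lx"):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
      int32_t n = -42;
      uint32_t h = 0xDEADBEEFu;

      /* PRId32 / PRIX32 expand to the right conversion spec for exactly-32-bit types. */
      printf("n = %" PRId32 ", h = 0x%" PRIX32 "\n", n, h);
      return 0;
    }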

+ 43 - 0
demo/include/iso646.h

@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
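A tiny illustration of the alternate spellings (macros in C; C++ already treats them as keywords, hence the #ifndef __cplusplus guard above):

    #include <iso646.h>
    #include <stdio.h>

    int main(void) {
      int a = 3, b = 5;

      /* Equivalent to: if (a != b && (a & 1)) */
      if (a not_eq b and (a bitand 1))
        printf("a is odd and differs from b\n");
      return 0;
    }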

+ 118 - 0
demo/include/limits.h

@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
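A quick sketch showing that the macros resolve to the compiler's own __*_MAX__ values:

    #include <limits.h>
    #include <stdio.h>

    int main(void) {
      printf("CHAR_BIT  = %d\n", CHAR_BIT);
      printf("INT_MAX   = %d\n", INT_MAX);
      printf("LONG_MIN  = %ld\n", LONG_MIN);
      printf("ULONG_MAX = %lu\n", ULONG_MAX);
      return 0;
    }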

+ 150 - 0
demo/include/lwpintrin.h

@@ -0,0 +1,150 @@
+/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LWPINTRIN_H
+#define __LWPINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
+
+/// \brief Parses the LWPCB at the specified address and enables
+///        profiling if valid.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
+///
+/// \param __addr
+///    Address to the new Lightweight Profiling Control Block (LWPCB). If the
+///    LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
+///    Lightweight Profiling.
+static __inline__ void __DEFAULT_FN_ATTRS
+__llwpcb (void *__addr)
+{
+  __builtin_ia32_llwpcb(__addr);
+}
+
+/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
+///
+/// \return
+///    Address to the current Lightweight Profiling Control Block (LWPCB).
+///    If LWP is not currently enabled, returns NULL.
+static __inline__ void* __DEFAULT_FN_ATTRS
+__slwpcb ()
+{
+  return __builtin_ia32_slwpcb();
+}
+
+/// \brief Inserts programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#ifdef __x86_64__
+
+/// \brief Inserts programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LWPINTRIN_H */

+ 118 - 0
demo/include/lzcntintrin.h

@@ -0,0 +1,118 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
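A small caller sketch (assumes compiling with -mlzcnt, or an equivalent target feature, so the intrinsic is legal to call):

    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
      unsigned int x = 0x00010000u;   /* highest set bit is bit 16 */

      printf("lzcnt(0x%08X) = %u\n", x, _lzcnt_u32(x));   /* 15 */

      /* Unlike a bare __builtin_clz, the zero case is well defined: full width, 32. */
      printf("lzcnt(0) = %u\n", _lzcnt_u32(0));
      return 0;
    }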

+ 171 - 0
demo/include/mm3dnow.h

@@ -0,0 +1,171 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_femms(void) {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 75 - 0
demo/include/mm_malloc.h

@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
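A minimal sketch of the aligned-allocation pair above; the one hard rule is that memory from _mm_malloc must be released with _mm_free, never plain free():

    #include <mm_malloc.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* 1 KiB aligned to 64 bytes (e.g. a cache line). */
      void *buf = _mm_malloc(1024, 64);
      if (!buf)
        return 1;

      printf("aligned to 64: %s\n", ((uintptr_t)buf % 64 == 0) ? "yes" : "no");

      _mm_free(buf);   /* not free() */
      return 0;
    }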

File diff is too large to display
+ 1570 - 0
demo/include/mmintrin.h


+ 167 - 0
demo/include/module.modulemap

@@ -0,0 +1,167 @@
+/*===---- module.modulemap - intrinsics module map -------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    textual header "f16cintrin.h"
+    textual header "avxintrin.h"
+    textual header "avx2intrin.h"
+    textual header "avx512fintrin.h"
+    textual header "avx512erintrin.h"
+    textual header "fmaintrin.h"
+
+    header "x86intrin.h"
+    textual header "bmiintrin.h"
+    textual header "bmi2intrin.h"
+    textual header "lzcntintrin.h"
+    textual header "xopintrin.h"
+    textual header "fma4intrin.h"
+    textual header "mwaitxintrin.h"
+    textual header "clzerointrin.h"
+
+    explicit module mm_malloc {
+      requires !freestanding
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires gnuinlineasm
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      header "mmintrin.h"
+    }
+
+    explicit module sse {
+      export mm_malloc
+      export mmx
+      export sse2 // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module popcnt {
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      header "mm3dnow.h"
+    }
+
+    explicit module aes_pclmul {
+      header "wmmintrin.h"
+      export aes
+      export pclmul
+    }
+
+    explicit module aes {
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+
+    explicit module zvector {
+      requires zvector, vx
+      header "vecintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
+
+module opencl_c {
+  requires opencl
+  header "opencl-c.h"
+}

+ 583 - 0
demo/include/msa.h

@@ -0,0 +1,583 @@
+/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MSA_H
+#define _MSA_H 1
+
+#if defined(__mips_msa)
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
+typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
+typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
+typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
+typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
+typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
+typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
+typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
+typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
+typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
+
+#define __msa_sll_b __builtin_msa_sll_b
+#define __msa_sll_h __builtin_msa_sll_h
+#define __msa_sll_w __builtin_msa_sll_w
+#define __msa_sll_d __builtin_msa_sll_d
+#define __msa_slli_b __builtin_msa_slli_b
+#define __msa_slli_h __builtin_msa_slli_h
+#define __msa_slli_w __builtin_msa_slli_w
+#define __msa_slli_d __builtin_msa_slli_d
+#define __msa_sra_b __builtin_msa_sra_b
+#define __msa_sra_h __builtin_msa_sra_h
+#define __msa_sra_w __builtin_msa_sra_w
+#define __msa_sra_d __builtin_msa_sra_d
+#define __msa_srai_b __builtin_msa_srai_b
+#define __msa_srai_h __builtin_msa_srai_h
+#define __msa_srai_w __builtin_msa_srai_w
+#define __msa_srai_d __builtin_msa_srai_d
+#define __msa_srar_b __builtin_msa_srar_b
+#define __msa_srar_h __builtin_msa_srar_h
+#define __msa_srar_w __builtin_msa_srar_w
+#define __msa_srar_d __builtin_msa_srar_d
+#define __msa_srari_b __builtin_msa_srari_b
+#define __msa_srari_h __builtin_msa_srari_h
+#define __msa_srari_w __builtin_msa_srari_w
+#define __msa_srari_d __builtin_msa_srari_d
+#define __msa_srl_b __builtin_msa_srl_b
+#define __msa_srl_h __builtin_msa_srl_h
+#define __msa_srl_w __builtin_msa_srl_w
+#define __msa_srl_d __builtin_msa_srl_d
+#define __msa_srli_b __builtin_msa_srli_b
+#define __msa_srli_h __builtin_msa_srli_h
+#define __msa_srli_w __builtin_msa_srli_w
+#define __msa_srli_d __builtin_msa_srli_d
+#define __msa_srlr_b __builtin_msa_srlr_b
+#define __msa_srlr_h __builtin_msa_srlr_h
+#define __msa_srlr_w __builtin_msa_srlr_w
+#define __msa_srlr_d __builtin_msa_srlr_d
+#define __msa_srlri_b __builtin_msa_srlri_b
+#define __msa_srlri_h __builtin_msa_srlri_h
+#define __msa_srlri_w __builtin_msa_srlri_w
+#define __msa_srlri_d __builtin_msa_srlri_d
+#define __msa_bclr_b __builtin_msa_bclr_b
+#define __msa_bclr_h __builtin_msa_bclr_h
+#define __msa_bclr_w __builtin_msa_bclr_w
+#define __msa_bclr_d __builtin_msa_bclr_d
+#define __msa_bclri_b __builtin_msa_bclri_b
+#define __msa_bclri_h __builtin_msa_bclri_h
+#define __msa_bclri_w __builtin_msa_bclri_w
+#define __msa_bclri_d __builtin_msa_bclri_d
+#define __msa_bset_b __builtin_msa_bset_b
+#define __msa_bset_h __builtin_msa_bset_h
+#define __msa_bset_w __builtin_msa_bset_w
+#define __msa_bset_d __builtin_msa_bset_d
+#define __msa_bseti_b __builtin_msa_bseti_b
+#define __msa_bseti_h __builtin_msa_bseti_h
+#define __msa_bseti_w __builtin_msa_bseti_w
+#define __msa_bseti_d __builtin_msa_bseti_d
+#define __msa_bneg_b __builtin_msa_bneg_b
+#define __msa_bneg_h __builtin_msa_bneg_h
+#define __msa_bneg_w __builtin_msa_bneg_w
+#define __msa_bneg_d __builtin_msa_bneg_d
+#define __msa_bnegi_b __builtin_msa_bnegi_b
+#define __msa_bnegi_h __builtin_msa_bnegi_h
+#define __msa_bnegi_w __builtin_msa_bnegi_w
+#define __msa_bnegi_d __builtin_msa_bnegi_d
+#define __msa_binsl_b __builtin_msa_binsl_b
+#define __msa_binsl_h __builtin_msa_binsl_h
+#define __msa_binsl_w __builtin_msa_binsl_w
+#define __msa_binsl_d __builtin_msa_binsl_d
+#define __msa_binsli_b __builtin_msa_binsli_b
+#define __msa_binsli_h __builtin_msa_binsli_h
+#define __msa_binsli_w __builtin_msa_binsli_w
+#define __msa_binsli_d __builtin_msa_binsli_d
+#define __msa_binsr_b __builtin_msa_binsr_b
+#define __msa_binsr_h __builtin_msa_binsr_h
+#define __msa_binsr_w __builtin_msa_binsr_w
+#define __msa_binsr_d __builtin_msa_binsr_d
+#define __msa_binsri_b __builtin_msa_binsri_b
+#define __msa_binsri_h __builtin_msa_binsri_h
+#define __msa_binsri_w __builtin_msa_binsri_w
+#define __msa_binsri_d __builtin_msa_binsri_d
+#define __msa_addv_b __builtin_msa_addv_b
+#define __msa_addv_h __builtin_msa_addv_h
+#define __msa_addv_w __builtin_msa_addv_w
+#define __msa_addv_d __builtin_msa_addv_d
+#define __msa_addvi_b __builtin_msa_addvi_b
+#define __msa_addvi_h __builtin_msa_addvi_h
+#define __msa_addvi_w __builtin_msa_addvi_w
+#define __msa_addvi_d __builtin_msa_addvi_d
+#define __msa_subv_b __builtin_msa_subv_b
+#define __msa_subv_h __builtin_msa_subv_h
+#define __msa_subv_w __builtin_msa_subv_w
+#define __msa_subv_d __builtin_msa_subv_d
+#define __msa_subvi_b __builtin_msa_subvi_b
+#define __msa_subvi_h __builtin_msa_subvi_h
+#define __msa_subvi_w __builtin_msa_subvi_w
+#define __msa_subvi_d __builtin_msa_subvi_d
+#define __msa_max_s_b __builtin_msa_max_s_b
+#define __msa_max_s_h __builtin_msa_max_s_h
+#define __msa_max_s_w __builtin_msa_max_s_w
+#define __msa_max_s_d __builtin_msa_max_s_d
+#define __msa_maxi_s_b __builtin_msa_maxi_s_b
+#define __msa_maxi_s_h __builtin_msa_maxi_s_h
+#define __msa_maxi_s_w __builtin_msa_maxi_s_w
+#define __msa_maxi_s_d __builtin_msa_maxi_s_d
+#define __msa_max_u_b __builtin_msa_max_u_b
+#define __msa_max_u_h __builtin_msa_max_u_h
+#define __msa_max_u_w __builtin_msa_max_u_w
+#define __msa_max_u_d __builtin_msa_max_u_d
+#define __msa_maxi_u_b __builtin_msa_maxi_u_b
+#define __msa_maxi_u_h __builtin_msa_maxi_u_h
+#define __msa_maxi_u_w __builtin_msa_maxi_u_w
+#define __msa_maxi_u_d __builtin_msa_maxi_u_d
+#define __msa_min_s_b __builtin_msa_min_s_b
+#define __msa_min_s_h __builtin_msa_min_s_h
+#define __msa_min_s_w __builtin_msa_min_s_w
+#define __msa_min_s_d __builtin_msa_min_s_d
+#define __msa_mini_s_b __builtin_msa_mini_s_b
+#define __msa_mini_s_h __builtin_msa_mini_s_h
+#define __msa_mini_s_w __builtin_msa_mini_s_w
+#define __msa_mini_s_d __builtin_msa_mini_s_d
+#define __msa_min_u_b __builtin_msa_min_u_b
+#define __msa_min_u_h __builtin_msa_min_u_h
+#define __msa_min_u_w __builtin_msa_min_u_w
+#define __msa_min_u_d __builtin_msa_min_u_d
+#define __msa_mini_u_b __builtin_msa_mini_u_b
+#define __msa_mini_u_h __builtin_msa_mini_u_h
+#define __msa_mini_u_w __builtin_msa_mini_u_w
+#define __msa_mini_u_d __builtin_msa_mini_u_d
+#define __msa_max_a_b __builtin_msa_max_a_b
+#define __msa_max_a_h __builtin_msa_max_a_h
+#define __msa_max_a_w __builtin_msa_max_a_w
+#define __msa_max_a_d __builtin_msa_max_a_d
+#define __msa_min_a_b __builtin_msa_min_a_b
+#define __msa_min_a_h __builtin_msa_min_a_h
+#define __msa_min_a_w __builtin_msa_min_a_w
+#define __msa_min_a_d __builtin_msa_min_a_d
+#define __msa_ceq_b __builtin_msa_ceq_b
+#define __msa_ceq_h __builtin_msa_ceq_h
+#define __msa_ceq_w __builtin_msa_ceq_w
+#define __msa_ceq_d __builtin_msa_ceq_d
+#define __msa_ceqi_b __builtin_msa_ceqi_b
+#define __msa_ceqi_h __builtin_msa_ceqi_h
+#define __msa_ceqi_w __builtin_msa_ceqi_w
+#define __msa_ceqi_d __builtin_msa_ceqi_d
+#define __msa_clt_s_b __builtin_msa_clt_s_b
+#define __msa_clt_s_h __builtin_msa_clt_s_h
+#define __msa_clt_s_w __builtin_msa_clt_s_w
+#define __msa_clt_s_d __builtin_msa_clt_s_d
+#define __msa_clti_s_b __builtin_msa_clti_s_b
+#define __msa_clti_s_h __builtin_msa_clti_s_h
+#define __msa_clti_s_w __builtin_msa_clti_s_w
+#define __msa_clti_s_d __builtin_msa_clti_s_d
+#define __msa_clt_u_b __builtin_msa_clt_u_b
+#define __msa_clt_u_h __builtin_msa_clt_u_h
+#define __msa_clt_u_w __builtin_msa_clt_u_w
+#define __msa_clt_u_d __builtin_msa_clt_u_d
+#define __msa_clti_u_b __builtin_msa_clti_u_b
+#define __msa_clti_u_h __builtin_msa_clti_u_h
+#define __msa_clti_u_w __builtin_msa_clti_u_w
+#define __msa_clti_u_d __builtin_msa_clti_u_d
+#define __msa_cle_s_b __builtin_msa_cle_s_b
+#define __msa_cle_s_h __builtin_msa_cle_s_h
+#define __msa_cle_s_w __builtin_msa_cle_s_w
+#define __msa_cle_s_d __builtin_msa_cle_s_d
+#define __msa_clei_s_b __builtin_msa_clei_s_b
+#define __msa_clei_s_h __builtin_msa_clei_s_h
+#define __msa_clei_s_w __builtin_msa_clei_s_w
+#define __msa_clei_s_d __builtin_msa_clei_s_d
+#define __msa_cle_u_b __builtin_msa_cle_u_b
+#define __msa_cle_u_h __builtin_msa_cle_u_h
+#define __msa_cle_u_w __builtin_msa_cle_u_w
+#define __msa_cle_u_d __builtin_msa_cle_u_d
+#define __msa_clei_u_b __builtin_msa_clei_u_b
+#define __msa_clei_u_h __builtin_msa_clei_u_h
+#define __msa_clei_u_w __builtin_msa_clei_u_w
+#define __msa_clei_u_d __builtin_msa_clei_u_d
+#define __msa_ld_b __builtin_msa_ld_b
+#define __msa_ld_h __builtin_msa_ld_h
+#define __msa_ld_w __builtin_msa_ld_w
+#define __msa_ld_d __builtin_msa_ld_d
+#define __msa_st_b __builtin_msa_st_b
+#define __msa_st_h __builtin_msa_st_h
+#define __msa_st_w __builtin_msa_st_w
+#define __msa_st_d __builtin_msa_st_d
+#define __msa_sat_s_b __builtin_msa_sat_s_b
+#define __msa_sat_s_h __builtin_msa_sat_s_h
+#define __msa_sat_s_w __builtin_msa_sat_s_w
+#define __msa_sat_s_d __builtin_msa_sat_s_d
+#define __msa_sat_u_b __builtin_msa_sat_u_b
+#define __msa_sat_u_h __builtin_msa_sat_u_h
+#define __msa_sat_u_w __builtin_msa_sat_u_w
+#define __msa_sat_u_d __builtin_msa_sat_u_d
+#define __msa_add_a_b __builtin_msa_add_a_b
+#define __msa_add_a_h __builtin_msa_add_a_h
+#define __msa_add_a_w __builtin_msa_add_a_w
+#define __msa_add_a_d __builtin_msa_add_a_d
+#define __msa_adds_a_b __builtin_msa_adds_a_b
+#define __msa_adds_a_h __builtin_msa_adds_a_h
+#define __msa_adds_a_w __builtin_msa_adds_a_w
+#define __msa_adds_a_d __builtin_msa_adds_a_d
+#define __msa_adds_s_b __builtin_msa_adds_s_b
+#define __msa_adds_s_h __builtin_msa_adds_s_h
+#define __msa_adds_s_w __builtin_msa_adds_s_w
+#define __msa_adds_s_d __builtin_msa_adds_s_d
+#define __msa_adds_u_b __builtin_msa_adds_u_b
+#define __msa_adds_u_h __builtin_msa_adds_u_h
+#define __msa_adds_u_w __builtin_msa_adds_u_w
+#define __msa_adds_u_d __builtin_msa_adds_u_d
+#define __msa_ave_s_b __builtin_msa_ave_s_b
+#define __msa_ave_s_h __builtin_msa_ave_s_h
+#define __msa_ave_s_w __builtin_msa_ave_s_w
+#define __msa_ave_s_d __builtin_msa_ave_s_d
+#define __msa_ave_u_b __builtin_msa_ave_u_b
+#define __msa_ave_u_h __builtin_msa_ave_u_h
+#define __msa_ave_u_w __builtin_msa_ave_u_w
+#define __msa_ave_u_d __builtin_msa_ave_u_d
+#define __msa_aver_s_b __builtin_msa_aver_s_b
+#define __msa_aver_s_h __builtin_msa_aver_s_h
+#define __msa_aver_s_w __builtin_msa_aver_s_w
+#define __msa_aver_s_d __builtin_msa_aver_s_d
+#define __msa_aver_u_b __builtin_msa_aver_u_b
+#define __msa_aver_u_h __builtin_msa_aver_u_h
+#define __msa_aver_u_w __builtin_msa_aver_u_w
+#define __msa_aver_u_d __builtin_msa_aver_u_d
+#define __msa_subs_s_b __builtin_msa_subs_s_b
+#define __msa_subs_s_h __builtin_msa_subs_s_h
+#define __msa_subs_s_w __builtin_msa_subs_s_w
+#define __msa_subs_s_d __builtin_msa_subs_s_d
+#define __msa_subs_u_b __builtin_msa_subs_u_b
+#define __msa_subs_u_h __builtin_msa_subs_u_h
+#define __msa_subs_u_w __builtin_msa_subs_u_w
+#define __msa_subs_u_d __builtin_msa_subs_u_d
+#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
+#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
+#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
+#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
+#define __msa_subsus_u_b __builtin_msa_subsus_u_b
+#define __msa_subsus_u_h __builtin_msa_subsus_u_h
+#define __msa_subsus_u_w __builtin_msa_subsus_u_w
+#define __msa_subsus_u_d __builtin_msa_subsus_u_d
+#define __msa_asub_s_b __builtin_msa_asub_s_b
+#define __msa_asub_s_h __builtin_msa_asub_s_h
+#define __msa_asub_s_w __builtin_msa_asub_s_w
+#define __msa_asub_s_d __builtin_msa_asub_s_d
+#define __msa_asub_u_b __builtin_msa_asub_u_b
+#define __msa_asub_u_h __builtin_msa_asub_u_h
+#define __msa_asub_u_w __builtin_msa_asub_u_w
+#define __msa_asub_u_d __builtin_msa_asub_u_d
+#define __msa_mulv_b __builtin_msa_mulv_b
+#define __msa_mulv_h __builtin_msa_mulv_h
+#define __msa_mulv_w __builtin_msa_mulv_w
+#define __msa_mulv_d __builtin_msa_mulv_d
+#define __msa_maddv_b __builtin_msa_maddv_b
+#define __msa_maddv_h __builtin_msa_maddv_h
+#define __msa_maddv_w __builtin_msa_maddv_w
+#define __msa_maddv_d __builtin_msa_maddv_d
+#define __msa_msubv_b __builtin_msa_msubv_b
+#define __msa_msubv_h __builtin_msa_msubv_h
+#define __msa_msubv_w __builtin_msa_msubv_w
+#define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_div_s_b __builtin_msa_div_s_b
+#define __msa_div_s_h __builtin_msa_div_s_h
+#define __msa_div_s_w __builtin_msa_div_s_w
+#define __msa_div_s_d __builtin_msa_div_s_d
+#define __msa_div_u_b __builtin_msa_div_u_b
+#define __msa_div_u_h __builtin_msa_div_u_h
+#define __msa_div_u_w __builtin_msa_div_u_w
+#define __msa_div_u_d __builtin_msa_div_u_d
+#define __msa_hadd_s_h __builtin_msa_hadd_s_h
+#define __msa_hadd_s_w __builtin_msa_hadd_s_w
+#define __msa_hadd_s_d __builtin_msa_hadd_s_d
+#define __msa_hadd_u_h __builtin_msa_hadd_u_h
+#define __msa_hadd_u_w __builtin_msa_hadd_u_w
+#define __msa_hadd_u_d __builtin_msa_hadd_u_d
+#define __msa_hsub_s_h __builtin_msa_hsub_s_h
+#define __msa_hsub_s_w __builtin_msa_hsub_s_w
+#define __msa_hsub_s_d __builtin_msa_hsub_s_d
+#define __msa_hsub_u_h __builtin_msa_hsub_u_h
+#define __msa_hsub_u_w __builtin_msa_hsub_u_w
+#define __msa_hsub_u_d __builtin_msa_hsub_u_d
+#define __msa_mod_s_b __builtin_msa_mod_s_b
+#define __msa_mod_s_h __builtin_msa_mod_s_h
+#define __msa_mod_s_w __builtin_msa_mod_s_w
+#define __msa_mod_s_d __builtin_msa_mod_s_d
+#define __msa_mod_u_b __builtin_msa_mod_u_b
+#define __msa_mod_u_h __builtin_msa_mod_u_h
+#define __msa_mod_u_w __builtin_msa_mod_u_w
+#define __msa_mod_u_d __builtin_msa_mod_u_d
+#define __msa_dotp_s_h __builtin_msa_dotp_s_h
+#define __msa_dotp_s_w __builtin_msa_dotp_s_w
+#define __msa_dotp_s_d __builtin_msa_dotp_s_d
+#define __msa_dotp_u_h __builtin_msa_dotp_u_h
+#define __msa_dotp_u_w __builtin_msa_dotp_u_w
+#define __msa_dotp_u_d __builtin_msa_dotp_u_d
+#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
+#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
+#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
+#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
+#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
+#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
+#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
+#define __msa_sld_b __builtin_msa_sld_b
+#define __msa_sld_h __builtin_msa_sld_h
+#define __msa_sld_w __builtin_msa_sld_w
+#define __msa_sld_d __builtin_msa_sld_d
+#define __msa_sldi_b __builtin_msa_sldi_b
+#define __msa_sldi_h __builtin_msa_sldi_h
+#define __msa_sldi_w __builtin_msa_sldi_w
+#define __msa_sldi_d __builtin_msa_sldi_d
+#define __msa_splat_b __builtin_msa_splat_b
+#define __msa_splat_h __builtin_msa_splat_h
+#define __msa_splat_w __builtin_msa_splat_w
+#define __msa_splat_d __builtin_msa_splat_d
+#define __msa_splati_b __builtin_msa_splati_b
+#define __msa_splati_h __builtin_msa_splati_h
+#define __msa_splati_w __builtin_msa_splati_w
+#define __msa_splati_d __builtin_msa_splati_d
+#define __msa_pckev_b __builtin_msa_pckev_b
+#define __msa_pckev_h __builtin_msa_pckev_h
+#define __msa_pckev_w __builtin_msa_pckev_w
+#define __msa_pckev_d __builtin_msa_pckev_d
+#define __msa_pckod_b __builtin_msa_pckod_b
+#define __msa_pckod_h __builtin_msa_pckod_h
+#define __msa_pckod_w __builtin_msa_pckod_w
+#define __msa_pckod_d __builtin_msa_pckod_d
+#define __msa_ilvl_b __builtin_msa_ilvl_b
+#define __msa_ilvl_h __builtin_msa_ilvl_h
+#define __msa_ilvl_w __builtin_msa_ilvl_w
+#define __msa_ilvl_d __builtin_msa_ilvl_d
+#define __msa_ilvr_b __builtin_msa_ilvr_b
+#define __msa_ilvr_h __builtin_msa_ilvr_h
+#define __msa_ilvr_w __builtin_msa_ilvr_w
+#define __msa_ilvr_d __builtin_msa_ilvr_d
+#define __msa_ilvev_b __builtin_msa_ilvev_b
+#define __msa_ilvev_h __builtin_msa_ilvev_h
+#define __msa_ilvev_w __builtin_msa_ilvev_w
+#define __msa_ilvev_d __builtin_msa_ilvev_d
+#define __msa_ilvod_b __builtin_msa_ilvod_b
+#define __msa_ilvod_h __builtin_msa_ilvod_h
+#define __msa_ilvod_w __builtin_msa_ilvod_w
+#define __msa_ilvod_d __builtin_msa_ilvod_d
+#define __msa_vshf_b __builtin_msa_vshf_b
+#define __msa_vshf_h __builtin_msa_vshf_h
+#define __msa_vshf_w __builtin_msa_vshf_w
+#define __msa_vshf_d __builtin_msa_vshf_d
+#define __msa_and_v __builtin_msa_and_v
+#define __msa_andi_b __builtin_msa_andi_b
+#define __msa_or_v __builtin_msa_or_v
+#define __msa_ori_b __builtin_msa_ori_b
+#define __msa_nor_v __builtin_msa_nor_v
+#define __msa_nori_b __builtin_msa_nori_b
+#define __msa_xor_v __builtin_msa_xor_v
+#define __msa_xori_b __builtin_msa_xori_b
+#define __msa_bmnz_v __builtin_msa_bmnz_v
+#define __msa_bmnzi_b __builtin_msa_bmnzi_b
+#define __msa_bmz_v __builtin_msa_bmz_v
+#define __msa_bmzi_b __builtin_msa_bmzi_b
+#define __msa_bsel_v __builtin_msa_bsel_v
+#define __msa_bseli_b __builtin_msa_bseli_b
+#define __msa_shf_b __builtin_msa_shf_b
+#define __msa_shf_h __builtin_msa_shf_h
+#define __msa_shf_w __builtin_msa_shf_w
+#define __msa_test_bnz_v __builtin_msa_bnz_v
+#define __msa_test_bz_v __builtin_msa_bz_v
+#define __msa_fill_b __builtin_msa_fill_b
+#define __msa_fill_h __builtin_msa_fill_h
+#define __msa_fill_w __builtin_msa_fill_w
+#define __msa_fill_d __builtin_msa_fill_d
+#define __msa_pcnt_b __builtin_msa_pcnt_b
+#define __msa_pcnt_h __builtin_msa_pcnt_h
+#define __msa_pcnt_w __builtin_msa_pcnt_w
+#define __msa_pcnt_d __builtin_msa_pcnt_d
+#define __msa_nloc_b __builtin_msa_nloc_b
+#define __msa_nloc_h __builtin_msa_nloc_h
+#define __msa_nloc_w __builtin_msa_nloc_w
+#define __msa_nloc_d __builtin_msa_nloc_d
+#define __msa_nlzc_b __builtin_msa_nlzc_b
+#define __msa_nlzc_h __builtin_msa_nlzc_h
+#define __msa_nlzc_w __builtin_msa_nlzc_w
+#define __msa_nlzc_d __builtin_msa_nlzc_d
+#define __msa_copy_s_b __builtin_msa_copy_s_b
+#define __msa_copy_s_h __builtin_msa_copy_s_h
+#define __msa_copy_s_w __builtin_msa_copy_s_w
+#define __msa_copy_s_d __builtin_msa_copy_s_d
+#define __msa_copy_u_b __builtin_msa_copy_u_b
+#define __msa_copy_u_h __builtin_msa_copy_u_h
+#define __msa_copy_u_w __builtin_msa_copy_u_w
+#define __msa_copy_u_d __builtin_msa_copy_u_d
+#define __msa_insert_b __builtin_msa_insert_b
+#define __msa_insert_h __builtin_msa_insert_h
+#define __msa_insert_w __builtin_msa_insert_w
+#define __msa_insert_d __builtin_msa_insert_d
+#define __msa_insve_b __builtin_msa_insve_b
+#define __msa_insve_h __builtin_msa_insve_h
+#define __msa_insve_w __builtin_msa_insve_w
+#define __msa_insve_d __builtin_msa_insve_d
+#define __msa_test_bnz_b __builtin_msa_bnz_b
+#define __msa_test_bnz_h __builtin_msa_bnz_h
+#define __msa_test_bnz_w __builtin_msa_bnz_w
+#define __msa_test_bnz_d __builtin_msa_bnz_d
+#define __msa_test_bz_b __builtin_msa_bz_b
+#define __msa_test_bz_h __builtin_msa_bz_h
+#define __msa_test_bz_w __builtin_msa_bz_w
+#define __msa_test_bz_d __builtin_msa_bz_d
+#define __msa_ldi_b __builtin_msa_ldi_b
+#define __msa_ldi_h __builtin_msa_ldi_h
+#define __msa_ldi_w __builtin_msa_ldi_w
+#define __msa_ldi_d __builtin_msa_ldi_d
+#define __msa_fcaf_w __builtin_msa_fcaf_w
+#define __msa_fcaf_d __builtin_msa_fcaf_d
+#define __msa_fcor_w __builtin_msa_fcor_w
+#define __msa_fcor_d __builtin_msa_fcor_d
+#define __msa_fcun_w __builtin_msa_fcun_w
+#define __msa_fcun_d __builtin_msa_fcun_d
+#define __msa_fcune_w __builtin_msa_fcune_w
+#define __msa_fcune_d __builtin_msa_fcune_d
+#define __msa_fcueq_w __builtin_msa_fcueq_w
+#define __msa_fcueq_d __builtin_msa_fcueq_d
+#define __msa_fceq_w __builtin_msa_fceq_w
+#define __msa_fceq_d __builtin_msa_fceq_d
+#define __msa_fcne_w __builtin_msa_fcne_w
+#define __msa_fcne_d __builtin_msa_fcne_d
+#define __msa_fclt_w __builtin_msa_fclt_w
+#define __msa_fclt_d __builtin_msa_fclt_d
+#define __msa_fcult_w __builtin_msa_fcult_w
+#define __msa_fcult_d __builtin_msa_fcult_d
+#define __msa_fcle_w __builtin_msa_fcle_w
+#define __msa_fcle_d __builtin_msa_fcle_d
+#define __msa_fcule_w __builtin_msa_fcule_w
+#define __msa_fcule_d __builtin_msa_fcule_d
+#define __msa_fsaf_w __builtin_msa_fsaf_w
+#define __msa_fsaf_d __builtin_msa_fsaf_d
+#define __msa_fsor_w __builtin_msa_fsor_w
+#define __msa_fsor_d __builtin_msa_fsor_d
+#define __msa_fsun_w __builtin_msa_fsun_w
+#define __msa_fsun_d __builtin_msa_fsun_d
+#define __msa_fsune_w __builtin_msa_fsune_w
+#define __msa_fsune_d __builtin_msa_fsune_d
+#define __msa_fsueq_w __builtin_msa_fsueq_w
+#define __msa_fsueq_d __builtin_msa_fsueq_d
+#define __msa_fseq_w __builtin_msa_fseq_w
+#define __msa_fseq_d __builtin_msa_fseq_d
+#define __msa_fsne_w __builtin_msa_fsne_w
+#define __msa_fsne_d __builtin_msa_fsne_d
+#define __msa_fslt_w __builtin_msa_fslt_w
+#define __msa_fslt_d __builtin_msa_fslt_d
+#define __msa_fsult_w __builtin_msa_fsult_w
+#define __msa_fsult_d __builtin_msa_fsult_d
+#define __msa_fsle_w __builtin_msa_fsle_w
+#define __msa_fsle_d __builtin_msa_fsle_d
+#define __msa_fsule_w __builtin_msa_fsule_w
+#define __msa_fsule_d __builtin_msa_fsule_d
+#define __msa_fadd_w __builtin_msa_fadd_w
+#define __msa_fadd_d __builtin_msa_fadd_d
+#define __msa_fsub_w __builtin_msa_fsub_w
+#define __msa_fsub_d __builtin_msa_fsub_d
+#define __msa_fmul_w __builtin_msa_fmul_w
+#define __msa_fmul_d __builtin_msa_fmul_d
+#define __msa_fdiv_w __builtin_msa_fdiv_w
+#define __msa_fdiv_d __builtin_msa_fdiv_d
+#define __msa_fmadd_w __builtin_msa_fmadd_w
+#define __msa_fmadd_d __builtin_msa_fmadd_d
+#define __msa_fmsub_w __builtin_msa_fmsub_w
+#define __msa_fmsub_d __builtin_msa_fmsub_d
+#define __msa_fexp2_w __builtin_msa_fexp2_w
+#define __msa_fexp2_d __builtin_msa_fexp2_d
+#define __msa_fexdo_h __builtin_msa_fexdo_h
+#define __msa_fexdo_w __builtin_msa_fexdo_w
+#define __msa_ftq_h __builtin_msa_ftq_h
+#define __msa_ftq_w __builtin_msa_ftq_w
+#define __msa_fmin_w __builtin_msa_fmin_w
+#define __msa_fmin_d __builtin_msa_fmin_d
+#define __msa_fmin_a_w __builtin_msa_fmin_a_w
+#define __msa_fmin_a_d __builtin_msa_fmin_a_d
+#define __msa_fmax_w __builtin_msa_fmax_w
+#define __msa_fmax_d __builtin_msa_fmax_d
+#define __msa_fmax_a_w __builtin_msa_fmax_a_w
+#define __msa_fmax_a_d __builtin_msa_fmax_a_d
+#define __msa_mul_q_h __builtin_msa_mul_q_h
+#define __msa_mul_q_w __builtin_msa_mul_q_w
+#define __msa_mulr_q_h __builtin_msa_mulr_q_h
+#define __msa_mulr_q_w __builtin_msa_mulr_q_w
+#define __msa_madd_q_h __builtin_msa_madd_q_h
+#define __msa_madd_q_w __builtin_msa_madd_q_w
+#define __msa_maddr_q_h __builtin_msa_maddr_q_h
+#define __msa_maddr_q_w __builtin_msa_maddr_q_w
+#define __msa_msub_q_h __builtin_msa_msub_q_h
+#define __msa_msub_q_w __builtin_msa_msub_q_w
+#define __msa_msubr_q_h __builtin_msa_msubr_q_h
+#define __msa_msubr_q_w __builtin_msa_msubr_q_w
+#define __msa_fclass_w __builtin_msa_fclass_w
+#define __msa_fclass_d __builtin_msa_fclass_d
+#define __msa_fsqrt_w __builtin_msa_fsqrt_w
+#define __msa_fsqrt_d __builtin_msa_fsqrt_d
+#define __msa_frcp_w __builtin_msa_frcp_w
+#define __msa_frcp_d __builtin_msa_frcp_d
+#define __msa_frint_w __builtin_msa_frint_w
+#define __msa_frint_d __builtin_msa_frint_d
+#define __msa_frsqrt_w __builtin_msa_frsqrt_w
+#define __msa_frsqrt_d __builtin_msa_frsqrt_d
+#define __msa_flog2_w __builtin_msa_flog2_w
+#define __msa_flog2_d __builtin_msa_flog2_d
+#define __msa_fexupl_w __builtin_msa_fexupl_w
+#define __msa_fexupl_d __builtin_msa_fexupl_d
+#define __msa_fexupr_w __builtin_msa_fexupr_w
+#define __msa_fexupr_d __builtin_msa_fexupr_d
+#define __msa_ffql_w __builtin_msa_ffql_w
+#define __msa_ffql_d __builtin_msa_ffql_d
+#define __msa_ffqr_w __builtin_msa_ffqr_w
+#define __msa_ffqr_d __builtin_msa_ffqr_d
+#define __msa_ftint_s_w __builtin_msa_ftint_s_w
+#define __msa_ftint_s_d __builtin_msa_ftint_s_d
+#define __msa_ftint_u_w __builtin_msa_ftint_u_w
+#define __msa_ftint_u_d __builtin_msa_ftint_u_d
+#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
+#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
+#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
+#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
+#define __msa_ffint_s_w __builtin_msa_ffint_s_w
+#define __msa_ffint_s_d __builtin_msa_ffint_s_d
+#define __msa_ffint_u_w __builtin_msa_ffint_u_w
+#define __msa_ffint_u_d __builtin_msa_ffint_u_d
+#define __msa_cfcmsa __builtin_msa_cfcmsa
+#define __msa_move_v __builtin_msa_move_v
+#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
+#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
+#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
+#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
+#endif /* defined(__mips_msa) */
+#endif /* _MSA_H */
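Editor's note: a minimal usage sketch (not part of the header) showing the vector types and builtin aliases defined above, assuming a MIPS target compiled with clang and -mmsa:

    #include <msa.h>

    /* Broadcast scalars into v4i32 lanes, add element-wise, extract lane 0. */
    int sum_lane0(void) {
      v4i32 a = __msa_fill_w(3);      /* {3, 3, 3, 3} */
      v4i32 b = __msa_fill_w(4);      /* {4, 4, 4, 4} */
      v4i32 c = __msa_addv_w(a, b);   /* {7, 7, 7, 7} */
      return __msa_copy_s_w(c, 0);    /* lane index must be a constant; returns 7 */
    }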

+ 47 - 0
demo/include/mwaitxintrin.h

@@ -0,0 +1,47 @@
+/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MWAITXINTRIN_H
+#define _MWAITXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitorx((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
+{
+  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MWAITXINTRIN_H */
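Editor's note: a hedged sketch of the intended MONITORX/MWAITX pairing (assuming an AMD CPU with the mwaitx feature and compilation with -mmwaitx); the waiter arms a monitor on a flag and sleeps until the monitored line is written:

    #include <x86intrin.h>

    volatile int flag;                              /* set by another thread */

    void wait_for_flag(void) {
      while (!flag) {
        _mm_monitorx((const void *)&flag, 0, 0);    /* arm address monitor */
        if (flag)                                   /* avoid a lost-wakeup race */
          break;
        _mm_mwaitx(0, 0, 0);                        /* sleep until &flag is written */
      }
    }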

+ 30 - 0
demo/include/nmmintrin.h

@@ -0,0 +1,30 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
+   just include it now then.  */
+#include <smmintrin.h>
+#endif /* _NMMINTRIN_H */
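Editor's note: a small illustration of what this indirection provides. SSE4.2 intrinsics such as the CRC-32C helpers (assumed here to be _mm_crc32_u32/_mm_crc32_u8 from smmintrin.h) become visible through either header; compile with -msse4.2:

    #include <nmmintrin.h>
    #include <string.h>

    /* Accumulate a CRC-32C checksum over a byte buffer, 4 bytes at a time. */
    unsigned int crc32c(const unsigned char *p, size_t n) {
      unsigned int crc = 0xFFFFFFFFu;
      while (n >= 4) {
        unsigned int w;
        memcpy(&w, p, 4);
        crc = _mm_crc32_u32(crc, w);
        p += 4; n -= 4;
      }
      while (n--)
        crc = _mm_crc32_u8(crc, *p++);
      return crc ^ 0xFFFFFFFFu;
    }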

File diff is too large to display
+ 16391 - 0
demo/include/opencl-c.h


+ 48 - 0
demo/include/pkuintrin.h

@@ -0,0 +1,48 @@
+/*===------------- pkuintrin.h - PKU intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __PKUINTRIN_H
+#define __PKUINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rdpkru_u32(void)
+{
+  return __builtin_ia32_rdpkru();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_wrpkru(unsigned int __val)
+{
+  return __builtin_ia32_wrpkru(__val);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 304 - 0
demo/include/pmmintrin.h

@@ -0,0 +1,304 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+/// \brief Loads data from an unaligned memory location to elements in a 128-bit
+///    vector.
+///
+///    If the address of the data is not 16-byte aligned, the instruction may
+///    read two adjacent aligned blocks of memory to retrieve the requested
+///    data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit integer vector containing integer values.
+/// \returns A 128-bit vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the lower
+///    bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the upper
+///    bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
+///    of [4 x float] to float values stored in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
+}
+
+/// \brief Duplicates even-indexed values from a 128-bit vector of
+///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] \n
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums
+///    and differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally adds the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_loaddup_pd(double const * dp);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param dp
+///    A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+/// \brief Moves and duplicates the double-precision value in the lower bits of
+///    a 128-bit vector of [2 x double] to double-precision values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
+///    [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+/// \brief Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+/// \brief Used with the MONITOR instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which may vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which may vary by processor.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
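Editor's note: an illustrative use of the horizontal-add intrinsic documented above (compile with -msse3). _mm_cvtss_f32 is assumed from xmmintrin.h, which is pulled in transitively:

    #include <pmmintrin.h>

    /* Reduce a 128-bit vector of [4 x float] to a scalar sum with two
       horizontal adds: {a,b,c,d} -> {a+b, c+d, a+b, c+d} -> {a+b+c+d, ...}. */
    float sum4(__m128 v) {
      __m128 t = _mm_hadd_ps(v, v);
      t = _mm_hadd_ps(t, t);
      return _mm_cvtss_f32(t);
    }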

+ 98 - 0
demo/include/popcntintrin.h

@@ -0,0 +1,98 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _POPCNTINTRIN_H */
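Editor's note: a trivial sketch of the population-count intrinsic above (compile with -mpopcnt):

    #include <popcntintrin.h>

    /* Count the bits set in a mask; e.g. returns 4 for 0xF0. */
    int set_bits_in(unsigned int mask) {
      return _mm_popcnt_u32(mask);
    }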

+ 71 - 0
demo/include/prfchwintrin.h

@@ -0,0 +1,71 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+/// \brief Loads a memory sequence containing the specified memory address into
+///    all data cache levels. The cache-coherency state is set to exclusive.
+///    Data can be read from and written to the cache line without additional
+///    delay.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+/// \brief Loads a memory sequence containing the specified memory address into
+///    the L1 data cache and sets the cache-coherency to modified. This
+///    provides a hint to the processor that the cache line will be modified.
+///    It is intended for use when the cache line will be written to shortly
+///    after the prefetch is performed.
+///
+///    Note that the effect of this intrinsic is dependent on the processor
+///    implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHW instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
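Editor's note: a sketch of prefetch-for-write ahead of a store-heavy loop (assuming a CPU that reports PRFCHW and compilation with -mprfchw; the lookahead distance of 16 is an arbitrary illustration):

    #include <x86intrin.h>

    void scale_in_place(float *a, int n, float k) {
      for (int i = 0; i < n; ++i) {
        if (i + 16 < n)
          _m_prefetchw(&a[i + 16]);   /* hint: this element will be written soon */
        a[i] *= k;
      }
    }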

+ 56 - 0
demo/include/rdseedintrin.h

@@ -0,0 +1,56 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
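Editor's note: the step functions return 1 on success and 0 when the entropy source is temporarily exhausted, so callers normally retry; a hedged sketch (compile with -mrdseed):

    #include <x86intrin.h>

    /* Returns 1 and stores a hardware-seeded value, or 0 after too many retries. */
    int get_seed32(unsigned int *out) {
      for (int tries = 0; tries < 10; ++tries) {
        if (_rdseed32_step(out))
          return 1;
      }
      return 0;
    }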

+ 59 - 0
demo/include/rtmintrin.h

@@ -0,0 +1,59 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
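Editor's note: a minimal transactional-region sketch using the constants above (assuming TSX-capable hardware and -mrtm); a fallback path is mandatory because any transaction may abort, and take_lock/drop_lock are hypothetical placeholders for a real lock:

    #include <immintrin.h>

    extern void take_lock(void);
    extern void drop_lock(void);

    void increment(long *counter) {
      unsigned int status = _xbegin();
      if (status == _XBEGIN_STARTED) {
        ++*counter;                /* speculative update */
        _xend();                   /* commit the transaction */
      } else {
        take_lock();               /* abort path: fall back to a real lock */
        ++*counter;
        drop_lock();
      }
    }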

+ 39 - 0
demo/include/s390intrin.h

@@ -0,0 +1,39 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/

+ 90 - 0
demo/include/sanitizer/allocator_interface.h

@@ -0,0 +1,90 @@
+//===-- allocator_interface.h ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface header for allocator used in sanitizers (ASan/TSan/MSan).
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ALLOCATOR_INTERFACE_H
+#define SANITIZER_ALLOCATOR_INTERFACE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Returns the estimated number of bytes that will be reserved by allocator
+     for request of "size" bytes. If allocator can't allocate that much
+     memory, returns the maximal possible allocation size, otherwise returns
+     "size". */
+  size_t __sanitizer_get_estimated_allocated_size(size_t size);
+
+  /* Returns true if p was returned by the allocator and
+     is not yet freed. */
+  int __sanitizer_get_ownership(const volatile void *p);
+
+  /* Returns the number of bytes reserved for the pointer p.
+     Requires (get_ownership(p) == true) or (p == 0). */
+  size_t __sanitizer_get_allocated_size(const volatile void *p);
+
+  /* Number of bytes, allocated and not yet freed by the application. */
+  size_t __sanitizer_get_current_allocated_bytes(void);
+
+  /* Number of bytes, mmaped by the allocator to fulfill allocation requests.
+     Generally, for request of X bytes, allocator can reserve and add to free
+     lists a large number of chunks of size X to use them for future requests.
+     All these chunks count toward the heap size. Currently, allocator never
+     releases memory to OS (instead, it just puts freed chunks to free
+     lists). */
+  size_t __sanitizer_get_heap_size(void);
+
+  /* Number of bytes, mmaped by the allocator, which can be used to fulfill
+     allocation requests. When a user program frees memory chunk, it can first
+     fall into quarantine and will count toward __sanitizer_get_free_bytes()
+     later. */
+  size_t __sanitizer_get_free_bytes(void);
+
+  /* Number of bytes in unmapped pages, that are released to OS. Currently,
+     always returns 0. */
+  size_t __sanitizer_get_unmapped_bytes(void);
+
+  /* Malloc hooks that may be optionally provided by user.
+     __sanitizer_malloc_hook(ptr, size) is called immediately after
+       allocation of "size" bytes, which returned "ptr".
+     __sanitizer_free_hook(ptr) is called immediately before
+       deallocation of "ptr". */
+  void __sanitizer_malloc_hook(const volatile void *ptr, size_t size);
+  void __sanitizer_free_hook(const volatile void *ptr);
+
+  /* Installs a pair of hooks for malloc/free.
+     Several (currently, 5) hook pairs may be installed, they are executed
+     in the order they were installed and after calling
+     __sanitizer_malloc_hook/__sanitizer_free_hook.
+     Unlike __sanitizer_malloc_hook/__sanitizer_free_hook these hooks can be
+     chained and do not rely on weak symbols working on the platform, but
+     require __sanitizer_install_malloc_and_free_hooks to be called at startup
+     and thus will not be called on malloc/free very early in the process.
+     Returns the number of hooks currently installed or 0 on failure.
+     Not thread-safe, should be called in the main thread before starting
+     other threads.
+  */
+  int __sanitizer_install_malloc_and_free_hooks(
+      void (*malloc_hook)(const volatile void *, size_t),
+      void (*free_hook)(const volatile void *));
+
+  /* Drains allocator quarantines (calling thread's and global ones), returns
+     freed memory back to OS and releases other non-essential internal allocator
+     resources in attempt to reduce process RSS.
+     Currently available with ASan only.
+  */
+  void __sanitizer_purge_allocator(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif

+ 155 - 0
demo/include/sanitizer/asan_interface.h

@@ -0,0 +1,155 @@
+//===-- sanitizer/asan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ASAN_INTERFACE_H
+#define SANITIZER_ASAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Marks memory region [addr, addr+size) as unaddressable.
+  // This memory must be previously allocated by the user program. Accessing
+  // addresses in this region from instrumented code is forbidden until
+  // this region is unpoisoned. This function is not guaranteed to poison
+  // the whole region - it may poison only a subregion of [addr, addr+size)
+  // to ASan alignment restrictions.
+  // Method is NOT thread-safe in the sense that no two threads can
+  // (un)poison memory in the same memory region simultaneously.
+  void __asan_poison_memory_region(void const volatile *addr, size_t size);
+  // Marks memory region [addr, addr+size) as addressable.
+  // This memory must be previously allocated by the user program. Accessing
+  // addresses in this region is allowed until this region is poisoned again.
+  // This function may unpoison a superregion of [addr, addr+size) due to
+  // ASan alignment restrictions.
+  // Method is NOT thread-safe in the sense that no two threads can
+  // (un)poison memory in the same memory region simultaneously.
+  void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+
+// User code should use macros instead of functions.
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  __asan_poison_memory_region((addr), (size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  __asan_unpoison_memory_region((addr), (size))
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  ((void)(addr), (void)(size))
+#endif
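A minimal sketch of how a custom region allocator might use these macros to keep its unused tail unaddressable; the pool layout and function names are illustrative, not part of the interface.

    #include <sanitizer/asan_interface.h>

    static char pool[1024];
    static size_t pool_used;

    /* Hand out n bytes; no overflow check in this sketch. */
    void *pool_alloc(size_t n) {
      void *p = pool + pool_used;
      pool_used += n;
      ASAN_UNPOISON_MEMORY_REGION(p, n);
      /* Keep the unused tail unaddressable. */
      ASAN_POISON_MEMORY_REGION(pool + pool_used, sizeof(pool) - pool_used);
      return p;
    }

    /* Reset the pool; everything is unaddressable until reallocated. */
    void pool_reset(void) {
      pool_used = 0;
      ASAN_POISON_MEMORY_REGION(pool, sizeof(pool));
    }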
+
+  // Returns 1 if addr is poisoned (i.e. 1-byte read/write access to this
+  // address will result in error report from AddressSanitizer).
+  // Otherwise returns 0.
+  int __asan_address_is_poisoned(void const volatile *addr);
+
+  // If at least one byte in [beg, beg+size) is poisoned, return the address
+  // of the first such byte. Otherwise return 0.
+  void *__asan_region_is_poisoned(void *beg, size_t size);
+
+  // Print the description of addr (useful when debugging in gdb).
+  void __asan_describe_address(void *addr);
+
+  // Useful for calling from a debugger to get information about an ASan error.
+  // Returns 1 if an error has been (or is being) reported, otherwise returns 0.
+  int __asan_report_present(void);
+
+  // Useful for calling from a debugger to get information about an ASan error.
+  // If an error has been (or is being) reported, the following functions return
+  // the pc, bp, sp, address, access type (0 = read, 1 = write), access size and
+  // bug description (e.g. "heap-use-after-free"). Otherwise they return 0.
+  void *__asan_get_report_pc(void);
+  void *__asan_get_report_bp(void);
+  void *__asan_get_report_sp(void);
+  void *__asan_get_report_address(void);
+  int __asan_get_report_access_type(void);
+  size_t __asan_get_report_access_size(void);
+  const char *__asan_get_report_description(void);
+
+  // Useful for calling from the debugger to get information about a pointer.
+  // Returns the category of the given pointer as a constant string.
+  // Possible return values are "global", "stack", "stack-fake", "heap",
+  // "heap-invalid", "shadow-low", "shadow-gap", "shadow-high", "unknown".
+  // If global or stack, tries to also return the variable name, address and
+  // size. If heap, tries to return the chunk address and size. 'name' should
+  // point to an allocated buffer of size 'name_size'.
+  const char *__asan_locate_address(void *addr, char *name, size_t name_size,
+                                    void **region_address, size_t *region_size);
+
+  // Useful for calling from the debugger to get the allocation stack trace
+  // and thread ID for a heap address. Stores up to 'size' frames into 'trace',
+  // returns the number of stored frames or 0 on error.
+  size_t __asan_get_alloc_stack(void *addr, void **trace, size_t size,
+                                int *thread_id);
+
+  // Useful for calling from the debugger to get the free stack trace
+  // and thread ID for a heap address. Stores up to 'size' frames into 'trace',
+  // returns the number of stored frames or 0 on error.
+  size_t __asan_get_free_stack(void *addr, void **trace, size_t size,
+                               int *thread_id);
+
+  // Useful for calling from the debugger to get the current shadow memory
+  // mapping.
+  void __asan_get_shadow_mapping(size_t *shadow_scale, size_t *shadow_offset);
+
+  // This is an internal function that is called to report an error.
+  // However, it is still part of the interface because users may want to
+  // set a breakpoint on this function in a debugger.
+  void __asan_report_error(void *pc, void *bp, void *sp,
+                           void *addr, int is_write, size_t access_size);
+
+  // Deprecated. Call __sanitizer_set_death_callback instead.
+  void __asan_set_death_callback(void (*callback)(void));
+
+  void __asan_set_error_report_callback(void (*callback)(const char*));
+
+  // The user may provide a function that will be called right when ASan
+  // detects an error. This can be used to notice cases when ASan detects an
+  // error but the program crashes before the ASan report is printed.
+  void __asan_on_error(void);
+
+  // Prints accumulated stats to stderr. Used for debugging.
+  void __asan_print_accumulated_stats(void);
+
+  // This function may be optionally provided by the user and should return
+  // a string containing ASan runtime options. See asan_flags.h for details.
+  const char* __asan_default_options(void);
+
+  // The following 2 functions facilitate garbage collection in presence of
+  // asan's fake stack.
+
+  // Returns an opaque handler to be used later in __asan_addr_is_in_fake_stack.
+  // Returns NULL if the current thread does not have a fake stack.
+  void *__asan_get_current_fake_stack(void);
+
+  // If fake_stack is non-NULL and addr belongs to a fake frame in
+  // fake_stack, returns the address on real stack that corresponds to
+  // the fake frame and sets beg/end to the boundaries of this fake frame.
+  // Otherwise returns NULL and does not touch beg/end.
+  // If beg/end are NULL, they are not touched.
+  // This function may be called from a thread other than the owner of
+  // fake_stack, but the owner thread needs to be alive.
+  void *__asan_addr_is_in_fake_stack(void *fake_stack, void *addr, void **beg,
+                                     void **end);
+
+  // Performs cleanup before a [[noreturn]] function.  Must be called
+  // before things like _exit and execl to avoid false positives on stack.
+  void __asan_handle_no_return(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_ASAN_INTERFACE_H

+ 198 - 0
demo/include/sanitizer/common_interface_defs.h

@@ -0,0 +1,198 @@
+//===-- sanitizer/common_interface_defs.h -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common part of the public sanitizer interface.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_COMMON_INTERFACE_DEFS_H
+#define SANITIZER_COMMON_INTERFACE_DEFS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// GCC does not understand __has_feature.
+#if !defined(__has_feature)
+# define __has_feature(x) 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Arguments for __sanitizer_sandbox_on_notify() below.
+  typedef struct {
+    // Enable sandbox support in sanitizer coverage.
+    int coverage_sandboxed;
+    // File descriptor to write coverage data to. If -1 is passed, a file will
+    // be pre-opened by __sanitizer_sandbox_on_notify(). This field has no
+    // effect if coverage_sandboxed == 0.
+    intptr_t coverage_fd;
+    // If non-zero, split the coverage data into well-formed blocks. This is
+    // useful when coverage_fd is a socket descriptor. Each block will contain
+    // a header, allowing data from multiple processes to be sent over the same
+    // socket.
+    unsigned int coverage_max_block_size;
+  } __sanitizer_sandbox_arguments;
+
+  // Tell the tools to write their reports to "path.<pid>" instead of stderr.
+  void __sanitizer_set_report_path(const char *path);
+  // Tell the tools to write their reports to the provided file descriptor
+  // (cast to void *).
+  void __sanitizer_set_report_fd(void *fd);
+
+  // Notify the tools that the sandbox is going to be turned on. The reserved
+  // parameter will be used in the future to hold a structure with functions
+  // that the tools may call to bypass the sandbox.
+  void __sanitizer_sandbox_on_notify(__sanitizer_sandbox_arguments *args);
+
+  // This function is called by the tool when it has just finished reporting
+  // an error. 'error_summary' is a one-line string that summarizes
+  // the error message. This function can be overridden by the client.
+  void __sanitizer_report_error_summary(const char *error_summary);
+
+  // Some of the sanitizers (e.g. asan/tsan) may miss bugs that happen
+  // in unaligned loads/stores. To find such bugs reliably, one needs
+  // to replace plain unaligned loads/stores with these calls.
+  uint16_t __sanitizer_unaligned_load16(const void *p);
+  uint32_t __sanitizer_unaligned_load32(const void *p);
+  uint64_t __sanitizer_unaligned_load64(const void *p);
+  void __sanitizer_unaligned_store16(void *p, uint16_t x);
+  void __sanitizer_unaligned_store32(void *p, uint32_t x);
+  void __sanitizer_unaligned_store64(void *p, uint64_t x);
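A minimal sketch of routing a possibly-unaligned 32-bit access through these helpers instead of a raw pointer cast; the wrapper names are illustrative.

    #include <sanitizer/common_interface_defs.h>

    /* Read/write a u32 that may sit at any byte offset in a packed buffer. */
    uint32_t read_u32(const char *buf, size_t offset) {
      return __sanitizer_unaligned_load32(buf + offset);
    }

    void write_u32(char *buf, size_t offset, uint32_t value) {
      __sanitizer_unaligned_store32(buf + offset, value);
    }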
+
+  // Annotate the current state of a contiguous container, such as
+  // std::vector, std::string or similar.
+  // A contiguous container is a container that keeps all of its elements
+  // in a contiguous region of memory. The container owns the region of memory
+  // [beg, end); the memory [beg, mid) is used to store the current elements
+  // and the memory [mid, end) is reserved for future elements;
+  // beg <= mid <= end. For example, in "std::vector<> v"
+  //   beg = &v[0];
+  //   end = beg + v.capacity() * sizeof(v[0]);
+  //   mid = beg + v.size()     * sizeof(v[0]);
+  //
+  // This annotation tells the Sanitizer tool about the current state of the
+  // container so that the tool can report errors when memory from [mid, end)
+  // is accessed. Insert this annotation into methods like push_back/pop_back.
+  // Supply the old and the new values of mid (old_mid/new_mid).
+  // In the initial state mid == end and so should be the final
+  // state when the container is destroyed or when it reallocates the storage.
+  //
+  // Use with caution and don't use for anything other than vector-like classes.
+  //
+  // For AddressSanitizer, 'beg' should be 8-aligned and 'end' should
+  // be either 8-aligned or it should point to the end of a separate heap-,
+  // stack-, or global- allocated buffer. I.e. the following will not work:
+  //   int64_t x[2];  // 16 bytes, 8-aligned.
+  //   char *beg = (char *)&x[0];
+  //   char *end = beg + 12;  // Not 8 aligned, not the end of the buffer.
+  // This however will work fine:
+  //   int32_t x[3];  // 12 bytes, but 8-aligned under AddressSanitizer.
+  //   char *beg = (char*)&x[0];
+  //   char *end = beg + 12;  // Not 8-aligned, but is the end of the buffer.
+  void __sanitizer_annotate_contiguous_container(const void *beg,
+                                                 const void *end,
+                                                 const void *old_mid,
+                                                 const void *new_mid);
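A minimal sketch of annotating a vector-like byte buffer on append, assuming spare capacity is available; the struct and its fields are illustrative only.

    #include <sanitizer/common_interface_defs.h>

    struct byte_vec {
      char *beg;       /* start of owned storage             */
      size_t size;     /* bytes currently in use (mid)       */
      size_t capacity; /* bytes owned (beg + capacity = end) */
    };

    /* Append one byte; assumes v->size < v->capacity (no growth here). */
    static void byte_vec_push(struct byte_vec *v, char c) {
      const char *end = v->beg + v->capacity;
      const char *old_mid = v->beg + v->size;
      /* Unpoison the new element before writing into it. */
      __sanitizer_annotate_contiguous_container(v->beg, end, old_mid,
                                                old_mid + 1);
      v->beg[v->size++] = c;
    }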
+  // Returns true if the contiguous container [beg, end) is properly poisoned
+  // (e.g. with __sanitizer_annotate_contiguous_container), i.e. if
+  //  - [beg, mid) is addressable,
+  //  - [mid, end) is unaddressable.
+  // Full verification requires O(end-beg) time; this function tries to avoid
+  // such complexity by touching only parts of the container around beg/mid/end.
+  int __sanitizer_verify_contiguous_container(const void *beg, const void *mid,
+                                              const void *end);
+
+  // Similar to __sanitizer_verify_contiguous_container, but returns the
+  // address of the first improperly poisoned byte. Returns null if the area
+  // is poisoned properly.
+  const void *__sanitizer_contiguous_container_find_bad_address(
+      const void *beg, const void *mid, const void *end);
+
+  // Print the stack trace leading to this call. Useful for debugging user code.
+  void __sanitizer_print_stack_trace(void);
+
+  // Symbolizes the supplied 'pc' using the format string 'fmt'.
+  // Outputs at most 'out_buf_size' bytes into 'out_buf'.
+  // The format syntax is described in
+  // lib/sanitizer_common/sanitizer_stacktrace_printer.h.
+  void __sanitizer_symbolize_pc(void *pc, const char *fmt, char *out_buf,
+                                size_t out_buf_size);
+  // Same as __sanitizer_symbolize_pc, but for data section (i.e. globals).
+  void __sanitizer_symbolize_global(void *data_ptr, const char *fmt,
+                                    char *out_buf, size_t out_buf_size);
+
+  // Sets the callback to be called right before death on error.
+  // Passing 0 will unset the callback.
+  void __sanitizer_set_death_callback(void (*callback)(void));
+
+  // Interceptor hooks.
+  // Whenever a libc function interceptor is called, it checks whether the
+  // corresponding weak hook is defined and, if so, calls it.
+  // The primary use case is data-flow-guided fuzzing, where the fuzzer needs
+  // to know what is being passed to libc functions, e.g. memcmp.
+  // FIXME: implement more hooks.
+  void __sanitizer_weak_hook_memcmp(void *called_pc, const void *s1,
+                                    const void *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strncmp(void *called_pc, const char *s1,
+                                    const char *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1,
+                                         const char *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strcmp(void *called_pc, const char *s1,
+                                    const char *s2, int result);
+  void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1,
+                                        const char *s2, int result);
+  void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1,
+                                    const char *s2, char *result);
+  void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1,
+                                        const char *s2, char *result);
+  void __sanitizer_weak_hook_memmem(void *called_pc,
+                                    const void *s1, size_t len1,
+                                    const void *s2, size_t len2, void *result);
+
+  // Prints stack traces for all live heap allocations ordered by total
+  // allocation size until `top_percent` of total live heap is shown.
+  // `top_percent` should be between 1 and 100.
+  // At most `max_number_of_contexts` contexts (stack traces) are printed.
+  // Experimental feature currently available only with asan on Linux/x86_64.
+  void __sanitizer_print_memory_profile(size_t top_percent,
+                                        size_t max_number_of_contexts);
+
+  // Fiber annotation interface.
+  // Before switching to a different stack, one must call
+  // __sanitizer_start_switch_fiber with a pointer to the bottom of the
+  // destination stack and its size. When code starts running on the new stack,
+  // it must call __sanitizer_finish_switch_fiber to finalize the switch.
+  // The start_switch function takes a void** to store the current fake stack if
+  // there is one (it is needed when detect_stack_use_after_return is enabled).
+  // When restoring a stack, this pointer must be given to the finish_switch
+  // function. In most cases, this void* can be stored on the stack just before
+  // switching. When leaving a fiber permanently, null must be passed as the
+  // first argument to the start_switch function so that the fake stack is
+  // destroyed.
+  // If you do not want support for stack use-after-return detection, you can
+  // always pass null to these two functions.
+  // Note that the fake stack mechanism is disabled during fiber switch, so if a
+  // signal callback runs during the switch, it will not benefit from the stack
+  // use-after-return detection.
+  void __sanitizer_start_switch_fiber(void **fake_stack_save,
+                                      const void *bottom, size_t size);
+  void __sanitizer_finish_switch_fiber(void *fake_stack_save,
+                                       const void **bottom_old,
+                                       size_t *size_old);
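A minimal sketch of annotating a ucontext-based fiber switch; error handling is omitted and the stack size is arbitrary.

    #include <sanitizer/common_interface_defs.h>
    #include <ucontext.h>

    static ucontext_t main_ctx, fiber_ctx;
    static char fiber_stack[64 * 1024];
    static void *main_fake_stack;

    static void fiber_entry(void) {
      const void *bottom_old;
      size_t size_old;
      /* First thing on the new stack: finalize the switch. */
      __sanitizer_finish_switch_fiber(NULL, &bottom_old, &size_old);
      /* ... fiber work ... */
      /* Leave this fiber permanently: NULL destroys its fake stack. */
      __sanitizer_start_switch_fiber(NULL, bottom_old, size_old);
      swapcontext(&fiber_ctx, &main_ctx);
    }

    int main(void) {
      getcontext(&fiber_ctx);
      fiber_ctx.uc_stack.ss_sp = fiber_stack;
      fiber_ctx.uc_stack.ss_size = sizeof(fiber_stack);
      fiber_ctx.uc_link = &main_ctx;
      makecontext(&fiber_ctx, fiber_entry, 0);

      /* Announce the destination stack before switching to it. */
      __sanitizer_start_switch_fiber(&main_fake_stack, fiber_stack,
                                     sizeof(fiber_stack));
      swapcontext(&main_ctx, &fiber_ctx);
      /* Back on the original stack: finish the switch. */
      __sanitizer_finish_switch_fiber(main_fake_stack, NULL, NULL);
      return 0;
    }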
+
+  // Get full module name and calculate pc offset within it.
+  // Returns 1 if pc belongs to some module, 0 if module was not found.
+  int __sanitizer_get_module_and_offset_for_pc(void *pc, char *module_path,
+                                               size_t module_path_len,
+                                               void **pc_offset);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_COMMON_INTERFACE_DEFS_H

+ 36 - 0
demo/include/sanitizer/coverage_interface.h

@@ -0,0 +1,36 @@
+//===-- sanitizer/coverage_interface.h --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface for sanitizer coverage.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_COVERAGE_INTERFACE_H
+#define SANITIZER_COVERAGE_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  // Record and dump coverage info.
+  void __sanitizer_cov_dump(void);
+
+  // Clear collected coverage info.
+  void __sanitizer_cov_reset(void);
+
+  // Dump collected coverage info. Sorts pcs by module into individual .sancov
+  // files.
+  void __sanitizer_dump_coverage(const uintptr_t *pcs, uintptr_t len);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_COVERAGE_INTERFACE_H

+ 116 - 0
demo/include/sanitizer/dfsan_interface.h

@@ -0,0 +1,116 @@
+//===-- dfsan_interface.h -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef DFSAN_INTERFACE_H
+#define DFSAN_INTERFACE_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t dfsan_label;
+
+/// Stores information associated with a specific label identifier.  A label
+/// may be a base label created using dfsan_create_label, with associated
+/// text description and user data, or an automatically created union label,
+/// which represents the union of two label identifiers (which may themselves
+/// be base or union labels).
+struct dfsan_label_info {
+  // Fields for union labels, set to 0 for base labels.
+  dfsan_label l1;
+  dfsan_label l2;
+
+  // Fields for base labels.
+  const char *desc;
+  void *userdata;
+};
+
+/// Signature of the callback argument to dfsan_set_write_callback().
+typedef void (*dfsan_write_callback_t)(int fd, const void *buf, size_t count);
+
+/// Computes the union of \c l1 and \c l2, possibly creating a union label in
+/// the process.
+dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2);
+
+/// Creates and returns a base label with the given description and user data.
+dfsan_label dfsan_create_label(const char *desc, void *userdata);
+
+/// Sets the label for each address in [addr,addr+size) to \c label.
+void dfsan_set_label(dfsan_label label, void *addr, size_t size);
+
+/// Sets the label for each address in [addr,addr+size) to the union of the
+/// current label for that address and \c label.
+void dfsan_add_label(dfsan_label label, void *addr, size_t size);
+
+/// Retrieves the label associated with the given data.
+///
+/// The type of 'data' is arbitrary.  The function accepts a value of any type,
+/// which can be truncated or extended (implicitly or explicitly) as necessary.
+/// The truncation/extension operations will preserve the label of the original
+/// value.
+dfsan_label dfsan_get_label(long data);
+
+/// Retrieves the label associated with the data at the given address.
+dfsan_label dfsan_read_label(const void *addr, size_t size);
+
+/// Retrieves a pointer to the dfsan_label_info struct for the given label.
+const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label);
+
+/// Returns whether the given label \c label contains the label \c elem.
+int dfsan_has_label(dfsan_label label, dfsan_label elem);
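A minimal sketch of tagging an input and checking that the label propagates through a computation; this only does anything when the program is built with -fsanitize=dataflow.

    #include <sanitizer/dfsan_interface.h>
    #include <assert.h>

    int main(void) {
      int x = 1, y = 2;
      dfsan_label lx = dfsan_create_label("x", NULL);
      dfsan_set_label(lx, &x, sizeof(x));
      int z = x + y;  /* the label propagates through the addition */
      assert(dfsan_has_label(dfsan_get_label(z), lx));
      return 0;
    }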
+
+/// If the given label \c label contains a label with the description \c desc,
+/// returns that label; otherwise returns 0.
+dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc);
+
+/// Returns the number of labels allocated.
+size_t dfsan_get_label_count(void);
+
+/// Sets a callback to be invoked on calls to write().  The callback is invoked
+/// before the write is done.  The write is not guaranteed to succeed when the
+/// callback executes.  Pass in NULL to remove any callback.
+void dfsan_set_write_callback(dfsan_write_callback_t labeled_write_callback);
+
+/// Writes the labels currently used by the program to the given file
+/// descriptor. The lines of the output have the following format:
+///
+/// <label> <parent label 1> <parent label 2> <label description if any>
+void dfsan_dump_labels(int fd);
+
+/// Interceptor hooks.
+/// Whenever one of dfsan's custom functions is called, the corresponding
+/// hook is called if it is non-zero. The hooks should be defined by the user.
+/// The primary use case is taint-guided fuzzing, where the fuzzer
+/// needs to see the parameters of the function and the labels.
+/// FIXME: implement more hooks.
+void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2,
+                            size_t n, dfsan_label s1_label,
+                            dfsan_label s2_label, dfsan_label n_label);
+void dfsan_weak_hook_strncmp(void *caller_pc, const char *s1, const char *s2,
+                             size_t n, dfsan_label s1_label,
+                             dfsan_label s2_label, dfsan_label n_label);
+#ifdef __cplusplus
+}  // extern "C"
+
+template <typename T>
+void dfsan_set_label(dfsan_label label, T &data) {  // NOLINT
+  dfsan_set_label(label, (void *)&data, sizeof(T));
+}
+
+#endif
+
+#endif  // DFSAN_INTERFACE_H

+ 50 - 0
demo/include/sanitizer/esan_interface.h

@@ -0,0 +1,50 @@
+//===-- sanitizer/esan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ESAN_INTERFACE_H
+#define SANITIZER_ESAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+// We declare our interface routines as weak to allow the user to avoid
+// ifdefs and instead use this pattern, which allows building the same
+// sources with and without our runtime library:
+//     if (__esan_report)
+//       __esan_report();
+#ifdef _MSC_VER
+/* selectany is as close to weak as we'll get. */
+#define COMPILER_RT_WEAK __declspec(selectany)
+#elif __GNUC__
+#define COMPILER_RT_WEAK __attribute__((weak))
+#else
+#define COMPILER_RT_WEAK
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This function can be called mid-run (or at the end of a run for
+// a server process that doesn't shut down normally) to request that
+// data for that point in the run be reported from the tool.
+void COMPILER_RT_WEAK __esan_report(void);
+
+// This function returns the number of samples that the esan tool has collected
+// to this point.  This is useful for testing.
+unsigned int COMPILER_RT_WEAK __esan_get_sample_count(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // SANITIZER_ESAN_INTERFACE_H

+ 33 - 0
demo/include/sanitizer/hwasan_interface.h

@@ -0,0 +1,33 @@
+//===-- sanitizer/hwasan_interface.h ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of HWAddressSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_HWASAN_INTERFACE_H
+#define SANITIZER_HWASAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // This function may be optionally provided by the user and should return
+  // a string containing HWASan runtime options. See asan_flags.h for details.
+  const char* __hwasan_default_options(void);
+
+  void __hwasan_enable_allocator_tagging(void);
+  void __hwasan_disable_allocator_tagging(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_HWASAN_INTERFACE_H

File diff is too large to display
+ 3083 - 0
demo/include/sanitizer/linux_syscall_hooks.h


+ 90 - 0
demo/include/sanitizer/lsan_interface.h

@@ -0,0 +1,90 @@
+//===-- sanitizer/lsan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of LeakSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_LSAN_INTERFACE_H
+#define SANITIZER_LSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Allocations made between calls to __lsan_disable() and __lsan_enable() will
+  // be treated as non-leaks. Disable/enable pairs may be nested.
+  void __lsan_disable(void);
+  void __lsan_enable(void);
+
+  // The heap object into which p points will be treated as a non-leak.
+  void __lsan_ignore_object(const void *p);
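A minimal sketch of excluding an intentionally never-freed singleton from the leak report; the helper name is illustrative.

    #include <sanitizer/lsan_interface.h>
    #include <stdlib.h>

    static void *make_singleton(void) {
      void *p = malloc(128);    /* never freed on purpose */
      __lsan_ignore_object(p);  /* do not report it as a leak */
      return p;
    }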
+
+  // Memory regions registered through this interface will be treated as sources
+  // of live pointers during leak checking. Useful if you store pointers in
+  // mapped memory.
+  // Points of note:
+  // - __lsan_unregister_root_region() must be called with the same pointer and
+  // size that have earlier been passed to __lsan_register_root_region()
+  // - LSan will skip any inaccessible memory when scanning a root region. E.g.,
+  // if you map memory within a larger region that you have mprotect'ed, you can
+  // register the entire large region.
+  // - the implementation is not optimized for performance. This interface is
+  // intended to be used for a small number of relatively static regions.
+  void __lsan_register_root_region(const void *p, size_t size);
+  void __lsan_unregister_root_region(const void *p, size_t size);
+
+  // Check for leaks now. This function behaves identically to the default
+  // end-of-process leak check. In particular, it will terminate the process if
+  // leaks are found and the exitcode runtime flag is non-zero.
+  // Subsequent calls to this function will have no effect and end-of-process
+  // leak check will not run. Effectively, end-of-process leak check is moved to
+  // the time of first invocation of this function.
+  // By calling this function early during process shutdown, you can instruct
+  // LSan to ignore shutdown-only leaks which happen later on.
+  void __lsan_do_leak_check(void);
+
+  // Check for leaks now. Returns zero if no leaks have been found or if leak
+  // detection is disabled, non-zero otherwise.
+  // This function may be called repeatedly, e.g. to periodically check a
+  // long-running process. It prints a leak report if appropriate, but does not
+  // terminate the process. It does not affect the behavior of
+  // __lsan_do_leak_check() or the end-of-process leak check, and is not
+  // affected by them.
+  int __lsan_do_recoverable_leak_check(void);
+
+  // The user may optionally provide this function to disallow leak checking
+  // for the program it is linked into (if the return value is non-zero). This
+  // function must be defined as returning a constant value; any behavior beyond
+  // that is unsupported.
+  // To avoid dead stripping, you may need to define this function with
+  // __attribute__((used))
+  int __lsan_is_turned_off(void);
+
+  // This function may be optionally provided by user and should return
+  // a string containing LSan runtime options. See lsan_flags.inc for details.
+  const char *__lsan_default_options(void);
+
+  // This function may be optionally provided by the user and should return
+  // a string containing LSan suppressions.
+  const char *__lsan_default_suppressions(void);
+#ifdef __cplusplus
+}  // extern "C"
+
+namespace __lsan {
+class ScopedDisabler {
+ public:
+  ScopedDisabler() { __lsan_disable(); }
+  ~ScopedDisabler() { __lsan_enable(); }
+};
+}  // namespace __lsan
+#endif
+
+#endif  // SANITIZER_LSAN_INTERFACE_H

+ 111 - 0
demo/include/sanitizer/msan_interface.h

@@ -0,0 +1,111 @@
+//===-- msan_interface.h --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of MemorySanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef MSAN_INTERFACE_H
+#define MSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Set raw origin for the memory range. */
+  void __msan_set_origin(const volatile void *a, size_t size, uint32_t origin);
+
+  /* Get raw origin for an address. */
+  uint32_t __msan_get_origin(const volatile void *a);
+
+  /* Test that this_id is a descendant of prev_id (or they are simply equal).
+   * "descendant" here means they are part of the same chain, created with
+   * __msan_chain_origin. */
+  int __msan_origin_is_descendant_or_same(uint32_t this_id, uint32_t prev_id);
+
+  /* Returns non-zero if tracking origins. */
+  int __msan_get_track_origins(void);
+
+  /* Returns the origin id of the latest UMR in the calling thread. */
+  uint32_t __msan_get_umr_origin(void);
+
+  /* Make memory region fully initialized (without changing its contents). */
+  void __msan_unpoison(const volatile void *a, size_t size);
+
+  /* Make a null-terminated string fully initialized (without changing its
+     contents). */
+  void __msan_unpoison_string(const volatile char *a);
+
+  /* Make memory region fully uninitialized (without changing its contents).
+     This is a legacy interface that does not update origin information. Use
+     __msan_allocated_memory() instead. */
+  void __msan_poison(const volatile void *a, size_t size);
+
+  /* Make memory region partially uninitialized (without changing its contents).
+   */
+  void __msan_partial_poison(const volatile void *data, void *shadow,
+                             size_t size);
+
+  /* Returns the offset of the first (at least partially) poisoned byte in the
+     memory range, or -1 if the whole range is good. */
+  intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+
+  /* Checks that memory range is fully initialized, and reports an error if it
+   * is not. */
+  void __msan_check_mem_is_initialized(const volatile void *x, size_t size);
+
+  /* For testing:
+     __msan_set_expect_umr(1);
+     ... some buggy code ...
+     __msan_set_expect_umr(0);
+     The last line will verify that a UMR happened. */
+  void __msan_set_expect_umr(int expect_umr);
+
+  /* Change the value of the keep_going flag. A non-zero value means don't
+     terminate program execution when an error is detected. This will not
+     affect errors in modules that were compiled without the corresponding
+     compiler flag. */
+  void __msan_set_keep_going(int keep_going);
+
+  /* Print shadow and origin for the memory range to stderr in a human-readable
+     format. */
+  void __msan_print_shadow(const volatile void *x, size_t size);
+
+  /* Print shadow for the memory range to stderr in a minimalistic
+     human-readable format. */
+  void __msan_dump_shadow(const volatile void *x, size_t size);
+
+  /* Returns true if running under a dynamic tool (DynamoRio-based). */
+  int  __msan_has_dynamic_component(void);
+
+  /* Tell MSan about newly allocated memory (e.g. from a custom allocator).
+     Memory will be marked uninitialized, with origin at the call site. */
+  void __msan_allocated_memory(const volatile void* data, size_t size);
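A minimal sketch of a bump allocator telling MSan that freshly handed-out memory is uninitialized; the arena and function names are illustrative.

    #include <sanitizer/msan_interface.h>

    static char arena[4096];
    static size_t arena_used;

    /* Hand out n bytes; no overflow check in this sketch. */
    void *arena_alloc(size_t n) {
      void *p = arena + arena_used;
      arena_used += n;
      /* Reads from *p before a write should now be reported as UMRs. */
      __msan_allocated_memory(p, n);
      return p;
    }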
+
+  /* Tell MSan about newly destroyed memory. Mark memory as uninitialized. */
+  void __sanitizer_dtor_callback(const volatile void* data, size_t size);
+
+  /* This function may be optionally provided by the user and should return
+     a string containing MSan runtime options. See msan_flags.h for details. */
+  const char* __msan_default_options(void);
+
+  /* Deprecated. Call __sanitizer_set_death_callback instead. */
+  void __msan_set_death_callback(void (*callback)(void));
+
+  /* Update shadow for the application copy of size bytes from src to dst.
+     Src and dst are application addresses. This function does not copy the
+     actual application memory; it only updates shadow and origin for such a
+     copy. Source and destination regions can overlap. */
+  void __msan_copy_shadow(const volatile void *dst, const volatile void *src,
+                          size_t size);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif

+ 34 - 0
demo/include/sanitizer/scudo_interface.h

@@ -0,0 +1,34 @@
+//===-- sanitizer/scudo_interface.h -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Public Scudo interface header.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_SCUDO_INTERFACE_H_
+#define SANITIZER_SCUDO_INTERFACE_H_
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // This function may be optionally provided by a user and should return
+  // a string containing Scudo runtime options. See scudo_flags.h for details.
+  const char* __scudo_default_options(void);
+
+  // This function allows setting the RSS limit at runtime. This can be either
+  // the hard limit (HardLimit=1) or the soft limit (HardLimit=0). The limit
+  // can be removed by setting LimitMb to 0. This function's parameters should
+  // be fully trusted to avoid security mishaps.
+  void __scudo_set_rss_limit(unsigned long LimitMb, int HardLimit);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_SCUDO_INTERFACE_H_

+ 144 - 0
demo/include/sanitizer/tsan_interface.h

@@ -0,0 +1,144 @@
+//===-- tsan_interface.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Public interface header for TSan.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_TSAN_INTERFACE_H
+#define SANITIZER_TSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// __tsan_acquire establishes a happens-before relation with a preceding
+// __tsan_release on the same address.
+void __tsan_acquire(void *addr);
+void __tsan_release(void *addr);
+
+// Annotations for custom mutexes.
+// The annotations allow TSan to produce better reports (with sets of locked
+// mutexes), detect more types of bugs (e.g. mutex misuses, races between
+// lock/unlock and destruction, and potential deadlocks), and improve
+// precision and performance
+// (by ignoring individual atomic operations in mutex code). However, the
+// downside is that annotated mutex code itself is not checked for correctness.
+
+// Mutex creation flags are passed to __tsan_mutex_create annotation.
+// If the mutex has no constructor and __tsan_mutex_create is not called,
+// the flags may be passed to __tsan_mutex_pre_lock/__tsan_mutex_post_lock
+// annotations.
+
+// The mutex has static storage duration and a no-op constructor and
+// destructor. This effectively makes TSan ignore the destroy annotation.
+const unsigned __tsan_mutex_linker_init      = 1 << 0;
+// Mutex is write reentrant.
+const unsigned __tsan_mutex_write_reentrant  = 1 << 1;
+// Mutex is read reentrant.
+const unsigned __tsan_mutex_read_reentrant   = 1 << 2;
+// Mutex does not have static storage duration, and must not be used after
+// its destructor runs.  The opposite of __tsan_mutex_linker_init.
+// If this flag is passed to __tsan_mutex_destroy, then the destruction
+// is ignored unless this flag was previously set on the mutex.
+const unsigned __tsan_mutex_not_static       = 1 << 8;
+
+// Mutex operation flags:
+
+// Denotes read lock operation.
+const unsigned __tsan_mutex_read_lock        = 1 << 3;
+// Denotes try lock operation.
+const unsigned __tsan_mutex_try_lock         = 1 << 4;
+// Denotes that a try lock operation has failed to acquire the mutex.
+const unsigned __tsan_mutex_try_lock_failed  = 1 << 5;
+// Denotes that the lock operation acquires multiple recursion levels.
+// The number of levels is passed in the recursion parameter.
+// This is useful for annotation of e.g. Java builtin monitors,
+// for which wait operation releases all recursive acquisitions of the mutex.
+const unsigned __tsan_mutex_recursive_lock   = 1 << 6;
+// Denotes that the unlock operation releases all recursion levels.
+// The number of released levels is returned and must later be passed to
+// the corresponding __tsan_mutex_post_lock annotation.
+const unsigned __tsan_mutex_recursive_unlock = 1 << 7;
+
+// Annotate creation of a mutex.
+// Supported flags: mutex creation flags.
+void __tsan_mutex_create(void *addr, unsigned flags);
+
+// Annotate destruction of a mutex.
+// Supported flags:
+//   - __tsan_mutex_linker_init
+//   - __tsan_mutex_not_static
+void __tsan_mutex_destroy(void *addr, unsigned flags);
+
+// Annotate start of lock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock
+//   - __tsan_mutex_try_lock
+//   - all mutex creation flags
+void __tsan_mutex_pre_lock(void *addr, unsigned flags);
+
+// Annotate end of lock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock (must match __tsan_mutex_pre_lock)
+//   - __tsan_mutex_try_lock (must match __tsan_mutex_pre_lock)
+//   - __tsan_mutex_try_lock_failed
+//   - __tsan_mutex_recursive_lock
+//   - all mutex creation flags
+void __tsan_mutex_post_lock(void *addr, unsigned flags, int recursion);
+
+// Annotate start of unlock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock
+//   - __tsan_mutex_recursive_unlock
+int __tsan_mutex_pre_unlock(void *addr, unsigned flags);
+
+// Annotate end of unlock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock (must match __tsan_mutex_pre_unlock)
+void __tsan_mutex_post_unlock(void *addr, unsigned flags);
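A minimal sketch of annotating a hand-rolled spinlock (built on GCC/Clang __atomic builtins) so TSan treats it as a mutex; the spinlock type and function names are illustrative.

    #include <sanitizer/tsan_interface.h>

    typedef struct { volatile int held; } spinlock_t;

    void spin_init(spinlock_t *l) {
      l->held = 0;
      __tsan_mutex_create(l, 0);
    }

    void spin_lock(spinlock_t *l) {
      __tsan_mutex_pre_lock(l, 0);
      while (__atomic_exchange_n(&l->held, 1, __ATOMIC_ACQUIRE))
        ;  /* spin */
      __tsan_mutex_post_lock(l, 0, 0);
    }

    void spin_unlock(spinlock_t *l) {
      __tsan_mutex_pre_unlock(l, 0);
      __atomic_store_n(&l->held, 0, __ATOMIC_RELEASE);
      __tsan_mutex_post_unlock(l, 0);
    }

    void spin_destroy(spinlock_t *l) {
      __tsan_mutex_destroy(l, 0);
    }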
+
+// Annotate start/end of notify/signal/broadcast operation.
+// Supported flags: none.
+void __tsan_mutex_pre_signal(void *addr, unsigned flags);
+void __tsan_mutex_post_signal(void *addr, unsigned flags);
+
+// Annotate start/end of a region of code where a lock/unlock/signal operation
+// diverts to do something else unrelated to the mutex. This can be used to
+// annotate, for example, calls into cooperative scheduler or contention
+// profiling code.
+// These annotations must be called only from within
+// __tsan_mutex_pre/post_lock, __tsan_mutex_pre/post_unlock,
+// __tsan_mutex_pre/post_signal regions.
+// Supported flags: none.
+void __tsan_mutex_pre_divert(void *addr, unsigned flags);
+void __tsan_mutex_post_divert(void *addr, unsigned flags);
+
+// External race detection API.
+// Can be used by non-instrumented libraries to detect when their objects are
+// being used in an unsafe manner.
+//   - __tsan_external_read/__tsan_external_write annotates the logical reads
+//       and writes of the object at the specified address. 'caller_pc' should
+//       be the PC of the library user, which the library can obtain with e.g.
+//       `__builtin_return_address(0)`.
+//   - __tsan_external_register_tag registers a 'tag' with the specified name,
+//       which is later used in read/write annotations to denote the object type
+//   - __tsan_external_assign_tag can optionally mark a heap object with a tag
+void *__tsan_external_register_tag(const char *object_type);
+void __tsan_external_register_header(void *tag, const char *header);
+void __tsan_external_assign_tag(void *addr, void *tag);
+void __tsan_external_read(void *addr, void *caller_pc, void *tag);
+void __tsan_external_write(void *addr, void *caller_pc, void *tag);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_TSAN_INTERFACE_H

+ 222 - 0
demo/include/sanitizer/tsan_interface_atomic.h

@@ -0,0 +1,222 @@
+//===-- tsan_interface_atomic.h ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Public interface header for TSan atomics.
+//===----------------------------------------------------------------------===//
+#ifndef TSAN_INTERFACE_ATOMIC_H
+#define TSAN_INTERFACE_ATOMIC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef char     __tsan_atomic8;
+typedef short    __tsan_atomic16;  // NOLINT
+typedef int      __tsan_atomic32;
+typedef long     __tsan_atomic64;  // NOLINT
+#if defined(__SIZEOF_INT128__) \
+    || (__clang_major__ * 100 + __clang_minor__ >= 302)
+__extension__ typedef __int128 __tsan_atomic128;
+# define __TSAN_HAS_INT128 1
+#else
+# define __TSAN_HAS_INT128 0
+#endif
+
+// Part of ABI, do not change.
+// http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/atomic?view=markup
+typedef enum {
+  __tsan_memory_order_relaxed,
+  __tsan_memory_order_consume,
+  __tsan_memory_order_acquire,
+  __tsan_memory_order_release,
+  __tsan_memory_order_acq_rel,
+  __tsan_memory_order_seq_cst
+} __tsan_memory_order;
+
+__tsan_atomic8 __tsan_atomic8_load(const volatile __tsan_atomic8 *a,
+    __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_load(const volatile __tsan_atomic16 *a,
+    __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_load(const volatile __tsan_atomic32 *a,
+    __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_load(const volatile __tsan_atomic64 *a,
+    __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_load(const volatile __tsan_atomic128 *a,
+    __tsan_memory_order mo);
+#endif
+
+void __tsan_atomic8_store(volatile __tsan_atomic8 *a, __tsan_atomic8 v,
+    __tsan_memory_order mo);
+void __tsan_atomic16_store(volatile __tsan_atomic16 *a, __tsan_atomic16 v,
+    __tsan_memory_order mo);
+void __tsan_atomic32_store(volatile __tsan_atomic32 *a, __tsan_atomic32 v,
+    __tsan_memory_order mo);
+void __tsan_atomic64_store(volatile __tsan_atomic64 *a, __tsan_atomic64 v,
+    __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+void __tsan_atomic128_store(volatile __tsan_atomic128 *a, __tsan_atomic128 v,
+    __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_exchange(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_exchange(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_exchange(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_exchange(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_exchange(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_add(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_add(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_add(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_add(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_add(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_sub(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_sub(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_sub(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_sub(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_sub(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_and(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_and(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_and(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_and(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_and(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_or(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_or(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_or(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_or(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_or(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_xor(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_xor(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_xor(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_xor(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_xor(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_nand(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_nand(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_nand(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_nand(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_nand(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+int __tsan_atomic8_compare_exchange_weak(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 *c, __tsan_atomic8 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic16_compare_exchange_weak(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 *c, __tsan_atomic16 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic32_compare_exchange_weak(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 *c, __tsan_atomic32 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic64_compare_exchange_weak(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 *c, __tsan_atomic64 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+int __tsan_atomic128_compare_exchange_weak(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 *c, __tsan_atomic128 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#endif
+
+int __tsan_atomic8_compare_exchange_strong(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 *c, __tsan_atomic8 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic16_compare_exchange_strong(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 *c, __tsan_atomic16 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic32_compare_exchange_strong(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 *c, __tsan_atomic32 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic64_compare_exchange_strong(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 *c, __tsan_atomic64 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+int __tsan_atomic128_compare_exchange_strong(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 *c, __tsan_atomic128 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_compare_exchange_val(
+    volatile __tsan_atomic8 *a, __tsan_atomic8 c, __tsan_atomic8 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic16 __tsan_atomic16_compare_exchange_val(
+    volatile __tsan_atomic16 *a, __tsan_atomic16 c, __tsan_atomic16 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic32 __tsan_atomic32_compare_exchange_val(
+    volatile __tsan_atomic32 *a, __tsan_atomic32 c, __tsan_atomic32 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic64 __tsan_atomic64_compare_exchange_val(
+    volatile __tsan_atomic64 *a, __tsan_atomic64 c, __tsan_atomic64 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_compare_exchange_val(
+    volatile __tsan_atomic128 *a, __tsan_atomic128 c, __tsan_atomic128 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+#endif
+
+void __tsan_atomic_thread_fence(__tsan_memory_order mo);
+void __tsan_atomic_signal_fence(__tsan_memory_order mo);
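A minimal sketch of a TSan-visible atomic counter using the calls above. These symbols are provided by the TSan runtime, so this only links when built with -fsanitize=thread; normally the compiler emits such calls for C/C++ atomics itself.

    #include <sanitizer/tsan_interface_atomic.h>

    static volatile __tsan_atomic32 counter;

    /* Atomically increment and return the new value. */
    int bump(void) {
      return __tsan_atomic32_fetch_add(&counter, 1,
                                       __tsan_memory_order_seq_cst) + 1;
    }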
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TSAN_INTERFACE_ATOMIC_H

+ 75 - 0
demo/include/shaintrin.h

@@ -0,0 +1,75 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
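A minimal sketch of calling one of these intrinsics; the translation unit must be compiled with SHA support enabled (e.g. -msha on x86-64), and this is a single schedule step, not a complete SHA-1 implementation.

    #include <immintrin.h>

    /* One SHA-1 message-schedule step: combine message words w0..w3 (in w0)
       with w4..w7 (in w1). */
    __m128i sha1_schedule_step(__m128i w0, __m128i w1) {
      return _mm_sha1msg1_epu32(w0, w1);
    }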

File diff is too large to display
+ 2465 - 0
demo/include/smmintrin.h


+ 35 - 0
demo/include/stdalign.h

@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
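A minimal sketch of the C11 spellings this header provides:

    #include <stdalign.h>
    #include <stdio.h>

    int main(void) {
      alignas(32) char buf[64];  /* 32-byte aligned local buffer */
      printf("alignof(double) = %zu, buf at %p\n",
             alignof(double), (void *)buf);
      return 0;
    }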

+ 51 - 0
demo/include/stdarg.h

@@ -0,0 +1,51 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+typedef __builtin_va_list __gnuc_va_list;
+#endif
+
+#endif /* __STDARG_H */
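A minimal sketch of the classic variadic-function pattern these macros support:

    #include <stdarg.h>
    #include <stdio.h>

    /* Sum `count` ints passed as variadic arguments. */
    static int sum_ints(int count, ...) {
      va_list ap;
      int total = 0;
      va_start(ap, count);
      for (int i = 0; i < count; ++i)
        total += va_arg(ap, int);
      va_end(ap);
      return total;
    }

    int main(void) {
      printf("%d\n", sum_ints(3, 1, 2, 3));  /* prints 6 */
      return 0;
    }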

+ 0 - 0
demo/include/stdatomic.h


Some files are not shown because too many files changed in this diff