
Add clang include

Gogs, 4 years ago
Parent
Current commit
196b5b52c3
100 files changed, with 165155 insertions and 0 deletions
  1. demo/include/__clang_cuda_builtin_vars.h (+126, -0)
  2. demo/include/__clang_cuda_cmath.h (+472, -0)
  3. demo/include/__clang_cuda_complex_builtins.h (+203, -0)
  4. demo/include/__clang_cuda_intrinsics.h (+489, -0)
  5. demo/include/__clang_cuda_math_forward_declares.h (+286, -0)
  6. demo/include/__clang_cuda_runtime_wrapper.h (+381, -0)
  7. demo/include/__stddef_max_align_t.h (+43, -0)
  8. demo/include/__wmmintrin_aes.h (+151, -0)
  9. demo/include/__wmmintrin_pclmul.h (+57, -0)
  10. demo/include/adxintrin.h (+86, -0)
  11. demo/include/altivec.h (+16736, -0)
  12. demo/include/ammintrin.h (+193, -0)
  13. demo/include/arm64intr.h (+49, -0)
  14. demo/include/arm_acle.h (+626, -0)
  15. demo/include/arm_neon.h (+72599, -0)
  16. demo/include/armintr.h (+45, -0)
  17. demo/include/avx2intrin.h (+1308, -0)
  18. demo/include/avx512bitalgintrin.h (+97, -0)
  19. demo/include/avx512bwintrin.h (+2140, -0)
  20. demo/include/avx512cdintrin.h (+145, -0)
  21. demo/include/avx512dqintrin.h (+1331, -0)
  22. demo/include/avx512erintrin.h (+285, -0)
  23. demo/include/avx512fintrin.h (+10233, -0)
  24. demo/include/avx512ifmaintrin.h (+92, -0)
  25. demo/include/avx512ifmavlintrin.h (+149, -0)
  26. demo/include/avx512pfintrin.h (+111, -0)
  27. demo/include/avx512vbmi2intrin.h (+391, -0)
  28. demo/include/avx512vbmiintrin.h (+137, -0)
  29. demo/include/avx512vbmivlintrin.h (+247, -0)
  30. demo/include/avx512vlbitalgintrin.h (+157, -0)
  31. demo/include/avx512vlbwintrin.h (+2781, -0)
  32. demo/include/avx512vlcdintrin.h (+263, -0)
  33. demo/include/avx512vldqintrin.h (+1198, -0)
  34. demo/include/avx512vlintrin.h (+8547, -0)
  35. demo/include/avx512vlvbmi2intrin.h (+748, -0)
  36. demo/include/avx512vlvnniintrin.h (+254, -0)
  37. demo/include/avx512vnniintrin.h (+146, -0)
  38. demo/include/avx512vpopcntdqintrin.h (+70, -0)
  39. demo/include/avx512vpopcntdqvlintrin.h (+99, -0)
  40. demo/include/avxintrin.h (+5162, -0)
  41. demo/include/bmi2intrin.h (+95, -0)
  42. demo/include/bmiintrin.h (+382, -0)
  43. demo/include/cetintrin.h (+93, -0)
  44. demo/include/clflushoptintrin.h (+41, -0)
  45. demo/include/clwbintrin.h (+52, -0)
  46. demo/include/clzerointrin.h (+50, -0)
  47. demo/include/cpuid.h (+302, -0)
  48. demo/include/cuda_wrappers/algorithm (+96, -0)
  49. demo/include/cuda_wrappers/complex (+82, -0)
  50. demo/include/cuda_wrappers/new (+96, -0)
  51. demo/include/emmintrin.h (+4951, -0)
  52. demo/include/f16cintrin.h (+124, -0)
  53. demo/include/float.h (+160, -0)
  54. demo/include/fma4intrin.h (+230, -0)
  55. demo/include/fmaintrin.h (+228, -0)
  56. demo/include/fxsrintrin.h (+105, -0)
  57. demo/include/gfniintrin.h (+202, -0)
  58. demo/include/htmintrin.h (+226, -0)
  59. demo/include/htmxlintrin.h (+359, -0)
  60. demo/include/ia32intrin.h (+73, -0)
  61. demo/include/immintrin.h (+374, -0)
  62. demo/include/intrin.h (+969, -0)
  63. demo/include/inttypes.h (+106, -0)
  64. demo/include/iso646.h (+43, -0)
  65. demo/include/limits.h (+118, -0)
  66. demo/include/lwpintrin.h (+150, -0)
  67. demo/include/lzcntintrin.h (+118, -0)
  68. demo/include/mm3dnow.h (+171, -0)
  69. demo/include/mm_malloc.h (+75, -0)
  70. demo/include/mmintrin.h (+1570, -0)
  71. demo/include/module.modulemap (+167, -0)
  72. demo/include/msa.h (+583, -0)
  73. demo/include/mwaitxintrin.h (+47, -0)
  74. demo/include/nmmintrin.h (+30, -0)
  75. demo/include/opencl-c.h (+16391, -0)
  76. demo/include/pkuintrin.h (+48, -0)
  77. demo/include/pmmintrin.h (+304, -0)
  78. demo/include/popcntintrin.h (+98, -0)
  79. demo/include/prfchwintrin.h (+71, -0)
  80. demo/include/rdseedintrin.h (+56, -0)
  81. demo/include/rtmintrin.h (+59, -0)
  82. demo/include/s390intrin.h (+39, -0)
  83. demo/include/sanitizer/allocator_interface.h (+90, -0)
  84. demo/include/sanitizer/asan_interface.h (+155, -0)
  85. demo/include/sanitizer/common_interface_defs.h (+198, -0)
  86. demo/include/sanitizer/coverage_interface.h (+36, -0)
  87. demo/include/sanitizer/dfsan_interface.h (+116, -0)
  88. demo/include/sanitizer/esan_interface.h (+50, -0)
  89. demo/include/sanitizer/hwasan_interface.h (+33, -0)
  90. demo/include/sanitizer/linux_syscall_hooks.h (+3083, -0)
  91. demo/include/sanitizer/lsan_interface.h (+90, -0)
  92. demo/include/sanitizer/msan_interface.h (+111, -0)
  93. demo/include/sanitizer/scudo_interface.h (+34, -0)
  94. demo/include/sanitizer/tsan_interface.h (+144, -0)
  95. demo/include/sanitizer/tsan_interface_atomic.h (+222, -0)
  96. demo/include/shaintrin.h (+75, -0)
  97. demo/include/smmintrin.h (+2465, -0)
  98. demo/include/stdalign.h (+35, -0)
  99. demo/include/stdarg.h (+51, -0)
  100. demo/include/stdatomic.h (+0, -0)

demo/include/__clang_cuda_builtin_vars.h (+126, -0)

@@ -0,0 +1,126 @@
+/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CUDA_BUILTIN_VARS_H
+#define __CUDA_BUILTIN_VARS_H
+
+// Forward declares from vector_types.h.
+struct uint3;
+struct dim3;
+
+// The file implements built-in CUDA variables using __declspec(property).
+// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
+// All read accesses of built-in variable fields get converted into calls to a
+// getter function which in turn calls the appropriate builtin to fetch the
+// value.
+//
+// Example:
+//    int x = threadIdx.x;
+// IR output:
+//  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
+// PTX output:
+//  mov.u32     %r2, %tid.x;
+
+#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC)                                \
+  __declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD;      \
+  static inline __attribute__((always_inline))                                 \
+      __attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) {     \
+    return INTRINSIC;                                                          \
+  }
+
+#if __cplusplus >= 201103L
+#define __DELETE =delete
+#else
+#define __DELETE
+#endif
+
+// Make sure nobody can create instances of the special variable types.  nvcc
+// also disallows taking address of special variables, so we disable address-of
+// operator as well.
+#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
+  __attribute__((device)) TypeName() __DELETE;                                 \
+  __attribute__((device)) TypeName(const TypeName &) __DELETE;                 \
+  __attribute__((device)) void operator=(const TypeName &) const __DELETE;     \
+  __attribute__((device)) TypeName *operator&() const __DELETE
+
+struct __cuda_builtin_threadIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
+  // threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
+};
+
+struct __cuda_builtin_blockIdx_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
+  // blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
+  // uint3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator uint3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
+};
+
+struct __cuda_builtin_blockDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
+  // blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
+};
+
+struct __cuda_builtin_gridDim_t {
+  __CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
+  __CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
+  __CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
+  // gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
+  // dim3).  This function is defined after we pull in vector_types.h.
+  __attribute__((device)) operator dim3() const;
+private:
+  __CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
+};
+
+#define __CUDA_BUILTIN_VAR                                                     \
+  extern const __attribute__((device)) __attribute__((weak))
+__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
+__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
+__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
+
+// warpSize should translate to read of %WARP_SZ but there's currently no
+// builtin to do so. According to PTX v4.2 docs 'to date, all target
+// architectures have a WARP_SZ value of 32'.
+__attribute__((device)) const int warpSize = 32;
+
+#undef __CUDA_DEVICE_BUILTIN
+#undef __CUDA_BUILTIN_VAR
+#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
+
+#endif /* __CUDA_BUILTIN_VARS_H */
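
For orientation, a minimal usage sketch (not part of this commit; kernel and parameter names are hypothetical): a kernel that derives its global index from the built-in variables declared above. Each read goes through the __declspec(property) getter and lowers to the corresponding __nvvm_read_ptx_sreg_* intrinsic.

// Hypothetical kernel, assuming the usual CUDA toolchain setup.
__global__ void scale(float *out, const float *in, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x; // getters -> %ctaid.x, %ntid.x, %tid.x
  if (i < n)
    out[i] = in[i] * factor;
}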

demo/include/__clang_cuda_cmath.h (+472, -0)

@@ -0,0 +1,472 @@
+/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_CMATH_H__
+#define __CLANG_CUDA_CMATH_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+#include <limits>
+
+// CUDA lets us use various std math functions on the device side.  This file
+// works in concert with __clang_cuda_math_forward_declares.h to make this work.
+//
+// Specifically, the forward-declares header declares __device__ overloads for
+// these functions in the global namespace, then pulls them into namespace std
+// with 'using' statements.  Then this file implements those functions, after
+// their implementations have been pulled in.
+//
+// It's important that we declare the functions in the global namespace and pull
+// them into namespace std with using statements, as opposed to simply declaring
+// these functions in namespace std, because our device functions need to
+// overload the standard library functions, which may be declared in the global
+// namespace or in std, depending on the degree of conformance of the stdlib
+// implementation.  Declaring in the global namespace and pulling into namespace
+// std covers all of the known knowns.
+
+#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
+
+__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
+__DEVICE__ float cos(float __x) { return ::cosf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float exp(float __x) { return ::expf(__x); }
+__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ float floor(float __x) { return ::floorf(__x); }
+__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
+__DEVICE__ int fpclassify(float __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ int fpclassify(double __x) {
+  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
+                              FP_ZERO, __x);
+}
+__DEVICE__ float frexp(float __arg, int *__exp) {
+  return ::frexpf(__arg, __exp);
+}
+
+// For inscrutable reasons, the CUDA headers define these functions for us on
+// Windows.
+#ifndef _MSC_VER
+__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
+// For inscrutable reasons, __finite(), the double-precision version of
+// __finitef, does not exist when compiling for MacOS.  __isfinited is available
+// everywhere and is just as good.
+__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
+__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
+#endif
+
+__DEVICE__ bool isgreater(float __x, float __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreater(double __x, double __y) {
+  return __builtin_isgreater(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(float __x, float __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isgreaterequal(double __x, double __y) {
+  return __builtin_isgreaterequal(__x, __y);
+}
+__DEVICE__ bool isless(float __x, float __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool isless(double __x, double __y) {
+  return __builtin_isless(__x, __y);
+}
+__DEVICE__ bool islessequal(float __x, float __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessequal(double __x, double __y) {
+  return __builtin_islessequal(__x, __y);
+}
+__DEVICE__ bool islessgreater(float __x, float __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool islessgreater(double __x, double __y) {
+  return __builtin_islessgreater(__x, __y);
+}
+__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
+__DEVICE__ bool isunordered(float __x, float __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ bool isunordered(double __x, double __y) {
+  return __builtin_isunordered(__x, __y);
+}
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float log(float __x) { return ::logf(__x); }
+__DEVICE__ float log10(float __x) { return ::log10f(__x); }
+__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
+__DEVICE__ float pow(float __base, float __exp) {
+  return ::powf(__base, __exp);
+}
+__DEVICE__ float pow(float __base, int __iexp) {
+  return ::powif(__base, __iexp);
+}
+__DEVICE__ double pow(double __base, int __iexp) {
+  return ::powi(__base, __iexp);
+}
+__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
+__DEVICE__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+
+// Notably missing above is nexttoward.  We omit it because
+// libdevice doesn't provide an implementation, and we don't want to be in the
+// business of implementing tricky libm functions in this header.
+
+// Now we've defined everything we promised we'd define in
+// __clang_cuda_math_forward_declares.h.  We need to do two additional things to
+// fix up our math functions.
+//
+// 1) Define __device__ overloads for e.g. sin(int).  The CUDA headers define
+//    only sin(float) and sin(double), which means that e.g. sin(0) is
+//    ambiguous.
+//
+// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
+//    std.  These are defined in the CUDA headers in the global namespace,
+//    independent of everything else we've done here.
+
+// We can't use std::enable_if, because we want to be pre-C++11 compatible.  But
+// we go ahead and unconditionally define functions that are only available when
+// compiling for C++11 to match the behavior of the CUDA headers.
+template<bool __B, class __T = void>
+struct __clang_cuda_enable_if {};
+
+template <class __T> struct __clang_cuda_enable_if<true, __T> {
+  typedef __T type;
+};
+
+// Defines an overload of __fn that accepts one integral argument, calls
+// __fn((double)x), and returns __retty.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn)                      \
+  template <typename __T>                                                      \
+  __DEVICE__                                                                   \
+      typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,    \
+                                      __retty>::type                           \
+      __fn(__T __x) {                                                          \
+    return ::__fn((double)__x);                                                \
+  }
+
+// Defines an overload of __fn that accepts two arithmetic arguments, calls
+// __fn((double)x, (double)y), and returns a double.
+//
+// Note this is different from OVERLOAD_1, which generates an overload that
+// accepts only *integral* arguments.
+#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn)                      \
+  template <typename __T1, typename __T2>                                      \
+  __DEVICE__ typename __clang_cuda_enable_if<                                  \
+      std::numeric_limits<__T1>::is_specialized &&                             \
+          std::numeric_limits<__T2>::is_specialized,                           \
+      __retty>::type                                                           \
+  __fn(__T1 __x, __T2 __y) {                                                   \
+    return __fn((double)__x, (double)__y);                                     \
+  }
+
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
+__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
+
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
+#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
+
+// Overloads for functions that don't match the patterns expected by
+// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
+template <typename __T1, typename __T2, typename __T3>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized &&
+        std::numeric_limits<__T3>::is_specialized,
+    double>::type
+fma(__T1 __x, __T2 __y, __T3 __z) {
+  return std::fma((double)__x, (double)__y, (double)__z);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+frexp(__T __x, int *__exp) {
+  return std::frexp((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+ldexp(__T __x, int __exp) {
+  return std::ldexp((double)__x, __exp);
+}
+
+template <typename __T1, typename __T2>
+__DEVICE__ typename __clang_cuda_enable_if<
+    std::numeric_limits<__T1>::is_specialized &&
+        std::numeric_limits<__T2>::is_specialized,
+    double>::type
+remquo(__T1 __x, __T2 __y, int *__quo) {
+  return std::remquo((double)__x, (double)__y, __quo);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbln(__T __x, long __exp) {
+  return std::scalbln((double)__x, __exp);
+}
+
+template <typename __T>
+__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
+                                           double>::type
+scalbn(__T __x, int __exp) {
+  return std::scalbn((double)__x, __exp);
+}
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+// Pull the new overloads we defined above into namespace std.
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnormal;
+using ::isunordered;
+using ::ldexp;
+using ::lgamma;
+using ::llrint;
+using ::llround;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+// Well this is fun: We need to pull these symbols in for libc++, but we can't
+// pull them in with libstdc++, because its ::isinf and ::isnan are different
+// than its std::isinf and std::isnan.
+#ifndef __GLIBCXX__
+using ::isinf;
+using ::isnan;
+#endif
+
+// Finally, pull the "foobarf" functions that CUDA defines in its headers into
+// namespace std.
+using ::acosf;
+using ::acoshf;
+using ::asinf;
+using ::asinhf;
+using ::atan2f;
+using ::atanf;
+using ::atanhf;
+using ::cbrtf;
+using ::ceilf;
+using ::copysignf;
+using ::cosf;
+using ::coshf;
+using ::erfcf;
+using ::erff;
+using ::exp2f;
+using ::expf;
+using ::expm1f;
+using ::fabsf;
+using ::fdimf;
+using ::floorf;
+using ::fmaf;
+using ::fmaxf;
+using ::fminf;
+using ::fmodf;
+using ::frexpf;
+using ::hypotf;
+using ::ilogbf;
+using ::ldexpf;
+using ::lgammaf;
+using ::llrintf;
+using ::llroundf;
+using ::log10f;
+using ::log1pf;
+using ::log2f;
+using ::logbf;
+using ::logf;
+using ::lrintf;
+using ::lroundf;
+using ::modff;
+using ::nearbyintf;
+using ::nextafterf;
+using ::powf;
+using ::remainderf;
+using ::remquof;
+using ::rintf;
+using ::roundf;
+using ::scalblnf;
+using ::scalbnf;
+using ::sinf;
+using ::sinhf;
+using ::sqrtf;
+using ::tanf;
+using ::tanhf;
+using ::tgammaf;
+using ::truncf;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#undef __DEVICE__
+
+#endif
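
As a rough illustration (not part of this commit; function and variable names are hypothetical), device code can now mix integral and floating arguments with the std math names:

// Hypothetical device function: sin(int) resolves via the integer overloads
// generated above (argument promoted to double), and sqrtf is reachable as
// std::sqrtf because of the using-declarations that pull it into std.
__device__ float blend(int k, float x) {
  double s = std::sin(k);  // without the overloads above, sin(k) would be ambiguous
  return std::sqrtf(x) + static_cast<float>(s);
}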

demo/include/__clang_cuda_complex_builtins.h (+203, -0)

@@ -0,0 +1,203 @@
+/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
+#define __CLANG_CUDA_COMPLEX_BUILTINS
+
+// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3.  These are
+// libgcc functions that clang assumes are available when compiling c99 complex
+// operations.  (These implementations come from libc++, and have been modified
+// to work with CUDA.)
+
+extern "C" inline __device__ double _Complex __muldc3(double __a, double __b,
+                                                      double __c, double __d) {
+  double __ac = __a * __c;
+  double __bd = __b * __d;
+  double __ad = __a * __d;
+  double __bc = __b * __c;
+  double _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      // Can't use std::numeric_limits<double>::infinity() -- that doesn't have
+      // a device overload (and isn't constexpr before C++11, naturally).
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __mulsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  float __ac = __a * __c;
+  float __bd = __b * __d;
+  float __ad = __a * __d;
+  float __bc = __b * __c;
+  float _Complex z;
+  __real__(z) = __ac - __bd;
+  __imag__(z) = __ad + __bc;
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    int __recalc = 0;
+    if (std::isinf(__a) || std::isinf(__b)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (std::isinf(__c) || std::isinf(__d)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      __recalc = 1;
+    }
+    if (!__recalc && (std::isinf(__ac) || std::isinf(__bd) ||
+                      std::isinf(__ad) || std::isinf(__bc))) {
+      if (std::isnan(__a))
+        __a = std::copysign(0, __a);
+      if (std::isnan(__b))
+        __b = std::copysign(0, __b);
+      if (std::isnan(__c))
+        __c = std::copysign(0, __c);
+      if (std::isnan(__d))
+        __d = std::copysign(0, __d);
+      __recalc = 1;
+    }
+    if (__recalc) {
+      __real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ double _Complex __divdc3(double __a, double __b,
+                                                      double __c, double __d) {
+  int __ilogbw = 0;
+  // Can't use std::max, because that's defined in <algorithm>, and we don't
+  // want to pull that in for every compile.  The CUDA headers define
+  // ::max(float, float) and ::max(double, double), which is sufficient for us.
+  double __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  double __denom = __c * __c + __d * __d;
+  double _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0.0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1.0 : 0.0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1.0 : 0.0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0.0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1.0 : 0.0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1.0 : 0.0, __d);
+      __real__(z) = 0.0 * (__a * __c + __b * __d);
+      __imag__(z) = 0.0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+extern "C" inline __device__ float _Complex __divsc3(float __a, float __b,
+                                                     float __c, float __d) {
+  int __ilogbw = 0;
+  float __logbw = std::logb(max(std::abs(__c), std::abs(__d)));
+  if (std::isfinite(__logbw)) {
+    __ilogbw = (int)__logbw;
+    __c = std::scalbn(__c, -__ilogbw);
+    __d = std::scalbn(__d, -__ilogbw);
+  }
+  float __denom = __c * __c + __d * __d;
+  float _Complex z;
+  __real__(z) = std::scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
+  __imag__(z) = std::scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
+  if (std::isnan(__real__(z)) && std::isnan(__imag__(z))) {
+    if ((__denom == 0) && (!std::isnan(__a) || !std::isnan(__b))) {
+      __real__(z) = std::copysign(__builtin_huge_valf(), __c) * __a;
+      __imag__(z) = std::copysign(__builtin_huge_valf(), __c) * __b;
+    } else if ((std::isinf(__a) || std::isinf(__b)) && std::isfinite(__c) &&
+               std::isfinite(__d)) {
+      __a = std::copysign(std::isinf(__a) ? 1 : 0, __a);
+      __b = std::copysign(std::isinf(__b) ? 1 : 0, __b);
+      __real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
+      __imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
+    } else if (std::isinf(__logbw) && __logbw > 0 && std::isfinite(__a) &&
+               std::isfinite(__b)) {
+      __c = std::copysign(std::isinf(__c) ? 1 : 0, __c);
+      __d = std::copysign(std::isinf(__d) ? 1 : 0, __d);
+      __real__(z) = 0 * (__a * __c + __b * __d);
+      __imag__(z) = 0 * (__b * __c - __a * __d);
+    }
+  }
+  return z;
+}
+
+#endif // __CLANG_CUDA_COMPLEX_BUILTINS
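
For context, a small sketch (not part of this commit; names are hypothetical) of the device code these helpers exist to support; whether clang inlines the arithmetic or emits calls to the helpers depends on how the complex operation is lowered:

// Hypothetical device function: _Complex multiplication may fall back to
// __muldc3 for the NaN/infinity fix-up path, and division calls __divdc3.
__device__ double _Complex rotate_and_divide(double _Complex a,
                                             double _Complex b) {
  double _Complex p = a * b;
  return p / b;
}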

demo/include/__clang_cuda_intrinsics.h (+489, -0)

@@ -0,0 +1,489 @@
+/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_CUDA_INTRINSICS_H__
+#define __CLANG_CUDA_INTRINSICS_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// sm_30 intrinsics: __shfl_{up,down,xor}.
+
+#define __SM_30_INTRINSICS_H__
+#define __SM_30_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#pragma push_macro("__MAKE_SHUFFLES")
+#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
+                        __Type)                                                \
+  inline __device__ int __FnName(int __val, __Type __offset,                   \
+                                 int __width = warpSize) {                     \
+    return __IntIntrinsic(__val, __offset,                                     \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(float __val, __Type __offset,               \
+                                   int __width = warpSize) {                   \
+    return __FloatIntrinsic(__val, __offset,                                   \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(static_cast<int>(__val), __offset, __width));               \
+  }                                                                            \
+  inline __device__ long long __FnName(long long __val, __Type __offset,       \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
+    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ long __FnName(long __val, __Type __offset,                 \
+                                  int __width = warpSize) {                    \
+    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
+                   sizeof(long) == sizeof(int));                               \
+    if (sizeof(long) == sizeof(long long)) {                                   \
+      return static_cast<long>(                                                \
+          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
+    } else if (sizeof(long) == sizeof(int)) {                                  \
+      return static_cast<long>(                                                \
+          ::__FnName(static_cast<int>(__val), __offset, __width));             \
+    }                                                                          \
+  }                                                                            \
+  inline __device__ unsigned long __FnName(                                    \
+      unsigned long __val, __Type __offset, int __width = warpSize) {          \
+    return static_cast<unsigned long>(                                         \
+        ::__FnName(static_cast<long>(__val), __offset, __width));              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        static_cast<unsigned long long>(__val), __offset, __width));           \
+  }                                                                            \
+  inline __device__ double __FnName(double __val, __Type __offset,             \
+                                    int __width = warpSize) {                  \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__tmp, __offset, __width);                              \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+
+__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
+                unsigned int);
+__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
+                int);
+#pragma pop_macro("__MAKE_SHUFFLES")
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync_* variants available in CUDA-9
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
+                             __Mask, __Type)                                   \
+  inline __device__ int __FnName(unsigned int __mask, int __val,               \
+                                 __Type __offset, int __width = warpSize) {    \
+    return __IntIntrinsic(__mask, __val, __offset,                             \
+                          ((warpSize - __width) << 8) | (__Mask));             \
+  }                                                                            \
+  inline __device__ float __FnName(unsigned int __mask, float __val,           \
+                                   __Type __offset, int __width = warpSize) {  \
+    return __FloatIntrinsic(__mask, __val, __offset,                           \
+                            ((warpSize - __width) << 8) | (__Mask));           \
+  }                                                                            \
+  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
+                                          unsigned int __val, __Type __offset, \
+                                          int __width = warpSize) {            \
+    return static_cast<unsigned int>(                                          \
+        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
+  }                                                                            \
+  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
+                                       __Type __offset,                        \
+                                       int __width = warpSize) {               \
+    struct __Bits {                                                            \
+      int __a, __b;                                                            \
+    };                                                                         \
+    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
+    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
+    __Bits __tmp;                                                              \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
+    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
+    long long __ret;                                                           \
+    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
+    return __ret;                                                              \
+  }                                                                            \
+  inline __device__ unsigned long long __FnName(                               \
+      unsigned int __mask, unsigned long long __val, __Type __offset,          \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long long>(::__FnName(                         \
+        __mask, static_cast<unsigned long long>(__val), __offset, __width));   \
+  }                                                                            \
+  inline __device__ long __FnName(unsigned int __mask, long __val,             \
+                                  __Type __offset, int __width = warpSize) {   \
+    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
+                   sizeof(long) == sizeof(int));                               \
+    if (sizeof(long) == sizeof(long long)) {                                   \
+      return static_cast<long>(::__FnName(                                     \
+          __mask, static_cast<long long>(__val), __offset, __width));          \
+    } else if (sizeof(long) == sizeof(int)) {                                  \
+      return static_cast<long>(                                                \
+          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
+    }                                                                          \
+  }                                                                            \
+  inline __device__ unsigned long __FnName(                                    \
+      unsigned int __mask, unsigned long __val, __Type __offset,               \
+      int __width = warpSize) {                                                \
+    return static_cast<unsigned long>(                                         \
+        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
+  }                                                                            \
+  inline __device__ double __FnName(unsigned int __mask, double __val,         \
+                                    __Type __offset, int __width = warpSize) { \
+    long long __tmp;                                                           \
+    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
+    memcpy(&__tmp, &__val, sizeof(__val));                                     \
+    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
+    double __ret;                                                              \
+    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
+    return __ret;                                                              \
+  }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+  return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+  __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+                                            unsigned int count) {
+  __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
+  return __nvvm_fns(mask, base, offset);
+}
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// Define __match* builtins CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+inline __device__ unsigned int __match32_any_sync(unsigned int mask,
+                                                  unsigned int value) {
+  return __nvvm_match_any_sync_i32(mask, value);
+}
+
+inline __device__ unsigned long long
+__match64_any_sync(unsigned int mask, unsigned long long value) {
+  return __nvvm_match_any_sync_i64(mask, value);
+}
+
+inline __device__ unsigned int
+__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
+  return __nvvm_match_all_sync_i32p(mask, value, pred);
+}
+
+inline __device__ unsigned long long
+__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+  return __nvvm_match_all_sync_i64p(mask, value, pred);
+}
+#include "crt/sm_70_rt.hpp"
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#endif // CUDA_VERSION >= 9000
+
+// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
+
+// Prevent the vanilla sm_32 intrinsics header from being included.
+#define __SM_32_INTRINSICS_H__
+#define __SM_32_INTRINSICS_HPP__
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
+inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
+inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
+inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
+inline __device__ long long __ldg(const long long *ptr) {
+  return __nvvm_ldg_ll(ptr);
+}
+inline __device__ unsigned char __ldg(const unsigned char *ptr) {
+  return __nvvm_ldg_uc(ptr);
+}
+inline __device__ unsigned short __ldg(const unsigned short *ptr) {
+  return __nvvm_ldg_us(ptr);
+}
+inline __device__ unsigned int __ldg(const unsigned int *ptr) {
+  return __nvvm_ldg_ui(ptr);
+}
+inline __device__ unsigned long __ldg(const unsigned long *ptr) {
+  return __nvvm_ldg_ul(ptr);
+}
+inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
+  return __nvvm_ldg_ull(ptr);
+}
+inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
+inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
+
+inline __device__ char2 __ldg(const char2 *ptr) {
+  typedef char c2 __attribute__((ext_vector_type(2)));
+  // We can assume that ptr is aligned at least to char2's alignment, but the
+  // load will assume that ptr is aligned to c2's alignment.  This is only
+  // safe if alignof(c2) <= alignof(char2).
+  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
+  char2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ char4 __ldg(const char4 *ptr) {
+  typedef char c4 __attribute__((ext_vector_type(4)));
+  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
+  char4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ short2 __ldg(const short2 *ptr) {
+  typedef short s2 __attribute__((ext_vector_type(2)));
+  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
+  short2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ short4 __ldg(const short4 *ptr) {
+  typedef short s4 __attribute__((ext_vector_type(4)));
+  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
+  short4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ int2 __ldg(const int2 *ptr) {
+  typedef int i2 __attribute__((ext_vector_type(2)));
+  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
+  int2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ int4 __ldg(const int4 *ptr) {
+  typedef int i4 __attribute__((ext_vector_type(4)));
+  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
+  int4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ longlong2 __ldg(const longlong2 *ptr) {
+  typedef long long ll2 __attribute__((ext_vector_type(2)));
+  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
+  longlong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ uchar2 __ldg(const uchar2 *ptr) {
+  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
+  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
+  uchar2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uchar4 __ldg(const uchar4 *ptr) {
+  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
+  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
+  uchar4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ushort2 __ldg(const ushort2 *ptr) {
+  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
+  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
+  ushort2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ ushort4 __ldg(const ushort4 *ptr) {
+  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
+  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
+  ushort4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ uint2 __ldg(const uint2 *ptr) {
+  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
+  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
+  uint2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ uint4 __ldg(const uint4 *ptr) {
+  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
+  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
+  uint4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
+  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
+  ulonglong2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+
+inline __device__ float2 __ldg(const float2 *ptr) {
+  typedef float f2 __attribute__((ext_vector_type(2)));
+  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
+  float2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
+inline __device__ float4 __ldg(const float4 *ptr) {
+  typedef float f4 __attribute__((ext_vector_type(4)));
+  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
+  float4 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  ret.z = rv[2];
+  ret.w = rv[3];
+  return ret;
+}
+inline __device__ double2 __ldg(const double2 *ptr) {
+  typedef double d2 __attribute__((ext_vector_type(2)));
+  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
+  double2 ret;
+  ret.x = rv[0];
+  ret.y = rv[1];
+  return ret;
+}
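+
+// Illustrative usage sketch (an addition for this demo, not part of the
+// upstream header): __ldg loads through the read-only data cache, so it is
+// only safe for memory that no thread writes during the kernel's lifetime.
+// A typical pattern (hypothetical kernel) looks like:
+//
+//   __global__ void scale(const float *__restrict__ in, float *out,
+//                         float s, int n) {
+//     int i = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (i < n)
+//       out[i] = __ldg(&in[i]) * s;
+//   }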
+
+// TODO: Implement these as intrinsics, so the backend can work its magic on
+// these.  Alternatively, we could implement these as plain C and try to get
+// llvm to recognize the relevant patterns.
+inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
+                                           unsigned shiftWidth) {
+  unsigned result;
+  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
+      : "=r"(result)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return result;
+}
+inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
+                                            unsigned shiftWidth) {
+  unsigned ret;
+  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
+      : "=r"(ret)
+      : "r"(low32), "r"(high32), "r"(shiftWidth));
+  return ret;
+}
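+
+// Usage sketch (illustrative, not part of the upstream header): each funnel
+// shift operates on the 64-bit value high32:low32.  __funnelshift_l returns
+// the upper 32 bits of (high32:low32) << (shiftWidth & 31); the *_lc/*_rc
+// variants clamp the shift amount to 32 instead of wrapping it.  For example:
+//
+//   unsigned hi = 0x12345678u, lo = 0x9abcdef0u;
+//   unsigned r  = __funnelshift_l(lo, hi, 8);  // r == 0x3456789au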
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+#endif // defined(__CLANG_CUDA_INTRINSICS_H__)

+ 286 - 0
demo/include/__clang_cuda_math_forward_declares.h

@@ -0,0 +1,286 @@
+/*===- __clang_cuda_math_forward_declares.h - Prototypes of __device__ math fns -===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
+#ifndef __CUDA__
+#error "This file is for CUDA compilation only."
+#endif
+
+// This file forward-declares some math functions we (or the CUDA headers)
+// will define later.  We need to do this, and do it before cmath is included,
+// because the standard library may have constexpr math functions.  In the
+// absence of a prior __device__ decl, those constexpr functions may become
+// implicitly host+device.  host+device functions can't be overloaded, so that
+// would preclude the use of our own __device__ overloads for these functions.
+
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__                                                             \
+  static __inline__ __attribute__((always_inline)) __attribute__((device))
+
+__DEVICE__ double abs(double);
+__DEVICE__ float abs(float);
+__DEVICE__ int abs(int);
+__DEVICE__ long abs(long);
+__DEVICE__ long long abs(long long);
+__DEVICE__ double acos(double);
+__DEVICE__ float acos(float);
+__DEVICE__ double acosh(double);
+__DEVICE__ float acosh(float);
+__DEVICE__ double asin(double);
+__DEVICE__ float asin(float);
+__DEVICE__ double asinh(double);
+__DEVICE__ float asinh(float);
+__DEVICE__ double atan2(double, double);
+__DEVICE__ float atan2(float, float);
+__DEVICE__ double atan(double);
+__DEVICE__ float atan(float);
+__DEVICE__ double atanh(double);
+__DEVICE__ float atanh(float);
+__DEVICE__ double cbrt(double);
+__DEVICE__ float cbrt(float);
+__DEVICE__ double ceil(double);
+__DEVICE__ float ceil(float);
+__DEVICE__ double copysign(double, double);
+__DEVICE__ float copysign(float, float);
+__DEVICE__ double cos(double);
+__DEVICE__ float cos(float);
+__DEVICE__ double cosh(double);
+__DEVICE__ float cosh(float);
+__DEVICE__ double erfc(double);
+__DEVICE__ float erfc(float);
+__DEVICE__ double erf(double);
+__DEVICE__ float erf(float);
+__DEVICE__ double exp2(double);
+__DEVICE__ float exp2(float);
+__DEVICE__ double exp(double);
+__DEVICE__ float exp(float);
+__DEVICE__ double expm1(double);
+__DEVICE__ float expm1(float);
+__DEVICE__ double fabs(double);
+__DEVICE__ float fabs(float);
+__DEVICE__ double fdim(double, double);
+__DEVICE__ float fdim(float, float);
+__DEVICE__ double floor(double);
+__DEVICE__ float floor(float);
+__DEVICE__ double fma(double, double, double);
+__DEVICE__ float fma(float, float, float);
+__DEVICE__ double fmax(double, double);
+__DEVICE__ float fmax(float, float);
+__DEVICE__ double fmin(double, double);
+__DEVICE__ float fmin(float, float);
+__DEVICE__ double fmod(double, double);
+__DEVICE__ float fmod(float, float);
+__DEVICE__ int fpclassify(double);
+__DEVICE__ int fpclassify(float);
+__DEVICE__ double frexp(double, int *);
+__DEVICE__ float frexp(float, int *);
+__DEVICE__ double hypot(double, double);
+__DEVICE__ float hypot(float, float);
+__DEVICE__ int ilogb(double);
+__DEVICE__ int ilogb(float);
+__DEVICE__ bool isfinite(double);
+__DEVICE__ bool isfinite(float);
+__DEVICE__ bool isgreater(double, double);
+__DEVICE__ bool isgreaterequal(double, double);
+__DEVICE__ bool isgreaterequal(float, float);
+__DEVICE__ bool isgreater(float, float);
+__DEVICE__ bool isinf(double);
+__DEVICE__ bool isinf(float);
+__DEVICE__ bool isless(double, double);
+__DEVICE__ bool islessequal(double, double);
+__DEVICE__ bool islessequal(float, float);
+__DEVICE__ bool isless(float, float);
+__DEVICE__ bool islessgreater(double, double);
+__DEVICE__ bool islessgreater(float, float);
+__DEVICE__ bool isnan(double);
+__DEVICE__ bool isnan(float);
+__DEVICE__ bool isnormal(double);
+__DEVICE__ bool isnormal(float);
+__DEVICE__ bool isunordered(double, double);
+__DEVICE__ bool isunordered(float, float);
+__DEVICE__ long labs(long);
+__DEVICE__ double ldexp(double, int);
+__DEVICE__ float ldexp(float, int);
+__DEVICE__ double lgamma(double);
+__DEVICE__ float lgamma(float);
+__DEVICE__ long long llabs(long long);
+__DEVICE__ long long llrint(double);
+__DEVICE__ long long llrint(float);
+__DEVICE__ double log10(double);
+__DEVICE__ float log10(float);
+__DEVICE__ double log1p(double);
+__DEVICE__ float log1p(float);
+__DEVICE__ double log2(double);
+__DEVICE__ float log2(float);
+__DEVICE__ double logb(double);
+__DEVICE__ float logb(float);
+__DEVICE__ double log(double);
+__DEVICE__ float log(float);
+__DEVICE__ long lrint(double);
+__DEVICE__ long lrint(float);
+__DEVICE__ long lround(double);
+__DEVICE__ long lround(float);
+__DEVICE__ long long llround(float); // No llround(double).
+__DEVICE__ double modf(double, double *);
+__DEVICE__ float modf(float, float *);
+__DEVICE__ double nan(const char *);
+__DEVICE__ float nanf(const char *);
+__DEVICE__ double nearbyint(double);
+__DEVICE__ float nearbyint(float);
+__DEVICE__ double nextafter(double, double);
+__DEVICE__ float nextafter(float, float);
+__DEVICE__ double pow(double, double);
+__DEVICE__ double pow(double, int);
+__DEVICE__ float pow(float, float);
+__DEVICE__ float pow(float, int);
+__DEVICE__ double remainder(double, double);
+__DEVICE__ float remainder(float, float);
+__DEVICE__ double remquo(double, double, int *);
+__DEVICE__ float remquo(float, float, int *);
+__DEVICE__ double rint(double);
+__DEVICE__ float rint(float);
+__DEVICE__ double round(double);
+__DEVICE__ float round(float);
+__DEVICE__ double scalbln(double, long);
+__DEVICE__ float scalbln(float, long);
+__DEVICE__ double scalbn(double, int);
+__DEVICE__ float scalbn(float, int);
+__DEVICE__ bool signbit(double);
+__DEVICE__ bool signbit(float);
+__DEVICE__ double sin(double);
+__DEVICE__ float sin(float);
+__DEVICE__ double sinh(double);
+__DEVICE__ float sinh(float);
+__DEVICE__ double sqrt(double);
+__DEVICE__ float sqrt(float);
+__DEVICE__ double tan(double);
+__DEVICE__ float tan(float);
+__DEVICE__ double tanh(double);
+__DEVICE__ float tanh(float);
+__DEVICE__ double tgamma(double);
+__DEVICE__ float tgamma(float);
+__DEVICE__ double trunc(double);
+__DEVICE__ float trunc(float);
+
+// Notably missing above is nexttoward, which we don't define on
+// the device side because libdevice doesn't give us an implementation, and we
+// don't want to be in the business of writing one ourselves.
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+using ::abs;
+using ::acos;
+using ::acosh;
+using ::asin;
+using ::asinh;
+using ::atan;
+using ::atan2;
+using ::atanh;
+using ::cbrt;
+using ::ceil;
+using ::copysign;
+using ::cos;
+using ::cosh;
+using ::erf;
+using ::erfc;
+using ::exp;
+using ::exp2;
+using ::expm1;
+using ::fabs;
+using ::fdim;
+using ::floor;
+using ::fma;
+using ::fmax;
+using ::fmin;
+using ::fmod;
+using ::fpclassify;
+using ::frexp;
+using ::hypot;
+using ::ilogb;
+using ::isfinite;
+using ::isgreater;
+using ::isgreaterequal;
+using ::isinf;
+using ::isless;
+using ::islessequal;
+using ::islessgreater;
+using ::isnan;
+using ::isnormal;
+using ::isunordered;
+using ::labs;
+using ::ldexp;
+using ::lgamma;
+using ::llabs;
+using ::llrint;
+using ::log;
+using ::log10;
+using ::log1p;
+using ::log2;
+using ::logb;
+using ::lrint;
+using ::lround;
+using ::llround;
+using ::modf;
+using ::nan;
+using ::nanf;
+using ::nearbyint;
+using ::nextafter;
+using ::pow;
+using ::remainder;
+using ::remquo;
+using ::rint;
+using ::round;
+using ::scalbln;
+using ::scalbn;
+using ::signbit;
+using ::sin;
+using ::sinh;
+using ::sqrt;
+using ::tan;
+using ::tanh;
+using ::tgamma;
+using ::trunc;
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#pragma pop_macro("__DEVICE__")
+
+#endif

+ 381 - 0
demo/include/__clang_cuda_runtime_wrapper.h

@@ -0,0 +1,381 @@
+/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * WARNING: This header is intended to be directly -include'd by
+ * the compiler and is not supposed to be included by users.
+ *
+ * CUDA headers are implemented in a way that currently makes it
+ * impossible for user code to #include them directly when compiling
+ * with Clang. They present a different view of CUDA-supplied functions
+ * depending on where in NVCC's compilation pipeline the headers are
+ * included. Neither of these modes provides function definitions with
+ * correct attributes, so we use the preprocessor to force the headers
+ * into a form that Clang can use.
+ *
+ * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
+ * this file during every CUDA compilation.
+ */
+
+#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
+#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
+
+#if defined(__CUDA__) && defined(__clang__)
+
+// Include some forward declares that must come before cmath.
+#include <__clang_cuda_math_forward_declares.h>
+
+// Include some standard headers to avoid CUDA headers including them
+// while some required macros (like __THROW) are in a weird state.
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+
+// Preserve common macros that will be changed below by us or by CUDA
+// headers.
+#pragma push_macro("__THROW")
+#pragma push_macro("__CUDA_ARCH__")
+
+// WARNING: Preprocessor hacks below are based on specific details of
+// CUDA-7.x headers and are not expected to work with any other
+// version of CUDA headers.
+#include "cuda.h"
+#if !defined(CUDA_VERSION)
+#error "cuda.h did not define CUDA_VERSION"
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
+#error "Unsupported CUDA version!"
+#endif
+
+// Make largest subset of device functions available during host
+// compilation -- SM_35 for the time being.
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 350
+#endif
+
+#include "__clang_cuda_builtin_vars.h"
+
+// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
+// has taken care of builtin variables declared in the file.
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+// {math,device}_functions.h only have declarations of the
+// functions. We don't need them as we're going to pull in their
+// definitions from .hpp files.
+#define __DEVICE_FUNCTIONS_H__
+#define __MATH_FUNCTIONS_H__
+#define __COMMON_FUNCTIONS_H__
+
+#undef __CUDACC__
+#if CUDA_VERSION < 9000
+#define __CUDABE__
+#else
+#define __CUDA_LIBDEVICE__
+#endif
+// Disables definitions of device-side runtime support stubs in
+// cuda_device_runtime_api.h
+#include "driver_types.h"
+#include "host_config.h"
+#include "host_defines.h"
+
+#undef __CUDABE__
+#undef __CUDA_LIBDEVICE__
+#define __CUDACC__
+#include "cuda_runtime.h"
+
+#undef __CUDACC__
+#define __CUDABE__
+
+// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
+// not have at the moment. Emulate them with a builtin memcpy/memset.
+#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
+#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
+
+#if CUDA_VERSION < 9000
+#include "crt/device_runtime.h"
+#endif
+#include "crt/host_runtime.h"
+// device_runtime.h defines __cxa_* macros that will conflict with
+// cxxabi.h.
+// FIXME: redefine these as __device__ functions.
+#undef __cxa_vec_ctor
+#undef __cxa_vec_cctor
+#undef __cxa_vec_dtor
+#undef __cxa_vec_new
+#undef __cxa_vec_new2
+#undef __cxa_vec_new3
+#undef __cxa_vec_delete2
+#undef __cxa_vec_delete
+#undef __cxa_vec_delete3
+#undef __cxa_pure_virtual
+
+// math_functions.hpp expects this host function to be defined on MacOS, but it
+// ends up not being there because of the games we play here.  Just define it
+// ourselves; it's simple enough.
+#ifdef __APPLE__
+inline __host__ double __signbitd(double x) {
+  return std::signbit(x);
+}
+#endif
+
+// We need decls for functions in CUDA's libdevice with the __device__
+// attribute only. Alas, they come either as __host__ __device__ or
+// with no attributes at all. To work around that, define __CUDACC_RTC__,
+// which produces the HD variant, and undef __host__, which gives us the
+// desired decls with the __device__ attribute.
+#pragma push_macro("__host__")
+#define __host__
+#define __CUDACC_RTC__
+#include "device_functions_decls.h"
+#undef __CUDACC_RTC__
+
+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
+// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
+// Previous versions used to check whether they are defined or not.
+// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
+// here to detect the switch.
+
+#if defined(CU_DEVICE_INVALID)
+#if !defined(__USE_FAST_MATH__)
+#define __USE_FAST_MATH__ 0
+#endif
+
+#if !defined(__CUDA_PREC_DIV)
+#define __CUDA_PREC_DIV 0
+#endif
+#endif
+
+// device_functions.hpp and math_functions*.hpp use 'static
+// __forceinline__' (with no __device__) for definitions of device
+// functions. Temporarily redefine __forceinline__ to include
+// __device__.
+#pragma push_macro("__forceinline__")
+#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+
+#pragma push_macro("__float2half_rn")
+#if CUDA_VERSION >= 9000
+// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
+// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
+// device_functions.hpp out of the way.
+#define __float2half_rn  __float2half_rn_disabled
+#endif
+
+#include "device_functions.hpp"
+#pragma pop_macro("__float2half_rn")
+
+
+// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
+// get the slow-but-accurate or fast-but-inaccurate versions of functions like
+// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+//
+// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
+// slow divides), so we need to scope our define carefully here.
+#pragma push_macro("__USE_FAST_MATH__")
+#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#define __USE_FAST_MATH__ 1
+#endif
+#include "math_functions.hpp"
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#include "math_functions_dbl_ptx3.hpp"
+#pragma pop_macro("__forceinline__")
+
+// Pull in host-only functions that are only available when neither
+// __CUDACC__ nor __CUDABE__ are defined.
+#undef __MATH_FUNCTIONS_HPP__
+#undef __CUDABE__
+#include "math_functions.hpp"
+// Alas, additional overloads for these functions are hard to get to.
+// Considering that we only need these overloads for a few functions,
+// we can provide them here.
+static inline float rsqrt(float __a) { return rsqrtf(__a); }
+static inline float rcbrt(float __a) { return rcbrtf(__a); }
+static inline float sinpi(float __a) { return sinpif(__a); }
+static inline float cospi(float __a) { return cospif(__a); }
+static inline void sincospi(float __a, float *__b, float *__c) {
+  return sincospif(__a, __b, __c);
+}
+static inline float erfcinv(float __a) { return erfcinvf(__a); }
+static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
+static inline float normcdf(float __a) { return normcdff(__a); }
+static inline float erfcx(float __a) { return erfcxf(__a); }
+
+// For some reason the single-argument variant is not always declared by
+// the CUDA headers. Alas, device_functions.hpp, included below, needs it.
+static inline __device__ void __brkpt(int __c) { __brkpt(); }
+
+// Now include *.hpp with definitions of various GPU functions.  Alas,
+// a lot of things get declared/defined with the __host__ attribute, which
+// we don't want, so we have to define it out. We also have to include
+// {device,math}_functions.hpp again in order to extract the other
+// branch of #if/else inside.
+
+#define __host__
+#undef __CUDABE__
+#define __CUDACC__
+#undef __DEVICE_FUNCTIONS_HPP__
+#include "device_atomic_functions.hpp"
+#include "device_functions.hpp"
+#include "sm_20_atomic_functions.hpp"
+#include "sm_20_intrinsics.hpp"
+#include "sm_32_atomic_functions.hpp"
+
+// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
+// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
+// define them using builtins so that the optimizer can reason about and across
+// these instructions.  In particular, using intrinsics for ldg gets us the
+// [addr+imm] addressing mode, which, although it doesn't actually exist in the
+// hardware, seems to generate faster machine code because ptxas can more easily
+// reason about our code.
+
+#if CUDA_VERSION >= 8000
+#include "sm_60_atomic_functions.hpp"
+#include "sm_61_intrinsics.hpp"
+#endif
+
+#undef __MATH_FUNCTIONS_HPP__
+
+// math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
+// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
+// math_functions.hpp's ::signbit.  It's guarded by #undef signbit, but that's
+// conditional on __GNUC__.  :)
+#pragma push_macro("signbit")
+#pragma push_macro("__GNUC__")
+#undef __GNUC__
+#define signbit __ignored_cuda_signbit
+
+// CUDA-9 omits device-side definitions of some math functions if it sees
+// the include guard from libstdc++'s math.h wrapper. We have to undo the header
+// guard temporarily to get the definitions we need.
+#pragma push_macro("_GLIBCXX_MATH_H")
+#pragma push_macro("_LIBCPP_VERSION")
+#if CUDA_VERSION >= 9000
+#undef _GLIBCXX_MATH_H
+// We also need to undo another guard that checks for libc++ 3.8+
+#ifdef _LIBCPP_VERSION
+#define _LIBCPP_VERSION 3700
+#endif
+#endif
+
+#include "math_functions.hpp"
+#pragma pop_macro("_GLIBCXX_MATH_H")
+#pragma pop_macro("_LIBCPP_VERSION")
+#pragma pop_macro("__GNUC__")
+#pragma pop_macro("signbit")
+
+#pragma pop_macro("__host__")
+
+#include "texture_indirect_functions.h"
+
+// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
+#pragma pop_macro("__CUDA_ARCH__")
+#pragma pop_macro("__THROW")
+
+// Set up compiler macros expected to be seen during compilation.
+#undef __CUDABE__
+#define __CUDACC__
+
+extern "C" {
+// Device-side CUDA system calls.
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
+// We need these declarations and wrappers for device-side
+// malloc/free/printf calls to work without relying on
+// -fcuda-disable-target-call-checks option.
+__device__ int vprintf(const char *, const char *);
+__device__ void free(void *) __attribute((nothrow));
+__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
+__device__ void __assertfail(const char *__message, const char *__file,
+                             unsigned __line, const char *__function,
+                             size_t __charSize) __attribute__((noreturn));
+
+// In order for the standard assert() macro on Linux to work, we need to
+// provide a device-side __assert_fail().
+__device__ static inline void __assert_fail(const char *__message,
+                                            const char *__file, unsigned __line,
+                                            const char *__function) {
+  __assertfail(__message, __file, __line, __function, sizeof(char));
+}
+
+// Clang will convert printf into vprintf, but we still need
+// device-side declaration for it.
+__device__ int printf(const char *, ...);
+} // extern "C"
+
+// We also need device-side std::malloc and std::free.
+namespace std {
+__device__ static inline void free(void *__ptr) { ::free(__ptr); }
+__device__ static inline void *malloc(size_t __size) {
+  return ::malloc(__size);
+}
+} // namespace std
+
+// Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
+// come after we've pulled in the definition of uint3 and dim3.
+
+__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
+  uint3 ret;
+  ret.x = x;
+  ret.y = y;
+  ret.z = z;
+  return ret;
+}
+
+__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
+  return dim3(x, y, z);
+}
+
+#include <__clang_cuda_cmath.h>
+#include <__clang_cuda_intrinsics.h>
+#include <__clang_cuda_complex_builtins.h>
+
+// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
+// mode, giving them their "proper" types of dim3 and uint3.  This is
+// incompatible with the types we give in __clang_cuda_builtin_vars.h.  As a
+// hack, force-include the header (nvcc doesn't include it by default) but
+// redefine dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are
+// only used here for the redeclarations of blockDim and threadIdx.)
+#pragma push_macro("dim3")
+#pragma push_macro("uint3")
+#define dim3 __cuda_builtin_blockDim_t
+#define uint3 __cuda_builtin_threadIdx_t
+#include "curand_mtgp32_kernel.h"
+#pragma pop_macro("dim3")
+#pragma pop_macro("uint3")
+#pragma pop_macro("__USE_FAST_MATH__")
+
+#endif // __CUDA__
+#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__

+ 43 - 0
demo/include/__stddef_max_align_t.h

@@ -0,0 +1,43 @@
+/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+ *
+ * Copyright (c) 2014 Chandler Carruth
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_MAX_ALIGN_T_DEFINED
+#define __CLANG_MAX_ALIGN_T_DEFINED
+
+#if defined(_MSC_VER)
+typedef double max_align_t;
+#elif defined(__APPLE__)
+typedef long double max_align_t;
+#else
+// Define 'max_align_t' to match the GCC definition.
+typedef struct {
+  long long __clang_max_align_nonce1
+      __attribute__((__aligned__(__alignof__(long long))));
+  long double __clang_max_align_nonce2
+      __attribute__((__aligned__(__alignof__(long double))));
+} max_align_t;
+#endif
+
+#endif

+ 151 - 0
demo/include/__wmmintrin_aes.h

@@ -0,0 +1,151 @@
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_AES_H
+#define _WMMINTRIN_AES_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+
+/// \brief Performs a single round of AES encryption, transforming the state
+///    value from the first source operand using a 128-bit round key value
+///    contained in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES encryption, transforming the state
+///    value from the first source operand using a 128-bit round key value
+///    contained in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
+}
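+
+/* Usage sketch (illustrative, not part of the upstream header): a full
+ * AES-128 block encryption XORs the state with round key 0, applies
+ * _mm_aesenc_si128 with round keys 1..9, and finishes with
+ * _mm_aesenclast_si128 and round key 10 (rk[] holds the expanded key
+ * schedule):
+ *
+ *   __m128i s = _mm_xor_si128(block, rk[0]);
+ *   for (int i = 1; i < 10; ++i)
+ *     s = _mm_aesenc_si128(s, rk[i]);
+ *   s = _mm_aesenclast_si128(s, rk[10]);
+ */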
+
+/// \brief Performs a single round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdec_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Performs the final round of AES decryption using the Equivalent
+///    Inverse Cipher, transforming the state value from the first source
+///    operand using a 128-bit round key value contained in the second source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdeclast_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+///    contained in the source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesimc_si128(__m128i __V)
+{
+  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
+}
+
+/// \brief Generates a round key for AES encryption, operating on 128-bit data
+///    specified in the first source operand and using an 8-bit round constant
+///    specified by the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
+///
+/// \param C
+///    A 128-bit integer vector that is used to generate the AES encryption key.
+/// \param R
+///    An 8-bit round constant used to generate the AES encryption key.
+/// \returns A 128-bit round key for AES encryption.
+#define _mm_aeskeygenassist_si128(C, R) \
+  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif  /* _WMMINTRIN_AES_H */

+ 57 - 0
demo/include/__wmmintrin_pclmul.h

@@ -0,0 +1,57 @@
+/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef _WMMINTRIN_PCLMUL_H
+#define _WMMINTRIN_PCLMUL_H
+
+/// \brief Multiplies two 64-bit integer values, which are selected from source
+///    operands using the immediate-value operand. The multiplication is a
+///    carry-less multiplication, and the 128-bit integer product is stored in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __Y
+///    A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __I
+///    An immediate value specifying which 64-bit values to select from the
+///    operands. Bit 0 is used to select a value from operand \a __X, and bit
+///    4 is used to select a value from operand \a __Y: \n
+///    Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+///    Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+///    Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+///    multiplication of the selected 64-bit values.
+#define _mm_clmulepi64_si128(__X, __Y, __I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
+                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+
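+/* Usage sketch (illustrative, not part of the upstream header): an immediate
+ * of 0x00 multiplies the low 64-bit halves of both operands, while 0x11
+ * multiplies the high halves:
+ *
+ *   __m128i prod = _mm_clmulepi64_si128(a, b, 0x00);
+ */
+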
+#endif /* _WMMINTRIN_PCLMUL_H */

+ 86 - 0
demo/include/adxintrin.h

@@ -0,0 +1,86 @@
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+               unsigned int *__p)
+{
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are also available if __ADX__ undefined */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_addcarry_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_addcarry_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+              unsigned int *__p)
+{
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long  *__p)
+{
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
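+
+/* Usage sketch (illustrative, not part of the upstream header): the returned
+ * carry can be chained into the next call to add multi-word integers, e.g.
+ * two 64-bit values held as 32-bit limbs a_hi:a_lo and b_hi:b_lo:
+ *
+ *   unsigned int lo, hi;
+ *   unsigned char c = _addcarry_u32(0, a_lo, b_lo, &lo);
+ *   (void)_addcarry_u32(c, a_hi, b_hi, &hi);
+ */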
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */

File diff is too large to display
+ 16736 - 0
demo/include/altivec.h


+ 193 - 0
demo/include/ammintrin.h

@@ -0,0 +1,193 @@
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a x are extracted. If the length is zero but the
+///    index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+///    extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+                                  (char)(len), (char)(idx)))
+
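+/* Usage sketch (illustrative, not part of the upstream header): extract the
+ * 8-bit field that starts at bit 16 of the low 64 bits of x:
+ *
+ *   __m128i field = _mm_extracti_si64(x, 8, 16);
+ */
+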
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand at the index and of the length specified by
+///    \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Specifies the index of the least significant bit at [13:8] and the
+///    length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+///    length is interpreted as 64. If the sum of the index and length is
+///    greater than 64, the result is undefined. If the length and index are
+///    both zero, bits [63:0] of parameter \a __x are extracted. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a y into the lower 64 bits of the destination integer vector \a x at
+///    the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length \a len and by the index \a idx specifying the
+///    least significant bit.
+/// \param y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a y of length \a len.
+/// \param len
+///    Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+///    are zero, the length is interpreted as 64.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit; the other
+///    bits are ignored. If the sum of the index and length is greater than 64,
+///    the result is undefined. If the length and index are both zero, bits
+///    [63:0] of parameter \a y are inserted into parameter \a x. If the length
+///    is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a x with the specified bitfields replaced by the
+///    lower bits of source operand \a y. The upper 64 bits of the return value
+///    are undefined.
+#define _mm_inserti_si64(x, y, len, idx) \
+  ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+                                    (__v2di)(__m128i)(y), \
+                                    (char)(len), (char)(idx)))
+
+/// \brief Inserts bits of a specified length from the source integer vector
+///    \a __y into the lower 64 bits of the destination integer vector \a __x
+///    at the index and of the length specified by \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param __x
+///    The destination operand where bits will be inserted. The inserted bits
+///    are defined by the length and by the index of the least significant bit
+///    specified by operand \a __y.
+/// \param __y
+///    The source operand containing the bits to be extracted. The extracted
+///    bits are the least significant bits of operand \a __y with length
+///    specified by bits [69:64]. These are inserted into the destination at the
+///    index specified by bits [77:72]; all other bits are ignored. If bits
+///    [69:64] are zero, the length is interpreted as 64. If the sum of the
+///    index and length is greater than 64, the result is undefined. If the
+///    length and index are both zero, bits [63:0] of parameter \a __y are
+///    inserted into parameter \a __x. If the length is zero but the index is
+///    non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+///    destination operand \a __x with the specified bitfields replaced by the
+///    lower bits of source operand \a __y. The upper 64 bits of the return
+///    value are undefined.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    The 64-bit double-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+  __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    The 32-bit single-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+  __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */

+ 49 - 0
demo/include/arm64intr.h

@@ -0,0 +1,49 @@
+/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <arm64intr.h>
+#else
+
+#ifndef __ARM64INTR_H
+#define __ARM64INTR_H
+
+typedef enum
+{
+  _ARM64_BARRIER_SY    = 0xF,
+  _ARM64_BARRIER_ST    = 0xE,
+  _ARM64_BARRIER_LD    = 0xD,
+  _ARM64_BARRIER_ISH   = 0xB,
+  _ARM64_BARRIER_ISHST = 0xA,
+  _ARM64_BARRIER_ISHLD = 0x9,
+  _ARM64_BARRIER_NSH   = 0x7,
+  _ARM64_BARRIER_NSHST = 0x6,
+  _ARM64_BARRIER_NSHLD = 0x5,
+  _ARM64_BARRIER_OSH   = 0x3,
+  _ARM64_BARRIER_OSHST = 0x2,
+  _ARM64_BARRIER_OSHLD = 0x1
+} _ARM64INTR_BARRIER_TYPE;
+
+#endif /* __ARM64INTR_H */
+#endif /* _MSC_VER */

+ 626 - 0
demo/include/arm_acle.h

@@ -0,0 +1,626 @@
+/*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ARM_ACLE_H
+#define __ARM_ACLE_H
+
+#ifndef __ARM_ACLE
+#error "ACLE intrinsics support not enabled."
+#endif
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 8.3 Memory barriers */
+#if !defined(_MSC_VER)
+#define __dmb(i) __builtin_arm_dmb(i)
+#define __dsb(i) __builtin_arm_dsb(i)
+#define __isb(i) __builtin_arm_isb(i)
+#endif
+
+/* 8.4 Hints */
+
+#if !defined(_MSC_VER)
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
+  __builtin_arm_wfi();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
+  __builtin_arm_wfe();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
+  __builtin_arm_sev();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
+  __builtin_arm_sevl();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
+  __builtin_arm_yield();
+}
+#endif
+
+#if __ARM_32BIT_STATE
+#define __dbg(t) __builtin_arm_dbg(t)
+#endif
+
+/* 8.5 Swap */
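+/* __swp atomically stores __x to *__p and returns the previous value,
+   retrying the LDREX/STREX sequence until the exclusive store succeeds. */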
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__swp(uint32_t __x, volatile uint32_t *__p) {
+  uint32_t v;
+  do
+    v = __builtin_arm_ldrex(__p);
+  while (__builtin_arm_strex(__x, __p));
+  return v;
+}
+
+/* 8.6 Memory prefetch intrinsics */
+/* 8.6.1 Data prefetch */
+#define __pld(addr) __pldx(0, 0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, 1)
+#else
+#define __pldx(access_kind, cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
+#endif
+
+/* 8.6.2 Instruction prefetch */
+#define __pli(addr) __plix(0, 0, addr)
+
+#if __ARM_32BIT_STATE
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, 0)
+#else
+#define __plix(cache_level, retention_policy, addr) \
+  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
+#endif
+
+/* 8.7 NOP */
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
+  __builtin_arm_nop();
+}
+
+/* 9 DATA-PROCESSING INTRINSICS */
+/* 9.2 Miscellaneous data-processing intrinsics */
+/* ROR */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__ror(uint32_t __x, uint32_t __y) {
+  __y %= 32;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (32 - __y));
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rorll(uint64_t __x, uint32_t __y) {
+  __y %= 64;
+  if (__y == 0)
+    return __x;
+  return (__x >> __y) | (__x << (64 - __y));
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rorl(unsigned long __x, uint32_t __y) {
+#if __SIZEOF_LONG__ == 4
+  return __ror(__x, __y);
+#else
+  return __rorll(__x, __y);
+#endif
+}
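+
+/*
+ * Illustrative example (operand values are assumptions): the rotate count is
+ * reduced modulo the operand width, so rotating by 0 or by the full width
+ * returns the input unchanged:
+ *
+ *   uint32_t r1 = __ror(0x12345678U, 8);    // 0x78123456
+ *   uint32_t r2 = __ror(0x12345678U, 32);   // 0x12345678
+ */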
+
+
+/* CLZ */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clz(uint32_t __t) {
+  return __builtin_clz(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__clzl(unsigned long __t) {
+  return __builtin_clzl(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__clzll(uint64_t __t) {
+  return __builtin_clzll(__t);
+}
+
+/* REV */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev(uint32_t __t) {
+  return __builtin_bswap32(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__revl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_bswap32(__t);
+#else
+  return __builtin_bswap64(__t);
+#endif
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__revll(uint64_t __t) {
+  return __builtin_bswap64(__t);
+}
+
+/* REV16 */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rev16(uint32_t __t) {
+  return __ror(__rev(__t), 16);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rev16ll(uint64_t __t) {
+  return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rev16l(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+    return __rev16(__t);
+#else
+    return __rev16ll(__t);
+#endif
+}
+
+/* REVSH */
+static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
+__revsh(int16_t __t) {
+  return __builtin_bswap16(__t);
+}
+
+/* RBIT */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__rbit(uint32_t __t) {
+  return __builtin_arm_rbit(__t);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__rbitll(uint64_t __t) {
+#if __ARM_32BIT_STATE
+  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
+         __builtin_arm_rbit(__t >> 32);
+#else
+  return __builtin_arm_rbit64(__t);
+#endif
+}
+
+static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
+__rbitl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __rbit(__t);
+#else
+  return __rbitll(__t);
+#endif
+}
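+
+/*
+ * Worked example (illustrative values, not from the ACLE text):
+ *
+ *   __rev(0x12345678U)   == 0x78563412   // whole-word byte reverse
+ *   __rev16(0x12345678U) == 0x34127856   // byte reverse within each halfword
+ *   __rbit(0x00000001U)  == 0x80000000   // bit reverse
+ */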
+
+/*
+ * 9.3 16-bit multiplications
+ */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwt(__a, __b);
+}
+#endif
+
+/*
+ * 9.4 Saturating intrinsics
+ *
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
+ * intrinsics are implemented and the flag is enabled.
+ */
+/* 9.4.1 Width-specified saturation intrinsics */
+#if __ARM_FEATURE_SAT
+#define __ssat(x, y) __builtin_arm_ssat(x, y)
+#define __usat(x, y) __builtin_arm_usat(x, y)
+#endif
+
+/* 9.4.2 Saturating addition and subtraction intrinsics */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qadd(int32_t __t, int32_t __v) {
+  return __builtin_arm_qadd(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qsub(int32_t __t, int32_t __v) {
+  return __builtin_arm_qsub(__t, __v);
+}
+
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__qdbl(int32_t __t) {
+  return __builtin_arm_qadd(__t, __t);
+}
+#endif
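+
+/*
+ * Illustrative example (operand values are assumptions): the Q-flag
+ * arithmetic clamps to the signed 32-bit range instead of wrapping:
+ *
+ *   __qadd(INT32_MAX, 1)   // saturates to INT32_MAX
+ *   __qsub(INT32_MIN, 1)   // saturates to INT32_MIN
+ *   __qdbl(0x60000000)     // 2 * 0x60000000 saturates to INT32_MAX
+ */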
+
+/* 9.4.3 Accumulating multiplications */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawt(__a, __b, __c);
+}
+#endif
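+
+/*
+ * Illustrative example (packed operand values are assumptions): the b/t
+ * suffixes select the bottom/top 16-bit halves of each operand, so with
+ * __a = 0x00030002 and __b = 0x00050004,
+ *
+ *   __smlabb(__a, __b, 10) == 2*4 + 10 == 18
+ *   __smlatt(__a, __b, 10) == 3*5 + 10 == 25
+ */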
+
+
+/* 9.5.4 Parallel 16-bit saturation */
+#if __ARM_FEATURE_SIMD32
+#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
+#define __usat16(x, y) __builtin_arm_usat16(x, y)
+#endif
+
+/* 9.5.5 Packing and unpacking */
+#if __ARM_FEATURE_SIMD32
+typedef int32_t int8x4_t;
+typedef int32_t int16x2_t;
+typedef uint32_t uint8x4_t;
+typedef uint32_t uint16x2_t;
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_sxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtb16(int8x4_t __a) {
+  return __builtin_arm_sxtb16(__a);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_uxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtb16(int8x4_t __a) {
+  return __builtin_arm_uxtb16(__a);
+}
+#endif
+
+/* 9.5.6 Parallel selection */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__sel(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_sel(__a, __b);
+}
+#endif
+
+/* 9.5.7 Parallel 8-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__sadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_sadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__ssub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_ssub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__usub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usub8(__a, __b);
+}
+#endif
+
+/* 9.5.8 Sum of 8-bit absolute differences */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usad8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usad8(__a, __b);
+}
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
+  return __builtin_arm_usada8(__a, __b, __c);
+}
+#endif
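+
+/*
+ * Illustrative example (lane values are assumptions): __usad8 sums the
+ * absolute differences of the four byte lanes; __usada8 also accumulates:
+ *
+ *   uint8x4_t a = 0x0A050301;           // lanes {1, 3, 5, 10}
+ *   uint8x4_t b = 0x01020304;           // lanes {4, 3, 2, 1}
+ *   uint32_t  s = __usad8(a, b);        // 3 + 0 + 3 + 9 == 15
+ *   uint32_t  t = __usada8(a, b, 100);  // 115
+ */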
+
+/* 9.5.9 Parallel 16-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usub16(__a, __b);
+}
+#endif
+
+/* 9.5.10 Parallel 16-bit multiplications */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlad(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smladx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlald(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlaldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsd(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsdx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsld(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuad(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuad(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuadx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuadx(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusd(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusd(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusdx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusdx(__a, __b);
+}
+#endif
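+
+/*
+ * Illustrative example (packed lane values are assumptions): __smlad forms a
+ * two-element dot product and adds the accumulator, the usual building block
+ * for fixed-point filters:
+ *
+ *   int16x2_t x = (int16x2_t)0x00020001;   // lanes {1, 2}
+ *   int16x2_t c = (int16x2_t)0x00040003;   // lanes {3, 4}
+ *   int32_t acc = __smlad(x, c, 0);        // 1*3 + 2*4 == 11
+ */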
+
+/* 9.7 CRC32 intrinsics */
+#if __ARM_FEATURE_CRC32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32b(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32b(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32h(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32h(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32w(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32w(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32d(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32d(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cb(uint32_t __a, uint8_t __b) {
+  return __builtin_arm_crc32cb(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32ch(uint32_t __a, uint16_t __b) {
+  return __builtin_arm_crc32ch(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cw(uint32_t __a, uint32_t __b) {
+  return __builtin_arm_crc32cw(__a, __b);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__crc32cd(uint32_t __a, uint64_t __b) {
+  return __builtin_arm_crc32cd(__a, __b);
+}
+#endif
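+
+/*
+ * Illustrative sketch (the helper below is an assumption, not ACLE text):
+ * a byte-at-a-time CRC32-C over a buffer, using the customary all-ones
+ * initial value and final inversion:
+ *
+ *   static uint32_t __crc32c_bytes(const uint8_t *p, unsigned n) {
+ *     uint32_t crc = 0xFFFFFFFFU;
+ *     while (n--)
+ *       crc = __crc32cb(crc, *p++);
+ *     return ~crc;
+ *   }
+ */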
+
+/* 10.1 Special register intrinsics */
+#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
+#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
+#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
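+
+/*
+ * Illustrative example (the register name is an assumption and is target
+ * specific; TPIDR_EL0 is the AArch64 user thread-pointer register):
+ *
+ *   uint64_t __tp = __arm_rsr64("TPIDR_EL0");
+ *   __arm_wsr64("TPIDR_EL0", __tp);
+ */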
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __ARM_ACLE_H */

File diff too large to display
+ 72599 - 0
demo/include/arm_neon.h


+ 45 - 0
demo/include/armintr.h

@@ -0,0 +1,45 @@
+/*===---- armintr.h - ARM Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <armintr.h>
+#else
+
+#ifndef __ARMINTR_H
+#define __ARMINTR_H
+
+typedef enum
+{
+  _ARM_BARRIER_SY    = 0xF,
+  _ARM_BARRIER_ST    = 0xE,
+  _ARM_BARRIER_ISH   = 0xB,
+  _ARM_BARRIER_ISHST = 0xA,
+  _ARM_BARRIER_NSH   = 0x7,
+  _ARM_BARRIER_NSHST = 0x6,
+  _ARM_BARRIER_OSH   = 0x3,
+  _ARM_BARRIER_OSHST = 0x2
+} _ARMINTR_BARRIER_TYPE;
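+
+/*
+ * Illustrative example (assumes the MSVC-mode __dmb intrinsic from
+ * <intrin.h> is available): these enumerators are passed as the barrier
+ * type, e.g. __dmb(_ARM_BARRIER_ISH) for an inner-shareable data barrier.
+ */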
+
+#endif /* __ARMINTR_H */
+#endif /* _MSC_VER */

File diff too large to display
+ 1308 - 0
demo/include/avx2intrin.h


+ 97 - 0
demo/include/avx512bitalgintrin.h

@@ -0,0 +1,97 @@
+/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BITALGINTRIN_H
+#define __AVX512BITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi16(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
+              (__v32hi) _mm512_popcnt_epi16(__B),
+              (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
+{
+  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
+              __U,
+              __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi8(__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
+              (__v64qi) _mm512_popcnt_epi8(__B),
+              (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
+{
+  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
+              __U,
+              __B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
+              (__v64qi) __B,
+              __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
+{
+  return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
+              __A,
+              __B);
+}
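+
+/*
+ * Illustrative usage sketch (variable names are assumptions; requires an
+ * AVX512BITALG target):
+ *
+ *   __m512i v   = _mm512_set1_epi16(0x0F0F);                 // 8 set bits per lane
+ *   __m512i all = _mm512_popcnt_epi16(v);                    // every lane == 8
+ *   __m512i low = _mm512_maskz_popcnt_epi16(0x0000FFFF, v);  // low 16 lanes only
+ */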
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

File diff too large to display
+ 2140 - 0
demo/include/avx512bwintrin.h


+ 145 - 0
demo/include/avx512cdintrin.h

@@ -0,0 +1,145 @@
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+               (__v8di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                 (__v8di) _mm512_setzero_si512 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+               (__v16si) __W,
+               (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                 (__v16si) _mm512_setzero_si512 (),
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                 (__v16si) __W,
+                 (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+             (__v16si) _mm512_setzero_si512 (),
+             (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                 (__v8di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+             (__v8di) _mm512_setzero_si512 (),
+             (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) _mm512_set1_epi64((long long) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) _mm512_set1_epi32((int) __A);
+}
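+
+/*
+ * Illustrative example (lane values are assumptions): _mm512_lzcnt_epi32
+ * counts leading zero bits per lane; _mm512_conflict_epi32 reports, per
+ * lane, a bitmask of the earlier lanes holding the same value:
+ *
+ *   __m512i v  = _mm512_set1_epi32(1);
+ *   __m512i lz = _mm512_lzcnt_epi32(v);     // every lane == 31
+ *   __m512i cf = _mm512_conflict_epi32(v);  // lane i == (1 << i) - 1
+ */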
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

File diff too large to display
+ 1331 - 0
demo/include/avx512dqintrin.h


+ 285 - 0
demo/include/avx512erintrin.h

@@ -0,0 +1,285 @@
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+// exp2a23
+#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+                                      (__v8df)_mm512_setzero_pd(), \
+                                      (__mmask8)(M), (int)(R)); })
+
+#define _mm512_exp2a23_pd(A) \
+  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                     (int)(R)); })
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+                                     (__v16sf)_mm512_setzero_ps(), \
+                                     (__mmask16)(M), (int)(R)); })
+
+#define _mm512_exp2a23_ps(A) \
+  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+// rsqrt28
+#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                         (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+                                         (__v8df)_mm512_setzero_pd(), \
+                                         (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_pd(A) \
+  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                        (int)(R)); })
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+                                        (__v16sf)_mm512_setzero_ps(), \
+                                        (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rsqrt28_ps(A) \
+  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)(__m128)(S), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+                                              (__v4sf)(__m128)(B), \
+                                              (__v4sf)_mm_setzero_ps(), \
+                                              (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_ss(A, B) \
+  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)(__m128d)(S), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+                                               (__v2df)(__m128d)(B), \
+                                               (__v2df)_mm_setzero_pd(), \
+                                               (__mmask8)(M), (int)(R)); })
+
+#define _mm_rsqrt28_sd(A, B) \
+  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+// rcp28
+#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
+                                       (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+                                       (__v8df)_mm512_setzero_pd(), \
+                                       (__mmask8)(M), (int)(R)); })
+
+#define _mm512_rcp28_pd(A) \
+  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)-1, (int)(R)); })
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
+                                      (int)(R)); })
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+                                      (__v16sf)_mm512_setzero_ps(), \
+                                      (__mmask16)(M), (int)(R)); })
+
+#define _mm512_rcp28_ps(A) \
+  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)(__m128)(S), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+                                            (__v4sf)(__m128)(B), \
+                                            (__v4sf)_mm_setzero_ps(), \
+                                            (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_ss(A, B) \
+  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)-1, (int)(R)); })
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)(__m128d)(S), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+                                             (__v2df)(__m128d)(B), \
+                                             (__v2df)_mm_setzero_pd(), \
+                                             (__mmask8)(M), (int)(R)); })
+
+#define _mm_rcp28_sd(A, B) \
+  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
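+
+/*
+ * Illustrative usage sketch (assumes an AVX512ER-capable target): the *28
+ * forms return reciprocal estimates accurate to roughly 28 bits, so no
+ * Newton-Raphson refinement step is normally needed:
+ *
+ *   __m512 x  = _mm512_set1_ps(4.0f);
+ *   __m512 rs = _mm512_rsqrt28_ps(x);   // ~0.5f in every lane
+ *   __m512 rc = _mm512_rcp28_ps(x);     // ~0.25f in every lane
+ */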
+
+#endif // __AVX512ERINTRIN_H

File diff too large to display
+ 10233 - 0
demo/include/avx512fintrin.h


+ 92 - 0
demo/include/avx512ifmaintrin.h

@@ -0,0 +1,92 @@
+/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAINTRIN_H
+#define __IFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+                   (__v8di) __Y,
+                   (__v8di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+                   (__v8di) __X,
+                   (__v8di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+              (__v8di) __Y,
+              (__v8di) __Z,
+              (__mmask8) __M);
+}
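+
+/*
+ * Illustrative example (operand values are assumptions): the first operand
+ * is the accumulator; the low 52 bits of the other two operands are
+ * multiplied and the low (lo) or high (hi) 52 bits of the 104-bit product
+ * are added to it:
+ *
+ *   __m512i x   = _mm512_set1_epi64(3);
+ *   __m512i y   = _mm512_set1_epi64(5);
+ *   __m512i acc = _mm512_set1_epi64(1);
+ *   __m512i lo  = _mm512_madd52lo_epu64(acc, x, y);  // every lane == 1 + 15 == 16
+ */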
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 149 - 0
demo/include/avx512ifmavlintrin.h

@@ -0,0 +1,149 @@
+/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAVLINTRIN_H
+#define __IFMAVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
+                   (__v2di) __Y,
+                   (__v2di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+                   (__v2di) __X,
+                   (__v2di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+              (__v2di) __Y,
+              (__v2di) __Z,
+              (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
+                   (__v4di) __Y,
+                   (__v4di) __Z,
+                   (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+                   (__v4di) __X,
+                   (__v4di) __Y,
+                   (__mmask8) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+              (__v4di) __Y,
+              (__v4di) __Z,
+              (__mmask8) __M);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 111 - 0
demo/include/avx512pfintrin.h

@@ -0,0 +1,111 @@
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfdps((__mmask16) -1, \
+                             (__v16si)(__m512i)(index), (int const *)(addr), \
+                             (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (long long const *)(addr), (int)(scale), \
+                             (int)(hint)); })
+              
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
+  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
+                             (int const *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
+                              (__v16si)(__m512i)(index), (int *)(addr), \
+                              (int)(scale), (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (long long *)(addr), (int)(scale), \
+                              (int)(hint)); })
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+                              (int *)(addr), (int)(scale), (int)(hint)); })
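+
+/*
+ * Illustrative sketch (the table, index vector and hint are assumptions):
+ * prefetching eight gather targets into the cache ahead of a real gather:
+ *
+ *   double table[1024];
+ *   __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
+ *   _mm512_prefetch_i32gather_pd(idx, table, 8, _MM_HINT_T0);
+ */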
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 391 - 0
demo/include/avx512vbmi2intrin.h

@@ -0,0 +1,391 @@
+/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VBMI2INTRIN_H
+#define __AVX512VBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+              (__v32hi) _mm512_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+              (__v64qi) _mm512_setzero_qi(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
+{
+  __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
+{
+  __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+              (__v32hi) _mm512_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+              (__v64qi) _mm512_setzero_qi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+              (__v32hi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+              (__v32hi) _mm512_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+              (__v64qi) __S,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+              (__v64qi) _mm512_setzero_qi(),
+              __U);
+}
+
+#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
+                                          (__v8di)(B), \
+                                          (int)(I), \
+                                          (__v8di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shldi_epi64(U, A, B, I) \
+  _mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shldi_epi64(A, B, I) \
+  _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
+                                          (__v16si)(B), \
+                                          (int)(I), \
+                                          (__v16si)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shldi_epi32(U, A, B, I) \
+  _mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shldi_epi32(A, B, I) \
+  _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+
+#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
+                                          (__v32hi)(B), \
+                                          (int)(I), \
+                                          (__v32hi)(S), \
+                                          (__mmask32)(U)); })
+
+#define _mm512_maskz_shldi_epi16(U, A, B, I) \
+  _mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shldi_epi16(A, B, I) \
+  _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+
+#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
+                                          (__v8di)(B), \
+                                          (int)(I), \
+                                          (__v8di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
+  _mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shrdi_epi64(A, B, I) \
+  _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
+                                          (__v16si)(B), \
+                                          (int)(I), \
+                                          (__v16si)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
+  _mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shrdi_epi32(A, B, I) \
+  _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+
+#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
+                                          (__v32hi)(B), \
+                                          (int)(I), \
+                                          (__v32hi)(S), \
+                                          (__mmask32)(U)); })
+
+#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
+  _mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm512_shrdi_epi16(A, B, I) \
+  _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S,
+              (__v8di) __A,
+              (__v8di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S,
+              (__v32hi) __A,
+              (__v32hi) __B,
+              (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
+

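As a usage sketch for the VBMI2 compress-store intrinsic defined above, the helper below packs the selected bytes of a vector into contiguous memory. The function name and buffer handling are illustrative assumptions; -mavx512vbmi2 is required:

    #include <immintrin.h>

    /* Hypothetical helper: store only the bytes of v selected by keep, in order,
     * at dst, and return how many bytes were written. */
    static inline int pack_selected_bytes(unsigned char *dst, __m512i v, __mmask64 keep)
    {
      _mm512_mask_compressstoreu_epi8(dst, keep, v);  /* VPCOMPRESSB to memory */
      return __builtin_popcountll(keep);              /* one byte per set mask bit */
    }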
+ 137 - 0
demo/include/avx512vbmiintrin.h

@@ -0,0 +1,137 @@
+/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIINTRIN_H
+#define __VBMIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+         __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+              (__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+        __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+              /* idx */ ,
+              (__v64qi) __A,
+              (__v64qi) __B,
+              (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+         __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+               /* idx */ ,
+               (__v64qi) __A,
+               (__v64qi) __B,
+               (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_undefined_epi32 (),
+                 (__mmask64) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) _mm512_setzero_si512(),
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+             __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                 (__v64qi) __A,
+                 (__v64qi) __W,
+                 (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) __W,
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_setzero_si512 (),
+                (__mmask64) __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                (__v64qi) __Y,
+                (__v64qi) _mm512_undefined_epi32 (),
+                (__mmask64) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

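A minimal sketch of the full-width byte permute (_mm512_permutexvar_epi8, VPERMB) declared above, here used to reverse the 64 bytes of a register. The helper name and index pattern are illustrative; -mavx512vbmi (plus -mavx512f for the load) is assumed:

    #include <immintrin.h>

    /* Hypothetical example: output byte i is taken from input byte 63-i. */
    static inline __m512i reverse_bytes_512(__m512i v)
    {
      __attribute__((aligned(64))) unsigned char rev[64];
      for (int i = 0; i < 64; ++i)
        rev[i] = (unsigned char)(63 - i);
      __m512i idx = _mm512_load_si512((const void *)rev);
      return _mm512_permutexvar_epi8(idx, v);   /* idx selects source bytes */
    }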
+ 247 - 0
demo/include/avx512vbmivlintrin.h

@@ -0,0 +1,247 @@
+/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIVLINTRIN_H
+#define __VBMIVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+              (__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+         __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+              (__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+              /* idx */ ,
+              (__v16qi) __A,
+              (__v16qi) __B,
+              (__mmask16)
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+               /* idx */ ,
+               (__v16qi) __A,
+               (__v16qi) __B,
+               (__mmask16)
+               __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+        __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+              /* idx */ ,
+              (__v32qi) __A,
+              (__v32qi) __B,
+              (__mmask32)
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+         __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+               /* idx */ ,
+               (__v32qi) __A,
+               (__v32qi) __B,
+               (__mmask32)
+               __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_undefined_si128 (),
+                 (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) _mm_setzero_si128 (),
+                 (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+          __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                 (__v16qi) __A,
+                 (__v16qi) __W,
+                 (__mmask16) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_undefined_si256 (),
+                 (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) _mm256_setzero_si256 (),
+                 (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+             __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                 (__v32qi) __A,
+                 (__v32qi) __W,
+                 (__mmask32) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi) __W,
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_setzero_si128 (),
+                (__mmask16) __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                (__v16qi) __Y,
+                (__v16qi)
+                _mm_undefined_si128 (),
+                (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi) __W,
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_setzero_si256 (),
+                (__mmask32) __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                (__v32qi) __Y,
+                (__v32qi)
+                _mm256_undefined_si256 (),
+                (__mmask32) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

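For the 128/256-bit VBMI variants above, a common use of the byte permute is an in-register lookup table. A hedged sketch, assuming -mavx512vbmi -mavx512vl, with an illustrative helper name:

    #include <immintrin.h>

    /* Hypothetical example: treat table32 as a 32-entry substitution table and
     * apply it to every byte of `bytes` with 256-bit VPERMB.  Only the low five
     * bits of each source byte are used at this width. */
    static inline __m256i substitute_bytes_256(__m256i bytes, __m256i table32)
    {
      return _mm256_permutexvar_epi8(bytes, table32);
    }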
+ 157 - 0
demo/include/avx512vlbitalgintrin.h

@@ -0,0 +1,157 @@
+/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBITALGINTRIN_H
+#define __AVX512VLBITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_popcnt_epi16(__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
+              (__v16hi) _mm256_popcnt_epi16(__B),
+              (__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
+{
+  return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
+              __U,
+              __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_popcnt_epi16(__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
+              (__v8hi) _mm128_popcnt_epi16(__B),
+              (__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
+{
+  return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
+              __U,
+              __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_popcnt_epi8(__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
+              (__v32qi) _mm256_popcnt_epi8(__B),
+              (__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
+{
+  return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
+              __U,
+              __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_popcnt_epi8(__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
+              (__v16qi) _mm128_popcnt_epi8(__B),
+              (__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
+{
+  return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
+              __U,
+              __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
+              (__v32qi) __B,
+              __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
+{
+  return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
+              __A,
+              __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
+              (__v16qi) __B,
+              __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
+{
+  return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
+              __A,
+              __B);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

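The BITALG population-count intrinsics above work per lane; combining them with an AVX2 byte-sum gives a whole-register popcount. A sketch under that assumption (helper name illustrative; -mavx512bitalg -mavx512vl and AVX2 assumed):

    #include <immintrin.h>

    /* Hypothetical helper: total number of set bits in a 256-bit value. */
    static inline unsigned popcount_256(__m256i v)
    {
      __m256i per_byte = _mm256_popcnt_epi8(v);                       /* 0..8 per byte */
      __m256i sums = _mm256_sad_epu8(per_byte, _mm256_setzero_si256());
      __attribute__((aligned(32))) long long part[4];                 /* four 64-bit partial sums */
      _mm256_store_si256((__m256i *)part, sums);
      return (unsigned)(part[0] + part[1] + part[2] + part[3]);
    }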
File diff is too large to display
+ 2781 - 0
demo/include/avx512vlbwintrin.h


+ 263 - 0
demo/include/avx512vlcdintrin.h

@@ -0,0 +1,263 @@
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{ 
+  return (__m128i) _mm_set1_epi64x((long long) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) _mm256_set1_epi64x((long long)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) _mm_set1_epi32((int)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) _mm256_set1_epi32((int)__A);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+               (__v2di)
+               _mm_setzero_di (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di)  _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+               (__v4di) _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_undefined_si128 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+               (__v4si) _mm_setzero_si128 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) _mm256_undefined_si256 (),
+               (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si) __W,
+               (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+               (__v8si)
+               _mm256_setzero_si256 (),
+               (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                 (__v4si)
+                 _mm_setzero_si128 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                 (__v8si)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                 (__v2di)
+                 _mm_setzero_di (),
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di) __W,
+                 (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                 (__v4di)
+                 _mm256_setzero_si256 (),
+                 (__mmask8) __U);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512VLCDINTRIN_H */

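The conflict-detection intrinsics above report, per lane, which earlier lanes hold an equal value; this is typically used to find duplicate indices before a scatter or a gather-update loop. A minimal sketch, assuming -mavx512cd -mavx512vl, with an illustrative helper name:

    #include <immintrin.h>

    /* Hypothetical helper: mask of lanes whose 32-bit index repeats an
     * earlier (lower-numbered) lane. */
    static inline __mmask8 duplicate_index_lanes(__m256i idx)
    {
      __m256i conflicts = _mm256_conflict_epi32(idx);      /* VPCONFLICTD */
      return _mm256_test_epi32_mask(conflicts, conflicts); /* nonzero lane => duplicate */
    }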
File diff is too large to display
+ 1198 - 0
demo/include/avx512vldqintrin.h


File diff is too large to display
+ 8547 - 0
demo/include/avx512vlintrin.h


+ 748 - 0
demo/include/avx512vlvbmi2intrin.h

@@ -0,0 +1,748 @@
+/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVBMI2INTRIN_H
+#define __AVX512VLVBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
+
+static  __inline __m128i __DEFAULT_FN_ATTRS
+_mm128_setzero_hi(void) {
+  return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+              (__v8hi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+              (__v16qi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
+{
+  __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
+{
+  __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+              (__v8hi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+              (__v16qi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+              (__v8hi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+              (__v8hi) _mm128_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+              (__v16qi) __S,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+              (__v16qi) _mm128_setzero_hi(),
+              __U);
+}
+
+static  __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_hi(void) {
+  return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+              (__v16hi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+              (__v32qi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
+{
+  __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
+              __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
+{
+  __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+              (__v16hi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+              (__v32qi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+              (__v16hi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+              (__v16hi) _mm256_setzero_hi(),
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+              (__v32qi) __S,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+              (__v32qi) _mm256_setzero_hi(),
+              __U);
+}
+
+#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
+                                          (__v4di)(B), \
+                                          (int)(I), \
+                                          (__v4di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shldi_epi64(U, A, B, I) \
+  _mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shldi_epi64(A, B, I) \
+  _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
+                                          (__v2di)(B), \
+                                          (int)(I), \
+                                          (__v2di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shldi_epi64(U, A, B, I) \
+  _mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shldi_epi64(A, B, I) \
+  _mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
+                                          (__v8si)(B), \
+                                          (int)(I), \
+                                          (__v8si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shldi_epi32(U, A, B, I) \
+  _mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shldi_epi32(A, B, I) \
+  _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
+                                          (__v4si)(B), \
+                                          (int)(I), \
+                                          (__v4si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shldi_epi32(U, A, B, I) \
+  _mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shldi_epi32(A, B, I) \
+  _mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
+                                          (__v16hi)(B), \
+                                          (int)(I), \
+                                          (__v16hi)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm256_maskz_shldi_epi16(U, A, B, I) \
+  _mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shldi_epi16(A, B, I) \
+  _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
+                                          (__v8hi)(B), \
+                                          (int)(I), \
+                                          (__v8hi)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shldi_epi16(U, A, B, I) \
+  _mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shldi_epi16(A, B, I) \
+  _mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
+                                          (__v4di)(B), \
+                                          (int)(I), \
+                                          (__v4di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
+  _mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shrdi_epi64(A, B, I) \
+  _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
+                                          (__v2di)(B), \
+                                          (int)(I), \
+                                          (__v2di)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
+  _mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shrdi_epi64(A, B, I) \
+  _mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
+                                          (__v8si)(B), \
+                                          (int)(I), \
+                                          (__v8si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
+  _mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shrdi_epi32(A, B, I) \
+  _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
+                                          (__v4si)(B), \
+                                          (int)(I), \
+                                          (__v4si)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
+  _mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shrdi_epi32(A, B, I) \
+  _mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
+                                          (__v16hi)(B), \
+                                          (int)(I), \
+                                          (__v16hi)(S), \
+                                          (__mmask16)(U)); })
+
+#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
+  _mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm256_shrdi_epi16(A, B, I) \
+  _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+
+#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
+  (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
+                                          (__v8hi)(B), \
+                                          (int)(I), \
+                                          (__v8hi)(S), \
+                                          (__mmask8)(U)); })
+
+#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
+  _mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
+
+#define _mm128_shrdi_epi16(A, B, I) \
+  _mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
+              (__v4di) __A,
+              (__v4di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
+              (__v2di) __A,
+              (__v2di) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
+              (__v16hi) __A,
+              (__v16hi) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
+              (__v8hi) __A,
+              (__v8hi) __B,
+              (__mmask8) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
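
The shrdi/shldi macros and the shldv/shrdv functions above are per-lane funnel shifts: each destination lane comes from a 128-bit concatenation of the corresponding lanes of the two sources. A minimal scalar sketch of one 64-bit lane of the right form, assuming the operand order described in the Intel intrinsics guide (the second source supplies the high half):

#include <stdint.h>

/* Scalar model of one 64-bit lane of the right funnel shift (VPSHRDQ).
 * Assumption: b is the high half and a the low half of the concatenation;
 * the result is the low 64 bits after shifting right by imm. The vector
 * intrinsics above apply this to every lane, optionally under a write mask
 * (mask) or with zeroing of unselected lanes (maskz). */
static uint64_t shrd64_lane(uint64_t a, uint64_t b, unsigned imm)
{
    unsigned __int128 cat = ((unsigned __int128)b << 64) | a;
    return (uint64_t)(cat >> (imm & 63));
}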

+ 254 - 0
demo/include/avx512vlvnniintrin.h

@@ -0,0 +1,254 @@
+/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVNNIINTRIN_H
+#define __AVX512VLVNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
+              (__v8si) __A,
+              (__v8si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
+              (__v4si) __A,
+              (__v4si) __B,
+              (__mmask8) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
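
The dpbusd/dpwssd family above computes per-lane dot products that accumulate into 32-bit lanes. A scalar sketch of one lane of the unsigned-byte-by-signed-byte form (dpbusd), for illustration only:

#include <stdint.h>

/* Scalar model of one 32-bit lane of VPDPBUSD: four unsigned bytes of A are
 * multiplied with the corresponding signed bytes of B, the four products are
 * summed, and the sum is added to the accumulator lane S. The dpwssd forms
 * use pairs of signed 16-bit elements instead; the dpbusds/dpwssds variants
 * additionally saturate the accumulation. */
static int32_t dpbusd_lane(int32_t s, const uint8_t a[4], const int8_t b[4])
{
    int32_t sum = 0;
    for (int i = 0; i < 4; ++i)
        sum += (int32_t)a[i] * (int32_t)b[i];
    return s + sum;
}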

+ 146 - 0
demo/include/avx512vnniintrin.h

@@ -0,0 +1,146 @@
+/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VNNIINTRIN_H
+#define __AVX512VNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
+              (__v16si) __A,
+              (__v16si) __B,
+              (__mmask16) -1);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 70 - 0
demo/include/avx512vpopcntdqintrin.h

@@ -0,0 +1,70 @@
+/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
+ *------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQINTRIN_H
+#define __AVX512VPOPCNTDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd"   \
+                                                            "q")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 99 - 0
demo/include/avx512vpopcntdqvlintrin.h

@@ -0,0 +1,99 @@
+/*===------------- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics
+ *------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQVLINTRIN_H
+#define __AVX512VPOPCNTDQVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectq_128(
+      (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
+  return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
+  return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  return (__m128i)__builtin_ia32_selectd_128(
+      (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
+  return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectq_256(
+      (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
+  return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
+  return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  return (__m256i)__builtin_ia32_selectd_256(
+      (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
+  return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
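
The mask and maskz wrappers above follow the usual AVX-512 pattern: lanes whose mask bit is set receive the population count, while the remaining lanes keep the pass-through value or are zeroed. A scalar sketch of that per-lane selection:

#include <stdint.h>

/* Scalar model of the per-lane select used by the mask/maskz popcount
 * wrappers: a set mask bit selects the popcount result, a clear bit keeps
 * w (mask form) or yields zero (maskz form). */
static uint32_t mask_popcnt32_lane(uint32_t w, int mask_bit, uint32_t a)
{
    return mask_bit ? (uint32_t)__builtin_popcount(a) : w;
}

static uint32_t maskz_popcnt32_lane(int mask_bit, uint32_t a)
{
    return mask_popcnt32_lane(0u, mask_bit, a);   /* maskz = mask with w == 0 */
}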

File diff is too large to display
+ 5162 - 0
demo/include/avxintrin.h


+ 95 - 0
demo/include/bmi2intrin.h

@@ -0,0 +1,95 @@
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si(__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+	   unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
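
A hypothetical usage sketch for the BMI2 deposit/extract pair defined above (building with -mbmi2 is assumed): _pext_u32 compresses the bits selected by a mask into the low bits, _pdep_u32 scatters them back, and the round trip recovers x & mask.

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    unsigned x    = 0xDEADBEEFu;
    unsigned mask = 0x0F0F0F0Fu;

    unsigned packed   = _pext_u32(x, mask);      /* gather the masked bits   */
    unsigned restored = _pdep_u32(packed, mask); /* scatter them back again  */

    printf("%08X -> %08X -> %08X\n", x, packed, restored);
    return restored == (x & mask) ? 0 : 1;       /* round trip keeps x & mask */
}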

+ 382 - 0
demo/include/bmiintrin.h

@@ -0,0 +1,382 @@
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
+
+#define _andn_u32(a, b)   (__andn_u32((a), (b)))
+
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a)      (__blsi_u32((a)))
+
+#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
+
+#define _blsr_u32(a)      (__blsr_u32((a)))
+
+#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+   instruction behaves as BSF on non-BMI targets, there is code that expects
+   to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+  return __X ? __builtin_ctzs(__X) : 16;
+}
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are extracted. Bits [7:0]
+///    specify the index of the least significant bit. Bits [15:8] specify the
+///    number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns A 32-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+  return __X ? __builtin_ctz(__X) : 32;
+}
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b)   (__andn_u64((a), (b)))
+
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a)      (__blsi_u64((a)))
+
+#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
+
+#define _blsr_u64(a)      (__blsr_u64((a)))
+
+#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+
+/// \brief Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///    in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are extracted. Bits
+///    [7:0] specify the index of the least significant bit. Bits [15:8] specify
+///    the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and returns them
+///     in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least significant
+///    bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be extracted.
+///    Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+///    extracted bits.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns A 64-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+  return __X ? __builtin_ctzll(__X) : 64;
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __RELAXED_FN_ATTRS
+
+#endif /* __BMIINTRIN_H */
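
A hypothetical usage sketch for the BMI intrinsics above (building with -mbmi is assumed): _bextr_u32 extracts a bit field by start index and length, and __blsr_u32 strips the lowest set bit, which gives a simple walk over the set bits of a word.

#include <stdio.h>
#include <x86intrin.h>

int main(void)
{
    unsigned v = 0x12345678u;
    unsigned field = _bextr_u32(v, 4, 8);   /* bits [11:4] of v -> 0x67  */

    unsigned bits = 0xB1u;                  /* binary 1011 0001          */
    int count = 0;
    while (bits) {
        bits = __blsr_u32(bits);            /* clear the lowest set bit  */
        ++count;                            /* ends at 4                 */
    }

    printf("field=%02X set bits=%d\n", field, count);
    return 0;
}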

+ 93 - 0
demo/include/cetintrin.h

@@ -0,0 +1,93 @@
+/*===---- cetintrin.h - CET intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CETINTRIN_H
+#define __CETINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("shstk")))
+
+static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
+  __builtin_ia32_incsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
+  __builtin_ia32_incsspq(__a);
+}
+#endif /* __x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
+  return __builtin_ia32_rdsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
+  return __builtin_ia32_rdsspq(__a);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
+  __builtin_ia32_saveprevssp();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
+  __builtin_ia32_rstorssp(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
+  __builtin_ia32_wrssd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
+  __builtin_ia32_wrssq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
+  __builtin_ia32_wrussd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
+  __builtin_ia32_wrussq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
+  __builtin_ia32_setssbsy();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
+  __builtin_ia32_clrssbsy(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CETINTRIN_H */
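
A hypothetical probe sketch for the shadow-stack reads above (building with -mshstk is assumed): RDSSP executes as a no-op when shadow stacks are inactive, so seeding the argument with 0 and getting 0 back is a common way to check whether CET shadow stacks are in use.

#include <stdio.h>
#include <immintrin.h>

int main(void)
{
#ifdef __x86_64__
    /* Returns the shadow stack pointer, or the seed value (0) unchanged
     * when the instruction executes as a no-op because CET is inactive. */
    unsigned long long ssp = _rdsspq(0);
    if (ssp)
        printf("shadow stack pointer: %#llx\n", ssp);
    else
        printf("shadow stacks inactive\n");
#endif
    return 0;
}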

+ 41 - 0
demo/include/clflushoptintrin.h

@@ -0,0 +1,41 @@
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLFLUSHOPTINTRIN_H
+#define __CLFLUSHOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clflushopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflushopt(void const * __m) {
+  __builtin_ia32_clflushopt(__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 52 - 0
demo/include/clwbintrin.h

@@ -0,0 +1,52 @@
+/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLWBINTRIN_H
+#define __CLWBINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clwb")))
+
+/// \brief Writes back to memory the cache line (if modified) that contains the
+/// linear address specified in \a __p from any level of the cache hierarchy in
+/// the cache coherence domain.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLWB </c> instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    written back.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clwb(void const *__p) {
+  __builtin_ia32_clwb(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
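
A hypothetical write-back sketch using _mm_clwb (building with -mclwb is assumed, as is the 64-byte cache-line size): every line covering the range is written back, then an SFENCE orders the write-backs, the usual pattern in persistent-memory code.

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

static void flush_range(const void *addr, size_t len)
{
    const char *p   = (const char *)((uintptr_t)addr & ~(uintptr_t)63);
    const char *end = (const char *)addr + len;
    for (; p < end; p += 64)
        _mm_clwb(p);    /* write back (but keep) each touched cache line */
    _mm_sfence();       /* order the write-backs                         */
}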

+ 50 - 0
demo/include/clzerointrin.h

@@ -0,0 +1,50 @@
+/*===----------------------- clzerointrin.h - CLZERO ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86INTRIN_H
+#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _CLZEROINTRIN_H
+#define _CLZEROINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("clzero")))
+
+/// \brief Loads the cache line address and zeros out the cache line.
+///
+/// \headerfile <clzerointrin.h>
+///
+/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
+///
+/// \param __line
+///    A pointer to a cacheline which needs to be zeroed out.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clzero (void * __line)
+{
+  __builtin_ia32_clzero ((void *)__line);
+}
+
+#undef __DEFAULT_FN_ATTRS 
+
+#endif /* _CLZEROINTRIN_H */
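A short sketch of how the helper above might be used, assuming an AMD CPU with CLZERO and compilation with -mclzero; the 64-byte line size and the name zero_containing_line are illustrative:

#include <x86intrin.h>
#include <stdint.h>

/* CLZERO zeroes the whole cache line containing the address, so round the
 * pointer down to the (assumed 64-byte) line boundary before issuing it. */
static void zero_containing_line(void *p) {
  _mm_clzero((void *)((uintptr_t)p & ~(uintptr_t)63));
}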

+ 302 - 0
demo/include/cpuid.h

@@ -0,0 +1,302 @@
+/*===---- cpuid.h - X86 cpu model detection --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !(__x86_64__ || __i386__)
+#error this header is for x86 only
+#endif
+
+/* Responses to identification request with %eax 0 */
+/* AMD:     "AuthenticAMD" */
+#define signature_AMD_ebx 0x68747541
+#define signature_AMD_edx 0x69746e65
+#define signature_AMD_ecx 0x444d4163
+/* CENTAUR: "CentaurHauls" */
+#define signature_CENTAUR_ebx 0x746e6543
+#define signature_CENTAUR_edx 0x48727561
+#define signature_CENTAUR_ecx 0x736c7561
+/* CYRIX:   "CyrixInstead" */
+#define signature_CYRIX_ebx 0x69727943
+#define signature_CYRIX_edx 0x736e4978
+#define signature_CYRIX_ecx 0x64616574
+/* INTEL:   "GenuineIntel" */
+#define signature_INTEL_ebx 0x756e6547
+#define signature_INTEL_edx 0x49656e69
+#define signature_INTEL_ecx 0x6c65746e
+/* TM1:     "TransmetaCPU" */
+#define signature_TM1_ebx 0x6e617254
+#define signature_TM1_edx 0x74656d73
+#define signature_TM1_ecx 0x55504361
+/* TM2:     "GenuineTMx86" */
+#define signature_TM2_ebx 0x756e6547
+#define signature_TM2_edx 0x54656e69
+#define signature_TM2_ecx 0x3638784d
+/* NSC:     "Geode by NSC" */
+#define signature_NSC_ebx 0x646f6547
+#define signature_NSC_edx 0x43534e20
+#define signature_NSC_ecx 0x79622065
+/* NEXGEN:  "NexGenDriven" */
+#define signature_NEXGEN_ebx 0x4778654e
+#define signature_NEXGEN_edx 0x72446e65
+#define signature_NEXGEN_ecx 0x6e657669
+/* RISE:    "RiseRiseRise" */
+#define signature_RISE_ebx 0x65736952
+#define signature_RISE_edx 0x65736952
+#define signature_RISE_ecx 0x65736952
+/* SIS:     "SiS SiS SiS " */
+#define signature_SIS_ebx 0x20536953
+#define signature_SIS_edx 0x20536953
+#define signature_SIS_ecx 0x20536953
+/* UMC:     "UMC UMC UMC " */
+#define signature_UMC_ebx 0x20434d55
+#define signature_UMC_edx 0x20434d55
+#define signature_UMC_ecx 0x20434d55
+/* VIA:     "VIA VIA VIA " */
+#define signature_VIA_ebx 0x20414956
+#define signature_VIA_edx 0x20414956
+#define signature_VIA_ecx 0x20414956
+/* VORTEX:  "Vortex86 SoC" */
+#define signature_VORTEX_ebx 0x74726f56
+#define signature_VORTEX_edx 0x36387865
+#define signature_VORTEX_ecx 0x436f5320
+
+/* Features in %ecx for leaf 1 */
+#define bit_SSE3        0x00000001
+#define bit_PCLMULQDQ   0x00000002
+#define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
+#define bit_DTES64      0x00000004
+#define bit_MONITOR     0x00000008
+#define bit_DSCPL       0x00000010
+#define bit_VMX         0x00000020
+#define bit_SMX         0x00000040
+#define bit_EIST        0x00000080
+#define bit_TM2         0x00000100
+#define bit_SSSE3       0x00000200
+#define bit_CNXTID      0x00000400
+#define bit_FMA         0x00001000
+#define bit_CMPXCHG16B  0x00002000
+#define bit_xTPR        0x00004000
+#define bit_PDCM        0x00008000
+#define bit_PCID        0x00020000
+#define bit_DCA         0x00040000
+#define bit_SSE41       0x00080000
+#define bit_SSE4_1      bit_SSE41       /* for gcc compat */
+#define bit_SSE42       0x00100000
+#define bit_SSE4_2      bit_SSE42       /* for gcc compat */
+#define bit_x2APIC      0x00200000
+#define bit_MOVBE       0x00400000
+#define bit_POPCNT      0x00800000
+#define bit_TSCDeadline 0x01000000
+#define bit_AESNI       0x02000000
+#define bit_AES         bit_AESNI       /* for gcc compat */
+#define bit_XSAVE       0x04000000
+#define bit_OSXSAVE     0x08000000
+#define bit_AVX         0x10000000
+#define bit_F16C        0x20000000
+#define bit_RDRND       0x40000000
+
+/* Features in %edx for leaf 1 */
+#define bit_FPU         0x00000001
+#define bit_VME         0x00000002
+#define bit_DE          0x00000004
+#define bit_PSE         0x00000008
+#define bit_TSC         0x00000010
+#define bit_MSR         0x00000020
+#define bit_PAE         0x00000040
+#define bit_MCE         0x00000080
+#define bit_CX8         0x00000100
+#define bit_CMPXCHG8B   bit_CX8         /* for gcc compat */
+#define bit_APIC        0x00000200
+#define bit_SEP         0x00000800
+#define bit_MTRR        0x00001000
+#define bit_PGE         0x00002000
+#define bit_MCA         0x00004000
+#define bit_CMOV        0x00008000
+#define bit_PAT         0x00010000
+#define bit_PSE36       0x00020000
+#define bit_PSN         0x00040000
+#define bit_CLFSH       0x00080000
+#define bit_DS          0x00200000
+#define bit_ACPI        0x00400000
+#define bit_MMX         0x00800000
+#define bit_FXSR        0x01000000
+#define bit_FXSAVE      bit_FXSR        /* for gcc compat */
+#define bit_SSE         0x02000000
+#define bit_SSE2        0x04000000
+#define bit_SS          0x08000000
+#define bit_HTT         0x10000000
+#define bit_TM          0x20000000
+#define bit_PBE         0x80000000
+
+/* Features in %ebx for leaf 7 sub-leaf 0 */
+#define bit_FSGSBASE    0x00000001
+#define bit_SGX         0x00000004
+#define bit_BMI         0x00000008
+#define bit_HLE         0x00000010
+#define bit_AVX2        0x00000020
+#define bit_SMEP        0x00000080
+#define bit_BMI2        0x00000100
+#define bit_ENH_MOVSB   0x00000200
+#define bit_RTM         0x00000800
+#define bit_MPX         0x00004000
+#define bit_AVX512F     0x00010000
+#define bit_AVX512DQ    0x00020000
+#define bit_RDSEED      0x00040000
+#define bit_ADX         0x00080000
+#define bit_AVX512IFMA  0x00200000
+#define bit_CLFLUSHOPT  0x00800000
+#define bit_CLWB        0x01000000
+#define bit_AVX512PF    0x04000000
+#define bit_AVX512ER    0x08000000
+#define bit_AVX512CD    0x10000000
+#define bit_SHA         0x20000000
+#define bit_AVX512BW    0x40000000
+#define bit_AVX512VL    0x80000000
+
+/* Features in %ecx for leaf 7 sub-leaf 0 */
+#define bit_PREFTCHWT1       0x00000001
+#define bit_AVX512VBMI       0x00000002
+#define bit_PKU              0x00000004
+#define bit_OSPKE            0x00000010
+#define bit_AVX512VBMI2      0x00000040
+#define bit_SHSTK            0x00000080
+#define bit_GFNI             0x00000100
+#define bit_VAES             0x00000200
+#define bit_VPCLMULQDQ       0x00000400
+#define bit_AVX512VNNI       0x00000800
+#define bit_AVX512BITALG     0x00001000
+#define bit_AVX512VPOPCNTDQ  0x00004000
+#define bit_RDPID            0x00400000
+
+/* Features in %edx for leaf 7 sub-leaf 0 */
+#define bit_AVX5124VNNIW  0x00000004
+#define bit_AVX5124FMAPS  0x00000008
+#define bit_IBT           0x00100000
+
+/* Features in %eax for leaf 13 sub-leaf 1 */
+#define bit_XSAVEOPT    0x00000001
+#define bit_XSAVEC      0x00000002
+#define bit_XSAVES      0x00000008
+
+/* Features in %ecx for leaf 0x80000001 */
+#define bit_LAHF_LM     0x00000001
+#define bit_ABM         0x00000020
+#define bit_LZCNT       bit_ABM        /* for gcc compat */
+#define bit_SSE4a       0x00000040
+#define bit_PRFCHW      0x00000100
+#define bit_XOP         0x00000800
+#define bit_LWP         0x00008000
+#define bit_FMA4        0x00010000
+#define bit_TBM         0x00200000
+#define bit_MWAITX      0x20000000
+
+/* Features in %edx for leaf 0x80000001 */
+#define bit_MMXEXT      0x00400000
+#define bit_LM          0x20000000
+#define bit_3DNOWP      0x40000000
+#define bit_3DNOW       0x80000000
+
+/* Features in %ebx for leaf 0x80000001 */
+#define bit_CLZERO      0x00000001
+
+
+#if __i386__
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__leaf))
+
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
+                  : "0"(__leaf), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__leaf))
+
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
+    __asm("  xchgq  %%rbx,%q1\n" \
+          "  cpuid\n" \
+          "  xchgq  %%rbx,%q1" \
+        : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
+        : "0"(__leaf), "2"(__count))
+#endif
+
+static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
+{
+    unsigned int __eax, __ebx, __ecx, __edx;
+#if __i386__
+    int __cpuid_supported;
+
+    __asm("  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   %%eax,%%ecx\n"
+          "  xorl   $0x00200000,%%eax\n"
+          "  pushl  %%eax\n"
+          "  popfl\n"
+          "  pushfl\n"
+          "  popl   %%eax\n"
+          "  movl   $0,%0\n"
+          "  cmpl   %%eax,%%ecx\n"
+          "  je     1f\n"
+          "  movl   $1,%0\n"
+          "1:"
+        : "=r" (__cpuid_supported) : : "eax", "ecx");
+    if (!__cpuid_supported)
+        return 0;
+#endif
+
+    __cpuid(__leaf, __eax, __ebx, __ecx, __edx);
+    if (__sig)
+        *__sig = __ebx;
+    return __eax;
+}
+
+static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_count (unsigned int __leaf,
+                                       unsigned int __subleaf,
+                                       unsigned int *__eax, unsigned int *__ebx,
+                                       unsigned int *__ecx, unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
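As a usage sketch (not part of the header), the helpers above can gate a runtime feature check; here AVX2 via leaf 7 sub-leaf 0, with cpu_has_avx2 being an illustrative name:

#include <cpuid.h>
#include <stdio.h>

/* Leaf 7, sub-leaf 0 reports AVX2 in %ebx; __get_cpuid_count returns 0 if
 * the CPU does not implement that leaf. */
static int cpu_has_avx2(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ebx & bit_AVX2) != 0;
}

int main(void) {
  printf("AVX2 supported: %s\n", cpu_has_avx2() ? "yes" : "no");
  return 0;
}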

+ 96 - 0
demo/include/cuda_wrappers/algorithm

@@ -0,0 +1,96 @@
+/*===---- algorithm - CUDA wrapper for <algorithm> --------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
+#define __CLANG_CUDA_WRAPPERS_ALGORITHM
+
+// This header defines __device__ overloads of std::min/max, but only if we're
+// <= C++11.  In C++14, these functions are constexpr, and so are implicitly
+// __host__ __device__.
+//
+// We don't support the initializer_list overloads because
+// initializer_list::begin() and end() are not __host__ __device__ functions.
+//
+// When compiling in C++14 mode, we could force std::min/max to have different
+// implementations for host and device, by declaring the device overloads
+// before the constexpr overloads appear.  We choose not to do this because
+//
+//  a) why write our own implementation when we can use one from the standard
+//     library? and
+//  b) libstdc++ is evil and declares min/max inside a header that is included
+//     *before* we include <algorithm>.  So we'd have to unconditionally
+//     declare our __device__ overloads of min/max, but that would pollute
+//     things for people who choose not to include <algorithm>.
+
+#include_next <algorithm>
+
+#if __cplusplus <= 201103L
+
+// We need to define these overloads in exactly the namespace our standard
+// library uses (including the right inline namespace), otherwise they won't be
+// picked up by other functions in the standard library (e.g. functions in
+// <complex>).  Thus the ugliness below.
+#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_BEGIN_NAMESPACE_STD
+#else
+namespace std {
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif
+#endif
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__a, __b) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+max(const __T &__a, const __T &__b) {
+  return __a < __b ? __b : __a;
+}
+
+template <class __T, class __Cmp>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b, __Cmp __cmp) {
+  return __cmp(__b, __a) ? __b : __a;
+}
+
+template <class __T>
+inline __device__ const __T &
+min(const __T &__a, const __T &__b) {
+  return __a < __b ? __a : __b;
+}
+
+#ifdef _LIBCPP_END_NAMESPACE_STD
+_LIBCPP_END_NAMESPACE_STD
+#else
+#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
+_GLIBCXX_END_NAMESPACE_VERSION
+#endif
+} // namespace std
+#endif
+
+#endif // __cplusplus <= 201103L
+#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
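A sketch of what this wrapper enables: device code calling std::min/std::max directly, assuming the file is compiled as CUDA in C++11 mode with clang; the kernel itself is illustrative:

#include <algorithm>

// std::min/std::max resolve to the __device__ overloads supplied above.
__global__ void clamp_kernel(const float *in, float *out, int n,
                             float lo, float hi) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = std::max(lo, std::min(hi, in[i]));
}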

+ 82 - 0
demo/include/cuda_wrappers/complex

@@ -0,0 +1,82 @@
+/*===---- complex - CUDA wrapper for <complex> ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
+#define __CLANG_CUDA_WRAPPERS_COMPLEX
+
+// Wrapper around <complex> that forces its functions to be __host__
+// __device__.
+
+// First, include host-only headers we think are likely to be included by
+// <complex>, so that the pragma below only applies to <complex> itself.
+#if __cplusplus >= 201103L
+#include <type_traits>
+#endif
+#include <stdexcept>
+#include <cmath>
+#include <sstream>
+
+// Next, include our <algorithm> wrapper, to ensure that device overloads of
+// std::min/max are available.
+#include <algorithm>
+
+#pragma clang force_cuda_host_device begin
+
+// When compiling for device, ask libstdc++ to use its own implementations of
+// complex functions, rather than calling builtins (which resolve to library
+// functions that don't exist when compiling CUDA device code).
+//
+// This is a little dicey, because it causes libstdc++ to define a different
+// set of overloads on host and device.
+//
+//   // Present only when compiling for host.
+//   __host__ __device__ complex<float> sin(const complex<float>& x) {
+//     return __builtin_csinf(x);
+//   }
+//
+//   // Present when compiling for host and for device.
+//   template <typename T>
+//   __host__ __device__ complex<T> sin(const complex<T>& x) {
+//     return complex<T>(sin(x.real()) * cosh(x.imag()),
+//                       cos(x.real()) * sinh(x.imag()));
+//   }
+//
+// This is safe because when compiling for device, all function calls in
+// __host__ code to sin() will still resolve to *something*, even if they don't
+// resolve to the same function as they resolve to when compiling for host.  We
+// don't care that they don't resolve to the right function because we won't
+// codegen this host code when compiling for device.
+
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
+#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#define _GLIBCXX_USE_C99_COMPLEX 0
+#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
+
+#include_next <complex>
+
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
+#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
+
+#pragma clang force_cuda_host_device end
+
+#endif // include guard
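A sketch of the effect, assuming compilation as CUDA with clang against libstdc++: ordinary std::complex arithmetic inside a kernel, with rotate_points being an illustrative name:

#include <complex>

// Multiplying by a unit complex number rotates each point; operator*= comes
// from the standard <complex>, made __host__ __device__ by the pragma above.
__global__ void rotate_points(std::complex<float> *pts, int n, float angle) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    pts[i] *= std::complex<float>(cosf(angle), sinf(angle));
}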

+ 96 - 0
demo/include/cuda_wrappers/new

@@ -0,0 +1,96 @@
+/*===---- new - CUDA wrapper for <new> --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_NEW
+#define __CLANG_CUDA_WRAPPERS_NEW
+
+#include_next <new>
+
+#pragma push_macro("CUDA_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define CUDA_NOEXCEPT noexcept
+#else
+#define CUDA_NOEXCEPT
+#endif
+
+// Device overrides for non-placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__ size) {
+  if (size == 0) {
+    size = 1;
+  }
+  return ::malloc(size);
+}
+__device__ inline void *operator new(__SIZE_TYPE__ size,
+                                     const std::nothrow_t &) CUDA_NOEXCEPT {
+  return ::operator new(size);
+}
+
+__device__ inline void *operator new[](__SIZE_TYPE__ size) {
+  return ::operator new(size);
+}
+__device__ inline void *operator new[](__SIZE_TYPE__ size,
+                                       const std::nothrow_t &) {
+  return ::operator new(size);
+}
+
+__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
+  if (ptr) {
+    ::free(ptr);
+  }
+}
+__device__ inline void operator delete(void *ptr,
+                                       const std::nothrow_t &) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+__device__ inline void operator delete[](void *ptr,
+                                         const std::nothrow_t &) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+// Sized delete, C++14 only.
+#if __cplusplus >= 201402L
+__device__ inline void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+__device__ inline void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+  ::operator delete(ptr);
+}
+#endif
+
+// Device overrides for placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
+  return __ptr;
+}
+__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
+__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+
+#pragma pop_macro("CUDA_NOEXCEPT")
+
+#endif // include guard
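A sketch (assuming clang CUDA compilation) of the overrides above in action; the kernel and buffer names are illustrative:

#include <new>

// Non-placement new/delete resolve to the ::malloc/::free based __device__
// overrides above; placement new resolves to the no-op form and simply
// constructs the object at the given address.
__global__ void new_demo(int *out) {
  int *boxed = new int(7);
  alignas(int) unsigned char slot[sizeof(int)];
  int *in_place = new (slot) int(35);
  *out = *boxed + *in_place;
  delete boxed;
}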

File diff is too large to display
+ 4951 - 0
demo/include/emmintrin.h


+ 124 - 0
demo/include/f16cintrin.h

@@ -0,0 +1,124 @@
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+
+/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+///    value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 16-bit half-precision float value.
+/// \returns The converted 32-bit float value.
+static __inline float __DEFAULT_FN_ATTRS
+_cvtsh_ss(unsigned short __a)
+{
+  __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
+  __v4sf r = __builtin_ia32_vcvtph2ps(v);
+  return r[0];
+}
+
+/// \brief Converts a 32-bit single-precision float value to a 16-bit
+///    half-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _cvtss_sh(float a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 32-bit single-precision float value to be converted to a 16-bit
+///    half-precision float value.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns The converted 16-bit half-precision float value.
+#define _cvtss_sh(a, imm) __extension__ ({ \
+  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                     (imm)))[0]); })
+
+/// \brief Converts a 128-bit vector containing 32-bit float values into a
+///    128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 128-bit vector containing 32-bit float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+///    values. The lower 64 bits are used to store the converted 16-bit
+///    half-precision floating-point values.
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values. The lower
+///    64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm_cvtph_ps(__m128i __a)
+{
+  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __F16CINTRIN_H */
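A host-side round-trip sketch using the scalar conversions above, assuming -mf16c; the immediate 0 (round to nearest) and the printed format are illustrative:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float x = 3.14159f;
  unsigned short h = _cvtss_sh(x, 0);   /* imm 0 selects round-to-nearest */
  float roundtrip = _cvtsh_ss(h);
  printf("%f -> 0x%04x -> %f\n", x, (unsigned)h, roundtrip);
  return 0;
}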

+ 160 - 0
demo/include/float.h

@@ -0,0 +1,160 @@
+/*===---- float.h - Characteristics of floating point types ----------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __FLOAT_H
+#define __FLOAT_H
+
+/* If we're on MinGW, fall back to the system's float.h, which might have
+ * additional definitions provided for Windows.
+ * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
+ *
+ * Also fall back on Darwin to allow additional definitions and
+ * implementation-defined values.
+ */
+#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+    __STDC_HOSTED__ && __has_include_next(<float.h>)
+
+/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
+ * of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
+ * extra indirection.
+ */
+#ifdef __APPLE__
+#define _FLOAT_H_
+#endif
+
+#  include_next <float.h>
+
+/* Undefine anything that we'll be redefining below. */
+#  undef FLT_EVAL_METHOD
+#  undef FLT_ROUNDS
+#  undef FLT_RADIX
+#  undef FLT_MANT_DIG
+#  undef DBL_MANT_DIG
+#  undef LDBL_MANT_DIG
+#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#    undef DECIMAL_DIG
+#  endif
+#  undef FLT_DIG
+#  undef DBL_DIG
+#  undef LDBL_DIG
+#  undef FLT_MIN_EXP
+#  undef DBL_MIN_EXP
+#  undef LDBL_MIN_EXP
+#  undef FLT_MIN_10_EXP
+#  undef DBL_MIN_10_EXP
+#  undef LDBL_MIN_10_EXP
+#  undef FLT_MAX_EXP
+#  undef DBL_MAX_EXP
+#  undef LDBL_MAX_EXP
+#  undef FLT_MAX_10_EXP
+#  undef DBL_MAX_10_EXP
+#  undef LDBL_MAX_10_EXP
+#  undef FLT_MAX
+#  undef DBL_MAX
+#  undef LDBL_MAX
+#  undef FLT_EPSILON
+#  undef DBL_EPSILON
+#  undef LDBL_EPSILON
+#  undef FLT_MIN
+#  undef DBL_MIN
+#  undef LDBL_MIN
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#    undef FLT_TRUE_MIN
+#    undef DBL_TRUE_MIN
+#    undef LDBL_TRUE_MIN
+#    undef FLT_DECIMAL_DIG
+#    undef DBL_DECIMAL_DIG
+#    undef LDBL_DECIMAL_DIG
+#  endif
+#endif
+
+/* Characteristics of floating point types, C99 5.2.4.2.2 */
+
+#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#define FLT_ROUNDS (__builtin_flt_rounds())
+#define FLT_RADIX __FLT_RADIX__
+
+#define FLT_MANT_DIG __FLT_MANT_DIG__
+#define DBL_MANT_DIG __DBL_MANT_DIG__
+#define LDBL_MANT_DIG __LDBL_MANT_DIG__
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+#  define DECIMAL_DIG __DECIMAL_DIG__
+#endif
+
+#define FLT_DIG __FLT_DIG__
+#define DBL_DIG __DBL_DIG__
+#define LDBL_DIG __LDBL_DIG__
+
+#define FLT_MIN_EXP __FLT_MIN_EXP__
+#define DBL_MIN_EXP __DBL_MIN_EXP__
+#define LDBL_MIN_EXP __LDBL_MIN_EXP__
+
+#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
+#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
+#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__
+
+#define FLT_MAX_EXP __FLT_MAX_EXP__
+#define DBL_MAX_EXP __DBL_MAX_EXP__
+#define LDBL_MAX_EXP __LDBL_MAX_EXP__
+
+#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
+#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
+#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__
+
+#define FLT_MAX __FLT_MAX__
+#define DBL_MAX __DBL_MAX__
+#define LDBL_MAX __LDBL_MAX__
+
+#define FLT_EPSILON __FLT_EPSILON__
+#define DBL_EPSILON __DBL_EPSILON__
+#define LDBL_EPSILON __LDBL_EPSILON__
+
+#define FLT_MIN __FLT_MIN__
+#define DBL_MIN __DBL_MIN__
+#define LDBL_MIN __LDBL_MIN__
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+#  define FLT_TRUE_MIN __FLT_DENORM_MIN__
+#  define DBL_TRUE_MIN __DBL_DENORM_MIN__
+#  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#  define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
+#  define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
+#  define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
+#endif
+
+#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
+#  define FLT16_MANT_DIG    __FLT16_MANT_DIG__
+#  define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__
+#  define FLT16_DIG         __FLT16_DIG__
+#  define FLT16_MIN_EXP     __FLT16_MIN_EXP__
+#  define FLT16_MIN_10_EXP  __FLT16_MIN_10_EXP__
+#  define FLT16_MAX_EXP     __FLT16_MAX_EXP__
+#  define FLT16_MAX_10_EXP  __FLT16_MAX_10_EXP__
+#  define FLT16_MAX         __FLT16_MAX__
+#  define FLT16_EPSILON     __FLT16_EPSILON__
+#  define FLT16_MIN         __FLT16_MIN__
+#  define FLT16_TRUE_MIN    __FLT16_TRUE_MIN__
+#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */
+
+#endif /* __FLOAT_H */
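A small sketch that simply prints a few of the macros defined above for whatever host toolchain compiles it; the selection of macros is illustrative:

#include <float.h>
#include <stdio.h>

int main(void) {
  printf("FLT_RADIX     = %d\n", FLT_RADIX);
  printf("FLT_MANT_DIG  = %d\n", FLT_MANT_DIG);
  printf("FLT_EPSILON   = %g\n", FLT_EPSILON);
  printf("DBL_MAX       = %g\n", DBL_MAX);
  printf("FLT_TRUE_MIN  = %g\n", FLT_TRUE_MIN); /* C11 / non-strict-ANSI only */
  return 0;
}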

+ 230 - 0
demo/include/fma4intrin.h

@@ -0,0 +1,230 @@
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMA4INTRIN_H */

+ 228 - 0
demo/include/fmaintrin.h

@@ -0,0 +1,228 @@
+/*===---- fmaintrin.h - FMA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __FMAINTRIN_H */
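For orientation (not part of the header), the naming scheme above maps to fused operations as follows, assuming -mfma; the wrapper names axpy4 and neg_mul_sub are illustrative:

#include <immintrin.h>

/* _mm_fmadd_ps computes a*x + y with a single rounding step per lane. */
static __m128 axpy4(__m128 a, __m128 x, __m128 y) {
  return _mm_fmadd_ps(a, x, y);
}

/* _mm_fnmsub_ps computes -(a*b) - c, matching the fnmsub naming above. */
static __m128 neg_mul_sub(__m128 a, __m128 b, __m128 c) {
  return _mm_fnmsub_ps(a, b, c);
}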

+ 105 - 0
demo/include/fxsrintrin.h

@@ -0,0 +1,105 @@
+/*===---- fxsrintrin.h - FXSR intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))
+
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p)
+{
+  return __builtin_ia32_fxsave(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p)
+{
+  return __builtin_ia32_fxrstor(__p);
+}
+
+#ifdef __x86_64__
+/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+///    memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p)
+{
+  return __builtin_ia32_fxsave64(__p);
+}
+
+/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+///    memory region pointed to by the input parameter \a __p. The contents of
+///    this memory region should have been written to by a previous \c _fxsave
+///    or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+///    A pointer to a 512-byte memory region. The beginning of this memory
+///    region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p)
+{
+  return __builtin_ia32_fxrstor64(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
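A sketch of the save/restore pairing described in the comments above, assuming -mfxsr and a C++11 (or later) compiler for alignas; the helper name is illustrative:

#include <immintrin.h>

/* The FXSAVE area is 512 bytes and must be 16-byte aligned. */
static void with_saved_fpu_state(void (*body)(void)) {
  alignas(16) unsigned char area[512];
  _fxsave(area);     /* capture XMM, MMX, MXCSR and x87 state */
  body();            /* run code that may clobber that state */
  _fxrstor(area);    /* restore the snapshot */
}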

+ 202 - 0
demo/include/gfniintrin.h

@@ -0,0 +1,202 @@
+/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __GFNIINTRIN_H
+#define __GFNIINTRIN_H
+
+
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                   \
+  (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A),          \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                          \
+        (__v16qi)(__m128i)(S)); })
+
+
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({          \
+  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),       \
+        U, A, B, I); })
+
+
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                \
+  (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A),          \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({     \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v32qi)(__m256i)(S)); })
+
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({       \
+  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+        U, A, B, I); })
+
+
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                \
+  (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A),          \
+                                                  (__v64qi)(__m512i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({     \
+   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
+        (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I),                       \
+        (__v64qi)(__m512i)(S)); })
+
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({       \
+  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(),    \
+        U, A, B, I); })
+
+#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                      \
+  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),             \
+                                                  (__v16qi)(__m128i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({           \
+  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
+        (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                             \
+        (__v16qi)(__m128i)(S)); })
+
+
+#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({             \
+  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(),          \
+        U, A, B, I); })
+
+
+#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                   \
+  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),             \
+                                                  (__v32qi)(__m256i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
+        (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v32qi)(__m256i)(S)); })
+
+#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({          \
+  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),    \
+        U, A, B, I); })
+
+
+#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                   \
+  (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A),             \
+                                                  (__v64qi)(__m512i)(B),          \
+                                                  (char)(I)); })
+
+#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
+        (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I),                          \
+        (__v64qi)(__m512i)(S)); })
+
+#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({          \
+  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(),       \
+        U, A, B, I); })
+
+/* Default attributes for simple form (no masking). */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
+
+/* Default attributes for VLX forms. */
+#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+              (__v16qi) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
+_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_selectb_128(__U,
+              (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
+              (__v16qi) __S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
+_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
+              __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
+              (__v32qi) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
+_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_selectb_256(__U,
+              (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
+              (__v32qi) __S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
+_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
+              __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
+              (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_selectb_512(__U,
+              (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
+              (__v64qi) __S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
+              __U, __A, __B);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_F
+#undef __DEFAULT_FN_ATTRS_VL
+
+#endif // __GFNIINTRIN_H
+
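For reference, a minimal usage sketch of the unmasked GFNI intrinsics added above (not part of the header itself): it assumes a CPU with GFNI, compilation with something like -mgfni, and uses purely illustrative values.

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      /* 16 copies of the classic AES-field example bytes 0x57 and 0x83. */
      __m128i a = _mm_set1_epi8((char)0x57);
      __m128i b = _mm_set1_epi8((char)0x83);

      /* Byte-wise multiply in GF(2^8), reduced by the AES polynomial. */
      __m128i prod = _mm_gf2p8mul_epi8(a, b);

      unsigned char out[16];
      _mm_storeu_si128((__m128i *)out, prod);
      printf("GF(2^8) product of 0x57 and 0x83: 0x%02x\n", out[0]); /* 0xc1 */
      return 0;
    }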

+ 226 - 0
demo/include/htmintrin.h

@@ -0,0 +1,226 @@
+/*===---- htmintrin.h - Standard header for PowerPC HTM ---------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMINTRIN_H
+#define __HTMINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#ifdef __powerpc__
+
+#include <stdint.h>
+
+typedef uint64_t texasr_t;
+typedef uint32_t texasru_t;
+typedef uint32_t texasrl_t;
+typedef uintptr_t tfiar_t;
+typedef uintptr_t tfhar_t;
+
+#define _HTM_STATE(CR0) ((CR0 >> 1) & 0x3)
+#define _HTM_NONTRANSACTIONAL 0x0
+#define _HTM_SUSPENDED        0x1
+#define _HTM_TRANSACTIONAL    0x2
+
+#define _TEXASR_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (63-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+
+#define _TEXASR_FAILURE_CODE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 8)
+#define _TEXASRU_FAILURE_CODE(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 8)
+
+#define _TEXASR_FAILURE_PERSISTENT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 7, 1)
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _TEXASR_DISALLOWED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 8, 1)
+#define _TEXASRU_DISALLOWED(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 8, 1)
+
+#define _TEXASR_NESTING_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 9, 1)
+#define _TEXASRU_NESTING_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 9, 1)
+
+#define _TEXASR_FOOTPRINT_OVERFLOW(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 10, 1)
+#define _TEXASRU_FOOTPRINT_OVERFLOW(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 10, 1)
+
+#define _TEXASR_SELF_INDUCED_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 11, 1)
+#define _TEXASRU_SELF_INDUCED_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 11, 1)
+
+#define _TEXASR_NON_TRANSACTIONAL_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 12, 1)
+#define _TEXASRU_NON_TRANSACTIONAL_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 12, 1)
+
+#define _TEXASR_TRANSACTION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 13, 1)
+#define _TEXASRU_TRANSACTION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 13, 1)
+
+#define _TEXASR_TRANSLATION_INVALIDATION_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 14, 1)
+#define _TEXASRU_TRANSLATION_INVALIDATION_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 14, 1)
+
+#define _TEXASR_IMPLEMENTAION_SPECIFIC(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 15, 1)
+#define _TEXASRU_IMPLEMENTAION_SPECIFIC(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 15, 1)
+
+#define _TEXASR_INSTRUCTION_FETCH_CONFLICT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 16, 1)
+#define _TEXASRU_INSTRUCTION_FETCH_CONFLICT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 16, 1)
+
+#define _TEXASR_ABORT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 31, 1)
+#define _TEXASRU_ABORT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 31, 1)
+
+
+#define _TEXASR_SUSPENDED(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 32, 1)
+
+#define _TEXASR_PRIVILEGE(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 35, 2)
+
+#define _TEXASR_FAILURE_SUMMARY(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 36, 1)
+
+#define _TEXASR_TFIAR_EXACT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 37, 1)
+
+#define _TEXASR_ROT(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 38, 1)
+
+#define _TEXASR_TRANSACTION_LEVEL(TEXASR) \
+  _TEXASR_EXTRACT_BITS(TEXASR, 63, 12)
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+/* Condition codes generated by tbegin  */
+#define _HTM_TBEGIN_STARTED       0
+#define _HTM_TBEGIN_INDETERMINATE 1
+#define _HTM_TBEGIN_TRANSIENT     2
+#define _HTM_TBEGIN_PERSISTENT    3
+
+/* The abort codes below this threshold are reserved for machine use.  */
+#define _HTM_FIRST_USER_ABORT_CODE 256
+
+/* The transaction diagnostic block as it is defined in the Principles
+   of Operation, chapter 5-91.  */
+
+struct __htm_tdb {
+  unsigned char format;                /*   0 */
+  unsigned char flags;
+  unsigned char reserved1[4];
+  unsigned short nesting_depth;
+  unsigned long long abort_code;       /*   8 */
+  unsigned long long conflict_token;   /*  16 */
+  unsigned long long atia;             /*  24 */
+  unsigned char eaid;                  /*  32 */
+  unsigned char dxc;
+  unsigned char reserved2[2];
+  unsigned int program_int_id;
+  unsigned long long exception_id;     /*  40 */
+  unsigned long long bea;              /*  48 */
+  unsigned char reserved3[72];         /*  56 */
+  unsigned long long gprs[16];         /* 128 */
+} __attribute__((__packed__, __aligned__ (8)));
+
+
+/* Helper intrinsics to retry tbegin in case of transient failure.  */
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_null(retry) : \
+   __builtin_tbegin_retry_tdb(tdb, retry))
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_null (int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(0)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+static __inline int __attribute__((__always_inline__, __nodebug__))
+__builtin_tbegin_retry_nofloat_tdb (void *__tdb, int __retry)
+{
+  int cc, i = 0;
+
+  while ((cc = __builtin_tbegin_nofloat(__tdb)) == _HTM_TBEGIN_TRANSIENT
+         && i++ < __retry)
+    __builtin_tx_assist(i);
+
+  return cc;
+}
+
+#define __builtin_tbegin_retry_nofloat(tdb, retry) \
+  (__builtin_constant_p(tdb == 0) && tdb == 0 ? \
+   __builtin_tbegin_retry_nofloat_null(retry) : \
+   __builtin_tbegin_retry_nofloat_tdb(tdb, retry))
+
+#endif /* __s390__ */
+
+#endif /* __HTMINTRIN_H */
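A minimal sketch of how the s390 retry helpers above might be used (illustrative names; requires building with -mhtm on a machine with transactional-execution support):

    #include <htmintrin.h>

    static long counter;

    void demo_increment(void)   /* demo_increment is a made-up name */
    {
      /* Retry up to 5 times on transient failures before giving up. */
      if (__builtin_tbegin_retry(0, 5) == _HTM_TBEGIN_STARTED) {
        counter++;                            /* transactional body */
        __builtin_tend();
      } else {
        __sync_fetch_and_add(&counter, 1);    /* non-transactional fallback */
      }
    }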

+ 359 - 0
demo/include/htmxlintrin.h

@@ -0,0 +1,359 @@
+/*===---- htmxlintrin.h - XL compiler HTM execution intrinsics-------------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __HTMXLINTRIN_H
+#define __HTMXLINTRIN_H
+
+#ifndef __HTM__
+#error "HTM instruction set not enabled"
+#endif
+
+#include <htmintrin.h>
+
+#ifdef __powerpc__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
+#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
+
+typedef char TM_buff_type[16];
+
+/* This macro can be used to determine whether a transaction was successfully
+   started from the __TM_begin() and __TM_simple_begin() intrinsic functions
+   below.  */
+#define _HTM_TBEGIN_STARTED     1
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_simple_begin (void)
+{
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_begin (void* const __TM_buff)
+{
+  *_TEXASRL_PTR (__TM_buff) = 0;
+  if (__builtin_expect (__builtin_tbegin (0), 1))
+    return _HTM_TBEGIN_STARTED;
+#ifdef __powerpc64__
+  *_TEXASR_PTR (__TM_buff) = __builtin_get_texasr ();
+#else
+  *_TEXASRU_PTR (__TM_buff) = __builtin_get_texasru ();
+  *_TEXASRL_PTR (__TM_buff) = __builtin_get_texasr ();
+#endif
+  *_TFIAR_PTR (__TM_buff) = __builtin_get_tfiar ();
+  return 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_end (void)
+{
+  if (__builtin_expect (__builtin_tend (0), 1))
+    return 1;
+  return 0;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_abort (void)
+{
+  __builtin_tabort (0);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_named_abort (unsigned char const __code)
+{
+  __builtin_tabort (__code);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_resume (void)
+{
+  __builtin_tresume ();
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_suspend (void)
+{
+  __builtin_tsuspend ();
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_user_abort (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_named_user_abort (void* const __TM_buff, unsigned char *__code)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+
+  *__code = _TEXASRU_FAILURE_CODE (texasru);
+  return _TEXASRU_ABORT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_illegal (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_DISALLOWED (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_footprint_exceeded (void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FOOTPRINT_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_nesting_depth (void* const __TM_buff)
+{
+  texasrl_t texasrl;
+
+  if (_HTM_STATE (__builtin_ttest ()) == _HTM_NONTRANSACTIONAL)
+    {
+      texasrl = *_TEXASRL_PTR (__TM_buff);
+      if (!_TEXASR_FAILURE_SUMMARY (texasrl))
+        texasrl = 0;
+    }
+  else
+    texasrl = (texasrl_t) __builtin_get_texasr ();
+
+  return _TEXASR_TRANSACTION_LEVEL (texasrl);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_nested_too_deep(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_NESTING_OVERFLOW (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_conflict(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  /* Return TEXASR bits 11 (Self-Induced Conflict) through
+     14 (Translation Invalidation Conflict).  */
+  return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_is_failure_persistent(void* const __TM_buff)
+{
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
+  return _TEXASRU_FAILURE_PERSISTENT (texasru);
+}
+
+extern __inline long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_address(void* const __TM_buff)
+{
+  return *_TFIAR_PTR (__TM_buff);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+__TM_failure_code(void* const __TM_buff)
+{
+  return *_TEXASR_PTR (__TM_buff);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __powerpc__ */
+
+#ifdef __s390__
+
+#include <stdint.h>
+
+/* These intrinsics are being made available for compatibility with
+   the IBM XL compiler.  For documentation please see the "z/OS XL
+   C/C++ Programming Guide", publicly available on the web.  */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_simple_begin ()
+{
+  return __builtin_tbegin_nofloat (0);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_begin (void* const __tdb)
+{
+  return __builtin_tbegin_nofloat (__tdb);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_end ()
+{
+  return __builtin_tend ();
+}
+
+static __inline void __attribute__((__always_inline__))
+__TM_abort ()
+{
+  return __builtin_tabort (_HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_named_abort (unsigned char const __code)
+{
+  return __builtin_tabort ((int)_HTM_FIRST_USER_ABORT_CODE + __code);
+}
+
+static __inline void __attribute__((__always_inline__, __nodebug__))
+__TM_non_transactional_store (void* const __addr, long long const __value)
+{
+  __builtin_non_tx_store ((uint64_t*)__addr, (uint64_t)__value);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_nesting_depth (void* const __tdb_ptr)
+{
+  int depth = __builtin_tx_nesting_depth ();
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (depth != 0)
+    return depth;
+
+  if (tdb->format != 1)
+    return 0;
+  return tdb->nesting_depth;
+}
+
+/* Transaction failure diagnostics */
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_user_abort (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  return !!(tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE);
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_named_user_abort (void* const __tdb_ptr, unsigned char* __code)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  if (tdb->format != 1)
+    return 0;
+
+  if (tdb->abort_code >= _HTM_FIRST_USER_ABORT_CODE)
+    {
+      *__code = tdb->abort_code - _HTM_FIRST_USER_ABORT_CODE;
+      return 1;
+    }
+  return 0;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_illegal (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 4 /* unfiltered program interruption */
+	      || tdb->abort_code == 11 /* restricted instruction */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_footprint_exceeded (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 7 /* fetch overflow */
+	      || tdb->abort_code == 8 /* store overflow */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_nested_too_deep (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->format == 1 && tdb->abort_code == 13; /* depth exceeded */
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_conflict (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return (tdb->format == 1
+	  && (tdb->abort_code == 9 /* fetch conflict */
+	      || tdb->abort_code == 10 /* store conflict */));
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_is_failure_persistent (long const __result)
+{
+  return __result == _HTM_TBEGIN_PERSISTENT;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_address (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+  return tdb->atia;
+}
+
+static __inline long __attribute__((__always_inline__, __nodebug__))
+__TM_failure_code (void* const __tdb_ptr)
+{
+  struct __htm_tdb *tdb = (struct __htm_tdb*)__tdb_ptr;
+
+  return tdb->abort_code;
+}
+
+#endif /* __s390__ */
+
+#endif /* __HTMXLINTRIN_H  */
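A minimal PowerPC-side sketch of the XL-style interface above (illustrative function name; requires -mhtm; on failure, the TEXASR state saved by __TM_begin decides whether a retry is worthwhile):

    #include <htmxlintrin.h>

    long demo_transactional_add(long *p, long v)   /* made-up helper */
    {
      TM_buff_type tm_buf;

      if (__TM_begin(&tm_buf) == _HTM_TBEGIN_STARTED) {
        *p += v;                     /* transactional body */
        __TM_end();
        return 0;
      }

      /* Persistent failures are not worth retrying. */
      if (__TM_is_failure_persistent(&tm_buf))
        return -1;
      return 1;                      /* transient: caller may retry */
    }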

+ 73 - 0
demo/include/ia32intrin.h

@@ -0,0 +1,73 @@
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned long long __f)
+{
+  __builtin_ia32_writeeflags_u64(__f);
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
+__readeflags(void)
+{
+  return __builtin_ia32_readeflags_u32();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__writeeflags(unsigned int __f)
+{
+  __builtin_ia32_writeeflags_u32(__f);
+}
+#endif /* !__x86_64__ */
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdpmc(int __A) {
+  return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__))
+__rdtscp(unsigned int *__A) {
+  return __builtin_ia32_rdtscp(__A);
+}
+
+#define _rdtsc() __rdtsc()
+
+#define _rdpmc(A) __rdpmc(A)
+
+#endif /* __IA32INTRIN_H */
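A small sketch of the __rdtscp wrapper declared above (include <x86intrin.h> rather than this header directly; assumes the CPU supports RDTSCP, and the loop is purely illustrative):

    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
      unsigned int aux;
      unsigned long long start = __rdtscp(&aux);

      volatile unsigned long long sink = 0;
      for (int i = 0; i < 1000; ++i)
        sink += (unsigned long long)i;        /* work being timed */

      unsigned long long stop = __rdtscp(&aux);
      printf("elapsed TSC ticks (approx): %llu\n", stop - start);
      return 0;
    }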

+ 374 - 0
demo/include/immintrin.h

@@ -0,0 +1,374 @@
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MMX__)
+#include <mmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSE3__)
+#include <pmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__SSE4_2__) || defined(__SSE4_1__))
+#include <smmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AES__) || defined(__PCLMUL__))
+#include <wmmintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLFLUSHOPT__)
+#include <clflushoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__)
+#include <clwbintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
+#include <avxintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
+#include <avx2intrin.h>
+
+/* The 256-bit versions of functions in f16cintrin.h.
+   Intel documents these as being in immintrin.h, and
+   they depend on typedefs from avxintrin.h. */
+
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float values to be
+///    converted to 16-bit half-precision float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+///    float values.
+#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
+
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values to be
+///    converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
+static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+#endif /* __AVX2__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
+#include <vpclmulqdqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+#include <bmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
+#include <fmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512F__)
+#include <avx512fintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VL__)
+#include <avx512vlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BW__)
+#include <avx512bwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__)
+#include <avx512bitalgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__)
+#include <avx512cdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
+#include <avx512vpopcntdqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
+#include <avx512vpopcntdqvlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__)
+#include <avx512vnniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512VNNI__))
+#include <avx512vlvnniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
+#include <avx512dqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BITALG__))
+#include <avx512vlbitalgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512BW__))
+#include <avx512vlbwintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512CD__))
+#include <avx512vlcdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VL__) && defined(__AVX512DQ__))
+#include <avx512vldqintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512ER__)
+#include <avx512erintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512IFMA__)
+#include <avx512ifmaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512IFMA__) && defined(__AVX512VL__))
+#include <avx512ifmavlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI__)
+#include <avx512vbmiintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI__) && defined(__AVX512VL__))
+#include <avx512vbmivlintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__)
+#include <avx512vbmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+    (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
+#include <avx512vlvbmi2intrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__)
+#include <avx512pfintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PKU__)
+#include <pkuintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__)
+#include <vaesintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__)
+#include <gfniintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdrand32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+/* __bit_scan_forward */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_forward(int __A) {
+  return __builtin_ctz(__A);
+}
+
+/* __bit_scan_reverse */
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+_bit_scan_reverse(int __A) {
+  return 31 - __builtin_clz(__A);
+}
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+  return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+  return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+  return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+  return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+  return __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+  return __builtin_ia32_wrgsbase64(__V);
+}
+
+#endif
+#endif /* __FSGSBASE__ */
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__)
+#include <rtmintrin.h>
+#include <xtestintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHA__)
+#include <shaintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FXSR__)
+#include <fxsrintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVE__)
+#include <xsaveintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEOPT__)
+#include <xsaveoptintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVEC__)
+#include <xsavecintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__XSAVES__)
+#include <xsavesintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__)
+#include <cetintrin.h>
+#endif
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are also available at all times. */
+#include <adxintrin.h>
+
+#endif /* __IMMINTRIN_H */
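A short sketch of the F16C conversions documented above (assumes compilation with -mavx2 -mf16c, since the 256-bit forms sit under the AVX2 block in this header; values are illustrative):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      __m256 v = _mm256_set1_ps(1.5f);

      /* 8 floats -> 8 half-precision values; immediate 0 selects round-to-nearest. */
      __m128i h = _mm256_cvtps_ph(v, 0);

      /* ...and back to 8 single-precision floats. */
      __m256 back = _mm256_cvtph_ps(h);

      float out[8];
      _mm256_storeu_ps(out, back);
      printf("round-tripped value: %f\n", out[0]);
      return 0;
    }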

+ 969 - 0
demo/include/intrin.h

@@ -0,0 +1,969 @@
+/* ===-------- intrin.h ---------------------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the Windows platform. */
+#ifndef _MSC_VER
+#include_next <intrin.h>
+#else
+
+#ifndef __INTRIN_H
+#define __INTRIN_H
+
+/* First include the standard intrinsics. */
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#if defined(__arm__)
+#include <armintr.h>
+#endif
+
+#if defined(_M_ARM64)
+#include <arm64intr.h>
+#endif
+
+/* For the definition of jmp_buf. */
+#if __STDC_HOSTED__
+#include <setjmp.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__MMX__)
+/* And the random ones that aren't in those files. */
+__m64 _m_from_float(float);
+float _m_to_float(__m64);
+#endif
+
+/* Other assorted instruction intrinsics. */
+void __addfsbyte(unsigned long, unsigned char);
+void __addfsdword(unsigned long, unsigned long);
+void __addfsword(unsigned long, unsigned short);
+void __code_seg(const char *);
+static __inline__
+void __cpuid(int[4], int);
+static __inline__
+void __cpuidex(int[4], int, int);
+static __inline__
+__int64 __emul(int, int);
+static __inline__
+unsigned __int64 __emulu(unsigned int, unsigned int);
+unsigned int __getcallerseflags(void);
+static __inline__
+void __halt(void);
+unsigned char __inbyte(unsigned short);
+void __inbytestring(unsigned short, unsigned char *, unsigned long);
+void __incfsbyte(unsigned long);
+void __incfsdword(unsigned long);
+void __incfsword(unsigned long);
+unsigned long __indword(unsigned short);
+void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __invlpg(void *);
+unsigned short __inword(unsigned short);
+void __inwordstring(unsigned short, unsigned short *, unsigned long);
+void __lidt(void *);
+unsigned __int64 __ll_lshift(unsigned __int64, int);
+__int64 __ll_rshift(__int64, int);
+unsigned int __lzcnt(unsigned int);
+unsigned short __lzcnt16(unsigned short);
+static __inline__
+void __movsb(unsigned char *, unsigned char const *, size_t);
+static __inline__
+void __movsd(unsigned long *, unsigned long const *, size_t);
+static __inline__
+void __movsw(unsigned short *, unsigned short const *, size_t);
+static __inline__
+void __nop(void);
+void __nvreg_restore_fence(void);
+void __nvreg_save_fence(void);
+void __outbyte(unsigned short, unsigned char);
+void __outbytestring(unsigned short, unsigned char *, unsigned long);
+void __outdword(unsigned short, unsigned long);
+void __outdwordstring(unsigned short, unsigned long *, unsigned long);
+void __outword(unsigned short, unsigned short);
+void __outwordstring(unsigned short, unsigned short *, unsigned long);
+unsigned long __readcr0(void);
+unsigned long __readcr2(void);
+static __inline__
+unsigned long __readcr3(void);
+unsigned long __readcr4(void);
+unsigned long __readcr8(void);
+unsigned int __readdr(unsigned int);
+#ifdef __i386__
+static __inline__
+unsigned char __readfsbyte(unsigned long);
+static __inline__
+unsigned __int64 __readfsqword(unsigned long);
+static __inline__
+unsigned short __readfsword(unsigned long);
+#endif
+static __inline__
+unsigned __int64 __readmsr(unsigned long);
+unsigned __int64 __readpmc(unsigned long);
+unsigned long __segmentlimit(unsigned long);
+void __sidt(void *);
+static __inline__
+void __stosb(unsigned char *, unsigned char, size_t);
+static __inline__
+void __stosd(unsigned long *, unsigned long, size_t);
+static __inline__
+void __stosw(unsigned short *, unsigned short, size_t);
+void __svm_clgi(void);
+void __svm_invlpga(void *, int);
+void __svm_skinit(int);
+void __svm_stgi(void);
+void __svm_vmload(size_t);
+void __svm_vmrun(size_t);
+void __svm_vmsave(size_t);
+unsigned __int64 __ull_rshift(unsigned __int64, int);
+void __vmx_off(void);
+void __vmx_vmptrst(unsigned __int64 *);
+void __wbinvd(void);
+void __writecr0(unsigned int);
+static __inline__
+void __writecr3(unsigned int);
+void __writecr4(unsigned int);
+void __writecr8(unsigned int);
+void __writedr(unsigned int, unsigned int);
+void __writefsbyte(unsigned long, unsigned char);
+void __writefsdword(unsigned long, unsigned long);
+void __writefsqword(unsigned long, unsigned __int64);
+void __writefsword(unsigned long, unsigned short);
+void __writemsr(unsigned long, unsigned __int64);
+static __inline__
+void *_AddressOfReturnAddress(void);
+static __inline__
+unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
+static __inline__
+unsigned char _bittest(long const *, long);
+static __inline__
+unsigned char _bittestandcomplement(long *, long);
+static __inline__
+unsigned char _bittestandreset(long *, long);
+static __inline__
+unsigned char _bittestandset(long *, long);
+void __cdecl _disable(void);
+void __cdecl _enable(void);
+long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
+unsigned char _interlockedbittestandreset(long volatile *, long);
+unsigned char _interlockedbittestandset(long volatile *, long);
+long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
+long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
+                                                    void *);
+void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
+                                                    void *);
+long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
+long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
+__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
+__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
+void __cdecl _invpcid(unsigned int, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadBarrier(void);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_ReadWriteBarrier(void);
+unsigned int _rorx_u32(unsigned int, const unsigned int);
+int _sarx_i32(int, unsigned int);
+#if __STDC_HOSTED__
+int __cdecl _setjmp(jmp_buf);
+#endif
+unsigned int _shlx_u32(unsigned int, unsigned int);
+unsigned int _shrx_u32(unsigned int, unsigned int);
+void _Store_HLERelease(long volatile *, long);
+void _Store64_HLERelease(__int64 volatile *, __int64);
+void _StorePointer_HLERelease(void *volatile *, void *);
+static __inline__ void
+__attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
+_WriteBarrier(void);
+unsigned __int32 xbegin(void);
+void _xend(void);
+static __inline__
+#define _XCR_XFEATURE_ENABLED_MASK 0
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+
+/* These additional intrinsics are turned on in x64/amd64/x86_64 mode. */
+#ifdef __x86_64__
+void __addgsbyte(unsigned long, unsigned char);
+void __addgsdword(unsigned long, unsigned long);
+void __addgsqword(unsigned long, unsigned __int64);
+void __addgsword(unsigned long, unsigned short);
+static __inline__
+void __faststorefence(void);
+void __incgsbyte(unsigned long);
+void __incgsdword(unsigned long);
+void __incgsqword(unsigned long);
+void __incgsword(unsigned long);
+unsigned __int64 __lzcnt64(unsigned __int64);
+static __inline__
+void __movsq(unsigned long long *, unsigned long long const *, size_t);
+static __inline__
+unsigned char __readgsbyte(unsigned long);
+static __inline__
+unsigned long __readgsdword(unsigned long);
+static __inline__
+unsigned __int64 __readgsqword(unsigned long);
+unsigned short __readgsword(unsigned long);
+unsigned __int64 __shiftleft128(unsigned __int64 _LowPart,
+                                unsigned __int64 _HighPart,
+                                unsigned char _Shift);
+unsigned __int64 __shiftright128(unsigned __int64 _LowPart,
+                                 unsigned __int64 _HighPart,
+                                 unsigned char _Shift);
+static __inline__
+void __stosq(unsigned __int64 *, unsigned __int64, size_t);
+unsigned char __vmx_on(unsigned __int64 *);
+unsigned char __vmx_vmclear(unsigned __int64 *);
+unsigned char __vmx_vmlaunch(void);
+unsigned char __vmx_vmptrld(unsigned __int64 *);
+unsigned char __vmx_vmread(size_t, size_t *);
+unsigned char __vmx_vmresume(void);
+unsigned char __vmx_vmwrite(size_t, size_t);
+void __writegsbyte(unsigned long, unsigned char);
+void __writegsdword(unsigned long, unsigned long);
+void __writegsqword(unsigned long, unsigned __int64);
+void __writegsword(unsigned long, unsigned short);
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _bittest64(__int64 const *, __int64);
+static __inline__
+unsigned char _bittestandcomplement64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandreset64(__int64 *, __int64);
+static __inline__
+unsigned char _bittestandset64(__int64 *, __int64);
+long _InterlockedAnd_np(long volatile *_Value, long _Mask);
+short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
+unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
+static __inline__
+unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
+long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
+                                    long _Comparand);
+unsigned char _InterlockedCompareExchange128(__int64 volatile *_Destination,
+                                             __int64 _ExchangeHigh,
+                                             __int64 _ExchangeLow,
+                                             __int64 *_ComparandResult);
+unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
+                                                __int64 _ExchangeHigh,
+                                                __int64 _ExchangeLow,
+                                                __int64 *_ComparandResult);
+short _InterlockedCompareExchange16_np(short volatile *_Destination,
+                                       short _Exchange, short _Comparand);
+__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
+                                                 __int64);
+__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
+                                         __int64 _Exchange, __int64 _Comparand);
+void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
+                                            void *_Exchange, void *_Comparand);
+long _InterlockedOr_np(long volatile *_Value, long _Mask);
+short _InterlockedOr16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedOr64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedOr8_np(char volatile *_Value, char _Mask);
+long _InterlockedXor_np(long volatile *_Value, long _Mask);
+short _InterlockedXor16_np(short volatile *_Value, short _Mask);
+__int64 _InterlockedXor64_np(__int64 volatile *_Value, __int64 _Mask);
+char _InterlockedXor8_np(char volatile *_Value, char _Mask);
+unsigned __int64 _rorx_u64(unsigned __int64, const unsigned int);
+__int64 _sarx_i64(__int64, unsigned int);
+unsigned __int64 _shlx_u64(unsigned __int64, unsigned int);
+unsigned __int64 _shrx_u64(unsigned __int64, unsigned int);
+static __inline__
+__int64 __mulh(__int64, __int64);
+static __inline__
+unsigned __int64 __umulh(unsigned __int64, unsigned __int64);
+static __inline__
+__int64 _mul128(__int64, __int64, __int64*);
+static __inline__
+unsigned __int64 _umul128(unsigned __int64,
+                          unsigned __int64,
+                          unsigned __int64*);
+
+#endif /* __x86_64__ */
+
+#if defined(__x86_64__) || defined(__arm__)
+
+static __inline__
+__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedExchange64(__int64 volatile *_Target, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeAdd64(__int64 volatile *_Addend, __int64 _Value);
+static __inline__
+__int64 _InterlockedExchangeSub64(__int64 volatile *_Subend, __int64 _Value);
+static __inline__
+__int64 _InterlockedIncrement64(__int64 volatile *_Addend);
+static __inline__
+__int64 _InterlockedOr64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedXor64(__int64 volatile *_Value, __int64 _Mask);
+static __inline__
+__int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
+
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest(long const *_BitBase, long _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1 << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset(long *_BitBase, long _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1 << _BitPos);
+  return _Res;
+}
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
+  return (_PrevVal >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
+  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
+#ifdef __x86_64__
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
+  return (*_BitBase >> _BitPos) & 1;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase ^ (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase & ~(1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
+  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
+  *_BitBase = *_BitBase | (1ll << _BitPos);
+  return _Res;
+}
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
+  long long _PrevVal =
+      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
+  return (_PrevVal >> _BitPos) & 1;
+}
+#endif
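A quick, illustrative sketch of the bit-test helpers defined just above (demo_flags and demo_bits are made-up names; the helpers mirror the MSVC intrinsics of the same names and assume a clang-cl / MSVC-compatible build):

    #include <intrin.h>

    static long demo_flags;

    void demo_bits(void)
    {
      _bittestandset(&demo_flags, 3);           /* set bit 3, old value ignored */
      if (_bittest(&demo_flags, 3))
        _bittestandcomplement(&demo_flags, 3);  /* toggle it back off */
      _bittestandreset(&demo_flags, 0);         /* ensure bit 0 is clear */
    }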
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange Add
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_acq(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_nf(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd8_rel(char volatile *_Addend, char _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_acq(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_nf(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd16_rel(short volatile *_Addend, short _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_acq(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_nf(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd_rel(long volatile *_Addend, long _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_acq(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_nf(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchangeAdd64_rel(__int64 volatile *_Addend, __int64 _Value) {
+  return __atomic_fetch_add(_Addend, _Value, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Increment
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_acq(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_nf(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedIncrement16_rel(short volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_acq(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_nf(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedIncrement_rel(long volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_acq(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_nf(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedIncrement64_rel(__int64 volatile *_Value) {
+  return __atomic_add_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Decrement
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_acq(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_nf(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedDecrement16_rel(short volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_acq(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_nf(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedDecrement_rel(long volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_acq(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_nf(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedDecrement64_rel(__int64 volatile *_Value) {
+  return __atomic_sub_fetch(_Value, 1, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked And
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedAnd8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedAnd16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedAnd_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_and(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Or
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedOr8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedOr16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedOr_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedOr64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_or(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Xor
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_acq(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_nf(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedXor8_rel(char volatile *_Value, char _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_acq(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_nf(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedXor16_rel(short volatile *_Value, short _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_acq(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_nf(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedXor_rel(long volatile *_Value, long _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_acq(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_ACQUIRE);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_nf(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELAXED);
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedXor64_rel(__int64 volatile *_Value, __int64 _Mask) {
+  return __atomic_fetch_xor(_Value, _Mask, __ATOMIC_RELEASE);
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_acq(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_nf(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedExchange8_rel(char volatile *_Target, char _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_acq(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_nf(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedExchange16_rel(short volatile *_Target, short _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_acq(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_nf(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_rel(long volatile *_Target, long _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_acq(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_ACQUIRE);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_nf(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELAXED);
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_rel(__int64 volatile *_Target, __int64 _Value) {
+  __atomic_exchange(_Target, &_Value, &_Value, __ATOMIC_RELEASE);
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_acq(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_nf(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ char __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange8_rel(char volatile *_Destination,
+                             char _Exchange, char _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_acq(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_nf(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ short __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange16_rel(short volatile *_Destination,
+                              short _Exchange, short _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_acq(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_nf(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_rel(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_acq(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_nf(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
+                            __ATOMIC_SEQ_CST, __ATOMIC_RELEASE);
+  return _Comparand;
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* movs, stos
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
+  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
+  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
+  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
+  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
+  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
+  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
+  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Misc
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuid(int __info[4], int __level) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level));
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__cpuidex(int __info[4], int __level, int __ecx) {
+  __asm__ ("cpuid" : "=a"(__info[0]), "=b" (__info[1]), "=c"(__info[2]), "=d"(__info[3])
+                   : "a"(__level), "c"(__ecx));
+}
+static __inline__ unsigned __int64 __cdecl __DEFAULT_FN_ATTRS
+_xgetbv(unsigned int __xcr_no) {
+  unsigned int __eax, __edx;
+  __asm__ ("xgetbv" : "=a" (__eax), "=d" (__edx) : "c" (__xcr_no));
+  return ((unsigned __int64)__edx << 32) | __eax;
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__halt(void) {
+  __asm__ volatile ("hlt");
+}
+static __inline__ void __DEFAULT_FN_ATTRS
+__nop(void) {
+  __asm__ volatile ("nop");
+}
+#endif
+
+/*----------------------------------------------------------------------------*\
+|* Privileged intrinsics
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__readmsr(unsigned long __register) {
+  // Loads the contents of a 64-bit model specific register (MSR) specified in
+  // the ECX register into registers EDX:EAX. The EDX register is loaded with
+  // the high-order 32 bits of the MSR and the EAX register is loaded with the
+  // low-order 32 bits. If less than 64 bits are implemented in the MSR being
+  // read, the values returned to EDX:EAX in unimplemented bit locations are
+  // undefined.
+  unsigned long __edx;
+  unsigned long __eax;
+  __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
+  return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
+}
+
+static __inline__ unsigned long __DEFAULT_FN_ATTRS
+__readcr3(void) {
+  unsigned long __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  return __cr3_val;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writecr3(unsigned int __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __INTRIN_H */
+#endif /* _MSC_VER */
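As a rough usage sketch (not part of the header itself): the _acq/_nf/_rel interlocked variants above are only defined when targeting ARM/AArch64, where they map directly onto the __atomic builtins. A hypothetical caller might look like this:

    #include <intrin.h>
    #include <stdio.h>

    int main(void) {
      volatile long counter = 0;

      /* Fetch-and-add with acquire ordering; returns the previous value (0). */
      long before = _InterlockedExchangeAdd_acq(&counter, 5);

      /* Compare-and-swap with release ordering: store 6 only if counter is still 5. */
      long seen = _InterlockedCompareExchange_rel(&counter, 6, 5);

      printf("before=%ld seen=%ld counter=%ld\n", before, seen, counter);
      return 0;
    }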

+ 106 - 0
demo/include/inttypes.h

@@ -0,0 +1,106 @@
+/*===---- inttypes.h - Standard header for integer printf macros ----------===*\
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_INTTYPES_H
+#define __CLANG_INTTYPES_H
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#error MSVC does not have inttypes.h prior to Visual Studio 2013
+#endif
+
+#include_next <inttypes.h>
+
+#if defined(_MSC_VER) && _MSC_VER < 1900
+/* MSVC headers define int32_t as int, but PRIx32 as "lx" instead of "x".
+ * This triggers format warnings, so fix it up here. */
+#undef PRId32
+#undef PRIdLEAST32
+#undef PRIdFAST32
+#undef PRIi32
+#undef PRIiLEAST32
+#undef PRIiFAST32
+#undef PRIo32
+#undef PRIoLEAST32
+#undef PRIoFAST32
+#undef PRIu32
+#undef PRIuLEAST32
+#undef PRIuFAST32
+#undef PRIx32
+#undef PRIxLEAST32
+#undef PRIxFAST32
+#undef PRIX32
+#undef PRIXLEAST32
+#undef PRIXFAST32
+
+#undef SCNd32
+#undef SCNdLEAST32
+#undef SCNdFAST32
+#undef SCNi32
+#undef SCNiLEAST32
+#undef SCNiFAST32
+#undef SCNo32
+#undef SCNoLEAST32
+#undef SCNoFAST32
+#undef SCNu32
+#undef SCNuLEAST32
+#undef SCNuFAST32
+#undef SCNx32
+#undef SCNxLEAST32
+#undef SCNxFAST32
+
+#define PRId32 "d"
+#define PRIdLEAST32 "d"
+#define PRIdFAST32 "d"
+#define PRIi32 "i"
+#define PRIiLEAST32 "i"
+#define PRIiFAST32 "i"
+#define PRIo32 "o"
+#define PRIoLEAST32 "o"
+#define PRIoFAST32 "o"
+#define PRIu32 "u"
+#define PRIuLEAST32 "u"
+#define PRIuFAST32 "u"
+#define PRIx32 "x"
+#define PRIxLEAST32 "x"
+#define PRIxFAST32 "x"
+#define PRIX32 "X"
+#define PRIXLEAST32 "X"
+#define PRIXFAST32 "X"
+
+#define SCNd32 "d"
+#define SCNdLEAST32 "d"
+#define SCNdFAST32 "d"
+#define SCNi32 "i"
+#define SCNiLEAST32 "i"
+#define SCNiFAST32 "i"
+#define SCNo32 "o"
+#define SCNoLEAST32 "o"
+#define SCNoFAST32 "o"
+#define SCNu32 "u"
+#define SCNuLEAST32 "u"
+#define SCNuFAST32 "u"
+#define SCNx32 "x"
+#define SCNxLEAST32 "x"
+#define SCNxFAST32 "x"
+#endif
+
+#endif /* __CLANG_INTTYPES_H */
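A minimal sketch of these format macros in use (plain ISO C; the redefinitions above only matter on pre-2015 MSVC, where int32_t is int but PRIx32 expanded to "lx"):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
      int32_t n = -42;
      uint32_t h = 0xDEADBEEFu;

      /* PRId32 / PRIX32 expand to the right conversion spec for exactly-32-bit types. */
      printf("n = %" PRId32 ", h = 0x%" PRIX32 "\n", n, h);
      return 0;
    }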

+ 43 - 0
demo/include/iso646.h

@@ -0,0 +1,43 @@
+/*===---- iso646.h - Standard header for alternate spellings of operators---===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ISO646_H
+#define __ISO646_H
+
+#ifndef __cplusplus
+#define and    &&
+#define and_eq &=
+#define bitand &
+#define bitor  |
+#define compl  ~
+#define not    !
+#define not_eq !=
+#define or     ||
+#define or_eq  |=
+#define xor    ^
+#define xor_eq ^=
+#endif
+
+#endif /* __ISO646_H */
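A tiny illustration of the alternate spellings (macros in C; C++ already treats them as keywords, hence the #ifndef __cplusplus guard above):

    #include <iso646.h>
    #include <stdio.h>

    int main(void) {
      int a = 3, b = 5;

      /* Equivalent to: if (a != b && (a & 1)) */
      if (a not_eq b and (a bitand 1))
        printf("a is odd and differs from b\n");
      return 0;
    }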

+ 118 - 0
demo/include/limits.h

@@ -0,0 +1,118 @@
+/*===---- limits.h - Standard header for integer sizes --------------------===*\
+ *
+ * Copyright (c) 2009 Chris Lattner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifndef __CLANG_LIMITS_H
+#define __CLANG_LIMITS_H
+
+/* The system's limits.h may, in turn, try to #include_next GCC's limits.h.
+   Avert this #include_next madness. */
+#if defined __GNUC__ && !defined _GCC_LIMITS_H_
+#define _GCC_LIMITS_H_
+#endif
+
+/* System headers include a number of constants from POSIX in <limits.h>.
+   Include it if we're hosted. */
+#if __STDC_HOSTED__ && __has_include_next(<limits.h>)
+#include_next <limits.h>
+#endif
+
+/* Many system headers try to "help us out" by defining these.  No really, we
+   know how big each datatype is. */
+#undef  SCHAR_MIN
+#undef  SCHAR_MAX
+#undef  UCHAR_MAX
+#undef  SHRT_MIN
+#undef  SHRT_MAX
+#undef  USHRT_MAX
+#undef  INT_MIN
+#undef  INT_MAX
+#undef  UINT_MAX
+#undef  LONG_MIN
+#undef  LONG_MAX
+#undef  ULONG_MAX
+
+#undef  CHAR_BIT
+#undef  CHAR_MIN
+#undef  CHAR_MAX
+
+/* C90/99 5.2.4.2.1 */
+#define SCHAR_MAX __SCHAR_MAX__
+#define SHRT_MAX  __SHRT_MAX__
+#define INT_MAX   __INT_MAX__
+#define LONG_MAX  __LONG_MAX__
+
+#define SCHAR_MIN (-__SCHAR_MAX__-1)
+#define SHRT_MIN  (-__SHRT_MAX__ -1)
+#define INT_MIN   (-__INT_MAX__  -1)
+#define LONG_MIN  (-__LONG_MAX__ -1L)
+
+#define UCHAR_MAX (__SCHAR_MAX__*2  +1)
+#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#define UINT_MAX  (__INT_MAX__  *2U +1U)
+#define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
+
+#ifndef MB_LEN_MAX
+#define MB_LEN_MAX 1
+#endif
+
+#define CHAR_BIT  __CHAR_BIT__
+
+#ifdef __CHAR_UNSIGNED__  /* -funsigned-char */
+#define CHAR_MIN 0
+#define CHAR_MAX UCHAR_MAX
+#else
+#define CHAR_MIN SCHAR_MIN
+#define CHAR_MAX __SCHAR_MAX__
+#endif
+
+/* C99 5.2.4.2.1: Added long long.
+   C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
+ */
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+
+#undef  LLONG_MIN
+#undef  LLONG_MAX
+#undef  ULLONG_MAX
+
+#define LLONG_MAX  __LONG_LONG_MAX__
+#define LLONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULLONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+/* LONG_LONG_MIN/LONG_LONG_MAX/ULONG_LONG_MAX are a GNU extension.  It's too bad
+   that we don't have something like #pragma poison that could be used to
+   deprecate a macro - the code should just use LLONG_MAX and friends.
+ */
+#if defined(__GNU_LIBRARY__) ? defined(__USE_GNU) : !defined(__STRICT_ANSI__)
+
+#undef   LONG_LONG_MIN
+#undef   LONG_LONG_MAX
+#undef   ULONG_LONG_MAX
+
+#define LONG_LONG_MAX  __LONG_LONG_MAX__
+#define LONG_LONG_MIN  (-__LONG_LONG_MAX__-1LL)
+#define ULONG_LONG_MAX (__LONG_LONG_MAX__*2ULL+1ULL)
+#endif
+
+#endif /* __CLANG_LIMITS_H */
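A quick sketch showing that the macros resolve to the compiler's own __*_MAX__ values:

    #include <limits.h>
    #include <stdio.h>

    int main(void) {
      printf("CHAR_BIT  = %d\n", CHAR_BIT);
      printf("INT_MAX   = %d\n", INT_MAX);
      printf("LONG_MIN  = %ld\n", LONG_MIN);
      printf("ULONG_MAX = %lu\n", ULONG_MAX);
      return 0;
    }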

+ 150 - 0
demo/include/lwpintrin.h

@@ -0,0 +1,150 @@
+/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LWPINTRIN_H
+#define __LWPINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
+
+/// \brief Parses the LWPCB at the specified address and enables
+///        profiling if valid.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
+///
+/// \param __addr
+///    Address to the new Lightweight Profiling Control Block (LWPCB). If the
+///    LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
+///    Lightweight Profiling.
+static __inline__ void __DEFAULT_FN_ATTRS
+__llwpcb (void *__addr)
+{
+  __builtin_ia32_llwpcb(__addr);
+}
+
+/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
+///
+/// \return
+///    Address to the current Lightweight Profiling Control Block (LWPCB).
+///    If LWP is not currently enabled, returns NULL.
+static __inline__ void* __DEFAULT_FN_ATTRS
+__slwpcb ()
+{
+  return __builtin_ia32_slwpcb();
+}
+
+/// \brief Inserts programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#ifdef __x86_64__
+
+/// \brief Inserts programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LWPINTRIN_H */

+ 118 - 0
demo/include/lzcntintrin.h

@@ -0,0 +1,118 @@
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned short __DEFAULT_FN_ATTRS
+__lzcnt16(unsigned short __X)
+{
+  return __X ? __builtin_clzs(__X) : 16;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+  return __X ? __builtin_clz(__X) : 32;
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__lzcnt64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+
+/// \brief Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+///    bits in the operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+  return __X ? __builtin_clzll(__X) : 64;
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
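A small caller sketch (assumes compiling with -mlzcnt, or an equivalent target feature, so the intrinsic is legal to call):

    #include <x86intrin.h>
    #include <stdio.h>

    int main(void) {
      unsigned int x = 0x00010000u;   /* highest set bit is bit 16 */

      printf("lzcnt(0x%08X) = %u\n", x, _lzcnt_u32(x));   /* 15 */

      /* Unlike a bare __builtin_clz, the zero case is well defined: full width, 32. */
      printf("lzcnt(0) = %u\n", _lzcnt_u32(0));
      return 0;
    }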

+ 171 - 0
demo/include/mm3dnow.h

@@ -0,0 +1,171 @@
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_femms(void) {
+  __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+  return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+  return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+  return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+  return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+  return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 75 - 0
demo/include/mm_malloc.h

@@ -0,0 +1,75 @@
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+                                       __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+  if (__align == 1) {
+    return malloc(__size);
+  }
+
+  if (!(__align & (__align - 1)) && __align < sizeof(void *))
+    __align = sizeof(void *);
+
+  void *__mallocedMemory;
+#if defined(__MINGW32__)
+  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+  __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+  if (posix_memalign(&__mallocedMemory, __align, __size))
+    return 0;
+#endif
+
+  return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+  free(__p);
+}
+#endif
+
+#endif /* __MM_MALLOC_H */
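A minimal sketch of the aligned-allocation pair above; the one hard rule is that memory from _mm_malloc must be released with _mm_free, never plain free():

    #include <mm_malloc.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* 1 KiB aligned to 64 bytes (e.g. a cache line). */
      void *buf = _mm_malloc(1024, 64);
      if (!buf)
        return 1;

      printf("aligned to 64: %s\n", ((uintptr_t)buf % 64 == 0) ? "yes" : "no");

      _mm_free(buf);   /* not free() */
      return 0;
    }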

File diff is too large to display
+ 1570 - 0
demo/include/mmintrin.h


+ 167 - 0
demo/include/module.modulemap

@@ -0,0 +1,167 @@
+/*===---- module.modulemap - intrinsics module map -------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+module _Builtin_intrinsics [system] [extern_c] {
+  explicit module altivec {
+    requires altivec
+    header "altivec.h"
+  }
+
+  explicit module arm {
+    requires arm
+
+    explicit module acle {
+      header "arm_acle.h"
+      export *
+    }
+
+    explicit module neon {
+      requires neon
+      header "arm_neon.h"
+      export *
+    }
+  }
+
+  explicit module intel {
+    requires x86
+    export *
+
+    header "immintrin.h"
+    textual header "f16cintrin.h"
+    textual header "avxintrin.h"
+    textual header "avx2intrin.h"
+    textual header "avx512fintrin.h"
+    textual header "avx512erintrin.h"
+    textual header "fmaintrin.h"
+
+    header "x86intrin.h"
+    textual header "bmiintrin.h"
+    textual header "bmi2intrin.h"
+    textual header "lzcntintrin.h"
+    textual header "xopintrin.h"
+    textual header "fma4intrin.h"
+    textual header "mwaitxintrin.h"
+    textual header "clzerointrin.h"
+
+    explicit module mm_malloc {
+      requires !freestanding
+      header "mm_malloc.h"
+      export * // note: for <stdlib.h> dependency
+    }
+
+    explicit module cpuid {
+      requires gnuinlineasm
+      header "cpuid.h"
+    }
+
+    explicit module mmx {
+      header "mmintrin.h"
+    }
+
+    explicit module sse {
+      export mm_malloc
+      export mmx
+      export sse2 // note: for hackish <emmintrin.h> dependency
+      header "xmmintrin.h"
+    }
+
+    explicit module sse2 {
+      export sse
+      header "emmintrin.h"
+    }
+
+    explicit module sse3 {
+      export sse2
+      header "pmmintrin.h"
+    }
+
+    explicit module ssse3 {
+      export sse3
+      header "tmmintrin.h"
+    }
+
+    explicit module sse4_1 {
+      export ssse3
+      header "smmintrin.h"
+    }
+
+    explicit module sse4_2 {
+      export sse4_1
+      header "nmmintrin.h"
+    }
+
+    explicit module sse4a {
+      export sse3
+      header "ammintrin.h"
+    }
+
+    explicit module popcnt {
+      header "popcntintrin.h"
+    }
+
+    explicit module mm3dnow {
+      header "mm3dnow.h"
+    }
+
+    explicit module aes_pclmul {
+      header "wmmintrin.h"
+      export aes
+      export pclmul
+    }
+
+    explicit module aes {
+      header "__wmmintrin_aes.h"
+    }
+
+    explicit module pclmul {
+      header "__wmmintrin_pclmul.h"
+    }
+  }
+
+  explicit module systemz {
+    requires systemz
+    export *
+
+    header "s390intrin.h"
+
+    explicit module htm {
+      requires htm
+      header "htmintrin.h"
+      header "htmxlintrin.h"
+    }
+
+    explicit module zvector {
+      requires zvector, vx
+      header "vecintrin.h"
+    }
+  }
+}
+
+module _Builtin_stddef_max_align_t [system] [extern_c] {
+  header "__stddef_max_align_t.h"
+}
+
+module opencl_c {
+  requires opencl
+  header "opencl-c.h"
+}

+ 583 - 0
demo/include/msa.h

@@ -0,0 +1,583 @@
+/*===---- msa.h - MIPS MSA intrinsics --------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MSA_H
+#define _MSA_H 1
+
+#if defined(__mips_msa)
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8_b __attribute__((vector_size(16), aligned(1)));
+typedef short v8i16 __attribute__((vector_size(16), aligned(16)));
+typedef short v8i16_h __attribute__((vector_size(16), aligned(2)));
+typedef unsigned short v8u16 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned short v8u16_h __attribute__((vector_size(16), aligned(2)));
+typedef int v4i32 __attribute__((vector_size(16), aligned(16)));
+typedef int v4i32_w __attribute__((vector_size(16), aligned(4)));
+typedef unsigned int v4u32 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned int v4u32_w __attribute__((vector_size(16), aligned(4)));
+typedef long long v2i64 __attribute__((vector_size(16), aligned(16)));
+typedef long long v2i64_d __attribute__((vector_size(16), aligned(8)));
+typedef unsigned long long v2u64 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned long long v2u64_d __attribute__((vector_size(16), aligned(8)));
+typedef float v4f32 __attribute__((vector_size(16), aligned(16)));
+typedef float v4f32_w __attribute__((vector_size(16), aligned(4)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64_d __attribute__ ((vector_size(16), aligned(8)));
+
+#define __msa_sll_b __builtin_msa_sll_b
+#define __msa_sll_h __builtin_msa_sll_h
+#define __msa_sll_w __builtin_msa_sll_w
+#define __msa_sll_d __builtin_msa_sll_d
+#define __msa_slli_b __builtin_msa_slli_b
+#define __msa_slli_h __builtin_msa_slli_h
+#define __msa_slli_w __builtin_msa_slli_w
+#define __msa_slli_d __builtin_msa_slli_d
+#define __msa_sra_b __builtin_msa_sra_b
+#define __msa_sra_h __builtin_msa_sra_h
+#define __msa_sra_w __builtin_msa_sra_w
+#define __msa_sra_d __builtin_msa_sra_d
+#define __msa_srai_b __builtin_msa_srai_b
+#define __msa_srai_h __builtin_msa_srai_h
+#define __msa_srai_w __builtin_msa_srai_w
+#define __msa_srai_d __builtin_msa_srai_d
+#define __msa_srar_b __builtin_msa_srar_b
+#define __msa_srar_h __builtin_msa_srar_h
+#define __msa_srar_w __builtin_msa_srar_w
+#define __msa_srar_d __builtin_msa_srar_d
+#define __msa_srari_b __builtin_msa_srari_b
+#define __msa_srari_h __builtin_msa_srari_h
+#define __msa_srari_w __builtin_msa_srari_w
+#define __msa_srari_d __builtin_msa_srari_d
+#define __msa_srl_b __builtin_msa_srl_b
+#define __msa_srl_h __builtin_msa_srl_h
+#define __msa_srl_w __builtin_msa_srl_w
+#define __msa_srl_d __builtin_msa_srl_d
+#define __msa_srli_b __builtin_msa_srli_b
+#define __msa_srli_h __builtin_msa_srli_h
+#define __msa_srli_w __builtin_msa_srli_w
+#define __msa_srli_d __builtin_msa_srli_d
+#define __msa_srlr_b __builtin_msa_srlr_b
+#define __msa_srlr_h __builtin_msa_srlr_h
+#define __msa_srlr_w __builtin_msa_srlr_w
+#define __msa_srlr_d __builtin_msa_srlr_d
+#define __msa_srlri_b __builtin_msa_srlri_b
+#define __msa_srlri_h __builtin_msa_srlri_h
+#define __msa_srlri_w __builtin_msa_srlri_w
+#define __msa_srlri_d __builtin_msa_srlri_d
+#define __msa_bclr_b __builtin_msa_bclr_b
+#define __msa_bclr_h __builtin_msa_bclr_h
+#define __msa_bclr_w __builtin_msa_bclr_w
+#define __msa_bclr_d __builtin_msa_bclr_d
+#define __msa_bclri_b __builtin_msa_bclri_b
+#define __msa_bclri_h __builtin_msa_bclri_h
+#define __msa_bclri_w __builtin_msa_bclri_w
+#define __msa_bclri_d __builtin_msa_bclri_d
+#define __msa_bset_b __builtin_msa_bset_b
+#define __msa_bset_h __builtin_msa_bset_h
+#define __msa_bset_w __builtin_msa_bset_w
+#define __msa_bset_d __builtin_msa_bset_d
+#define __msa_bseti_b __builtin_msa_bseti_b
+#define __msa_bseti_h __builtin_msa_bseti_h
+#define __msa_bseti_w __builtin_msa_bseti_w
+#define __msa_bseti_d __builtin_msa_bseti_d
+#define __msa_bneg_b __builtin_msa_bneg_b
+#define __msa_bneg_h __builtin_msa_bneg_h
+#define __msa_bneg_w __builtin_msa_bneg_w
+#define __msa_bneg_d __builtin_msa_bneg_d
+#define __msa_bnegi_b __builtin_msa_bnegi_b
+#define __msa_bnegi_h __builtin_msa_bnegi_h
+#define __msa_bnegi_w __builtin_msa_bnegi_w
+#define __msa_bnegi_d __builtin_msa_bnegi_d
+#define __msa_binsl_b __builtin_msa_binsl_b
+#define __msa_binsl_h __builtin_msa_binsl_h
+#define __msa_binsl_w __builtin_msa_binsl_w
+#define __msa_binsl_d __builtin_msa_binsl_d
+#define __msa_binsli_b __builtin_msa_binsli_b
+#define __msa_binsli_h __builtin_msa_binsli_h
+#define __msa_binsli_w __builtin_msa_binsli_w
+#define __msa_binsli_d __builtin_msa_binsli_d
+#define __msa_binsr_b __builtin_msa_binsr_b
+#define __msa_binsr_h __builtin_msa_binsr_h
+#define __msa_binsr_w __builtin_msa_binsr_w
+#define __msa_binsr_d __builtin_msa_binsr_d
+#define __msa_binsri_b __builtin_msa_binsri_b
+#define __msa_binsri_h __builtin_msa_binsri_h
+#define __msa_binsri_w __builtin_msa_binsri_w
+#define __msa_binsri_d __builtin_msa_binsri_d
+#define __msa_addv_b __builtin_msa_addv_b
+#define __msa_addv_h __builtin_msa_addv_h
+#define __msa_addv_w __builtin_msa_addv_w
+#define __msa_addv_d __builtin_msa_addv_d
+#define __msa_addvi_b __builtin_msa_addvi_b
+#define __msa_addvi_h __builtin_msa_addvi_h
+#define __msa_addvi_w __builtin_msa_addvi_w
+#define __msa_addvi_d __builtin_msa_addvi_d
+#define __msa_subv_b __builtin_msa_subv_b
+#define __msa_subv_h __builtin_msa_subv_h
+#define __msa_subv_w __builtin_msa_subv_w
+#define __msa_subv_d __builtin_msa_subv_d
+#define __msa_subvi_b __builtin_msa_subvi_b
+#define __msa_subvi_h __builtin_msa_subvi_h
+#define __msa_subvi_w __builtin_msa_subvi_w
+#define __msa_subvi_d __builtin_msa_subvi_d
+#define __msa_max_s_b __builtin_msa_max_s_b
+#define __msa_max_s_h __builtin_msa_max_s_h
+#define __msa_max_s_w __builtin_msa_max_s_w
+#define __msa_max_s_d __builtin_msa_max_s_d
+#define __msa_maxi_s_b __builtin_msa_maxi_s_b
+#define __msa_maxi_s_h __builtin_msa_maxi_s_h
+#define __msa_maxi_s_w __builtin_msa_maxi_s_w
+#define __msa_maxi_s_d __builtin_msa_maxi_s_d
+#define __msa_max_u_b __builtin_msa_max_u_b
+#define __msa_max_u_h __builtin_msa_max_u_h
+#define __msa_max_u_w __builtin_msa_max_u_w
+#define __msa_max_u_d __builtin_msa_max_u_d
+#define __msa_maxi_u_b __builtin_msa_maxi_u_b
+#define __msa_maxi_u_h __builtin_msa_maxi_u_h
+#define __msa_maxi_u_w __builtin_msa_maxi_u_w
+#define __msa_maxi_u_d __builtin_msa_maxi_u_d
+#define __msa_min_s_b __builtin_msa_min_s_b
+#define __msa_min_s_h __builtin_msa_min_s_h
+#define __msa_min_s_w __builtin_msa_min_s_w
+#define __msa_min_s_d __builtin_msa_min_s_d
+#define __msa_mini_s_b __builtin_msa_mini_s_b
+#define __msa_mini_s_h __builtin_msa_mini_s_h
+#define __msa_mini_s_w __builtin_msa_mini_s_w
+#define __msa_mini_s_d __builtin_msa_mini_s_d
+#define __msa_min_u_b __builtin_msa_min_u_b
+#define __msa_min_u_h __builtin_msa_min_u_h
+#define __msa_min_u_w __builtin_msa_min_u_w
+#define __msa_min_u_d __builtin_msa_min_u_d
+#define __msa_mini_u_b __builtin_msa_mini_u_b
+#define __msa_mini_u_h __builtin_msa_mini_u_h
+#define __msa_mini_u_w __builtin_msa_mini_u_w
+#define __msa_mini_u_d __builtin_msa_mini_u_d
+#define __msa_max_a_b __builtin_msa_max_a_b
+#define __msa_max_a_h __builtin_msa_max_a_h
+#define __msa_max_a_w __builtin_msa_max_a_w
+#define __msa_max_a_d __builtin_msa_max_a_d
+#define __msa_min_a_b __builtin_msa_min_a_b
+#define __msa_min_a_h __builtin_msa_min_a_h
+#define __msa_min_a_w __builtin_msa_min_a_w
+#define __msa_min_a_d __builtin_msa_min_a_d
+#define __msa_ceq_b __builtin_msa_ceq_b
+#define __msa_ceq_h __builtin_msa_ceq_h
+#define __msa_ceq_w __builtin_msa_ceq_w
+#define __msa_ceq_d __builtin_msa_ceq_d
+#define __msa_ceqi_b __builtin_msa_ceqi_b
+#define __msa_ceqi_h __builtin_msa_ceqi_h
+#define __msa_ceqi_w __builtin_msa_ceqi_w
+#define __msa_ceqi_d __builtin_msa_ceqi_d
+#define __msa_clt_s_b __builtin_msa_clt_s_b
+#define __msa_clt_s_h __builtin_msa_clt_s_h
+#define __msa_clt_s_w __builtin_msa_clt_s_w
+#define __msa_clt_s_d __builtin_msa_clt_s_d
+#define __msa_clti_s_b __builtin_msa_clti_s_b
+#define __msa_clti_s_h __builtin_msa_clti_s_h
+#define __msa_clti_s_w __builtin_msa_clti_s_w
+#define __msa_clti_s_d __builtin_msa_clti_s_d
+#define __msa_clt_u_b __builtin_msa_clt_u_b
+#define __msa_clt_u_h __builtin_msa_clt_u_h
+#define __msa_clt_u_w __builtin_msa_clt_u_w
+#define __msa_clt_u_d __builtin_msa_clt_u_d
+#define __msa_clti_u_b __builtin_msa_clti_u_b
+#define __msa_clti_u_h __builtin_msa_clti_u_h
+#define __msa_clti_u_w __builtin_msa_clti_u_w
+#define __msa_clti_u_d __builtin_msa_clti_u_d
+#define __msa_cle_s_b __builtin_msa_cle_s_b
+#define __msa_cle_s_h __builtin_msa_cle_s_h
+#define __msa_cle_s_w __builtin_msa_cle_s_w
+#define __msa_cle_s_d __builtin_msa_cle_s_d
+#define __msa_clei_s_b __builtin_msa_clei_s_b
+#define __msa_clei_s_h __builtin_msa_clei_s_h
+#define __msa_clei_s_w __builtin_msa_clei_s_w
+#define __msa_clei_s_d __builtin_msa_clei_s_d
+#define __msa_cle_u_b __builtin_msa_cle_u_b
+#define __msa_cle_u_h __builtin_msa_cle_u_h
+#define __msa_cle_u_w __builtin_msa_cle_u_w
+#define __msa_cle_u_d __builtin_msa_cle_u_d
+#define __msa_clei_u_b __builtin_msa_clei_u_b
+#define __msa_clei_u_h __builtin_msa_clei_u_h
+#define __msa_clei_u_w __builtin_msa_clei_u_w
+#define __msa_clei_u_d __builtin_msa_clei_u_d
+#define __msa_ld_b __builtin_msa_ld_b
+#define __msa_ld_h __builtin_msa_ld_h
+#define __msa_ld_w __builtin_msa_ld_w
+#define __msa_ld_d __builtin_msa_ld_d
+#define __msa_st_b __builtin_msa_st_b
+#define __msa_st_h __builtin_msa_st_h
+#define __msa_st_w __builtin_msa_st_w
+#define __msa_st_d __builtin_msa_st_d
+#define __msa_sat_s_b __builtin_msa_sat_s_b
+#define __msa_sat_s_h __builtin_msa_sat_s_h
+#define __msa_sat_s_w __builtin_msa_sat_s_w
+#define __msa_sat_s_d __builtin_msa_sat_s_d
+#define __msa_sat_u_b __builtin_msa_sat_u_b
+#define __msa_sat_u_h __builtin_msa_sat_u_h
+#define __msa_sat_u_w __builtin_msa_sat_u_w
+#define __msa_sat_u_d __builtin_msa_sat_u_d
+#define __msa_add_a_b __builtin_msa_add_a_b
+#define __msa_add_a_h __builtin_msa_add_a_h
+#define __msa_add_a_w __builtin_msa_add_a_w
+#define __msa_add_a_d __builtin_msa_add_a_d
+#define __msa_adds_a_b __builtin_msa_adds_a_b
+#define __msa_adds_a_h __builtin_msa_adds_a_h
+#define __msa_adds_a_w __builtin_msa_adds_a_w
+#define __msa_adds_a_d __builtin_msa_adds_a_d
+#define __msa_adds_s_b __builtin_msa_adds_s_b
+#define __msa_adds_s_h __builtin_msa_adds_s_h
+#define __msa_adds_s_w __builtin_msa_adds_s_w
+#define __msa_adds_s_d __builtin_msa_adds_s_d
+#define __msa_adds_u_b __builtin_msa_adds_u_b
+#define __msa_adds_u_h __builtin_msa_adds_u_h
+#define __msa_adds_u_w __builtin_msa_adds_u_w
+#define __msa_adds_u_d __builtin_msa_adds_u_d
+#define __msa_ave_s_b __builtin_msa_ave_s_b
+#define __msa_ave_s_h __builtin_msa_ave_s_h
+#define __msa_ave_s_w __builtin_msa_ave_s_w
+#define __msa_ave_s_d __builtin_msa_ave_s_d
+#define __msa_ave_u_b __builtin_msa_ave_u_b
+#define __msa_ave_u_h __builtin_msa_ave_u_h
+#define __msa_ave_u_w __builtin_msa_ave_u_w
+#define __msa_ave_u_d __builtin_msa_ave_u_d
+#define __msa_aver_s_b __builtin_msa_aver_s_b
+#define __msa_aver_s_h __builtin_msa_aver_s_h
+#define __msa_aver_s_w __builtin_msa_aver_s_w
+#define __msa_aver_s_d __builtin_msa_aver_s_d
+#define __msa_aver_u_b __builtin_msa_aver_u_b
+#define __msa_aver_u_h __builtin_msa_aver_u_h
+#define __msa_aver_u_w __builtin_msa_aver_u_w
+#define __msa_aver_u_d __builtin_msa_aver_u_d
+#define __msa_subs_s_b __builtin_msa_subs_s_b
+#define __msa_subs_s_h __builtin_msa_subs_s_h
+#define __msa_subs_s_w __builtin_msa_subs_s_w
+#define __msa_subs_s_d __builtin_msa_subs_s_d
+#define __msa_subs_u_b __builtin_msa_subs_u_b
+#define __msa_subs_u_h __builtin_msa_subs_u_h
+#define __msa_subs_u_w __builtin_msa_subs_u_w
+#define __msa_subs_u_d __builtin_msa_subs_u_d
+#define __msa_subsuu_s_b __builtin_msa_subsuu_s_b
+#define __msa_subsuu_s_h __builtin_msa_subsuu_s_h
+#define __msa_subsuu_s_w __builtin_msa_subsuu_s_w
+#define __msa_subsuu_s_d __builtin_msa_subsuu_s_d
+#define __msa_subsus_u_b __builtin_msa_subsus_u_b
+#define __msa_subsus_u_h __builtin_msa_subsus_u_h
+#define __msa_subsus_u_w __builtin_msa_subsus_u_w
+#define __msa_subsus_u_d __builtin_msa_subsus_u_d
+#define __msa_asub_s_b __builtin_msa_asub_s_b
+#define __msa_asub_s_h __builtin_msa_asub_s_h
+#define __msa_asub_s_w __builtin_msa_asub_s_w
+#define __msa_asub_s_d __builtin_msa_asub_s_d
+#define __msa_asub_u_b __builtin_msa_asub_u_b
+#define __msa_asub_u_h __builtin_msa_asub_u_h
+#define __msa_asub_u_w __builtin_msa_asub_u_w
+#define __msa_asub_u_d __builtin_msa_asub_u_d
+#define __msa_mulv_b __builtin_msa_mulv_b
+#define __msa_mulv_h __builtin_msa_mulv_h
+#define __msa_mulv_w __builtin_msa_mulv_w
+#define __msa_mulv_d __builtin_msa_mulv_d
+#define __msa_maddv_b __builtin_msa_maddv_b
+#define __msa_maddv_h __builtin_msa_maddv_h
+#define __msa_maddv_w __builtin_msa_maddv_w
+#define __msa_maddv_d __builtin_msa_maddv_d
+#define __msa_msubv_b __builtin_msa_msubv_b
+#define __msa_msubv_h __builtin_msa_msubv_h
+#define __msa_msubv_w __builtin_msa_msubv_w
+#define __msa_msubv_d __builtin_msa_msubv_d
+#define __msa_div_s_b __builtin_msa_div_s_b
+#define __msa_div_s_h __builtin_msa_div_s_h
+#define __msa_div_s_w __builtin_msa_div_s_w
+#define __msa_div_s_d __builtin_msa_div_s_d
+#define __msa_div_u_b __builtin_msa_div_u_b
+#define __msa_div_u_h __builtin_msa_div_u_h
+#define __msa_div_u_w __builtin_msa_div_u_w
+#define __msa_div_u_d __builtin_msa_div_u_d
+#define __msa_hadd_s_h __builtin_msa_hadd_s_h
+#define __msa_hadd_s_w __builtin_msa_hadd_s_w
+#define __msa_hadd_s_d __builtin_msa_hadd_s_d
+#define __msa_hadd_u_h __builtin_msa_hadd_u_h
+#define __msa_hadd_u_w __builtin_msa_hadd_u_w
+#define __msa_hadd_u_d __builtin_msa_hadd_u_d
+#define __msa_hsub_s_h __builtin_msa_hsub_s_h
+#define __msa_hsub_s_w __builtin_msa_hsub_s_w
+#define __msa_hsub_s_d __builtin_msa_hsub_s_d
+#define __msa_hsub_u_h __builtin_msa_hsub_u_h
+#define __msa_hsub_u_w __builtin_msa_hsub_u_w
+#define __msa_hsub_u_d __builtin_msa_hsub_u_d
+#define __msa_mod_s_b __builtin_msa_mod_s_b
+#define __msa_mod_s_h __builtin_msa_mod_s_h
+#define __msa_mod_s_w __builtin_msa_mod_s_w
+#define __msa_mod_s_d __builtin_msa_mod_s_d
+#define __msa_mod_u_b __builtin_msa_mod_u_b
+#define __msa_mod_u_h __builtin_msa_mod_u_h
+#define __msa_mod_u_w __builtin_msa_mod_u_w
+#define __msa_mod_u_d __builtin_msa_mod_u_d
+#define __msa_dotp_s_h __builtin_msa_dotp_s_h
+#define __msa_dotp_s_w __builtin_msa_dotp_s_w
+#define __msa_dotp_s_d __builtin_msa_dotp_s_d
+#define __msa_dotp_u_h __builtin_msa_dotp_u_h
+#define __msa_dotp_u_w __builtin_msa_dotp_u_w
+#define __msa_dotp_u_d __builtin_msa_dotp_u_d
+#define __msa_dpadd_s_h __builtin_msa_dpadd_s_h
+#define __msa_dpadd_s_w __builtin_msa_dpadd_s_w
+#define __msa_dpadd_s_d __builtin_msa_dpadd_s_d
+#define __msa_dpadd_u_h __builtin_msa_dpadd_u_h
+#define __msa_dpadd_u_w __builtin_msa_dpadd_u_w
+#define __msa_dpadd_u_d __builtin_msa_dpadd_u_d
+#define __msa_dpsub_s_h __builtin_msa_dpsub_s_h
+#define __msa_dpsub_s_w __builtin_msa_dpsub_s_w
+#define __msa_dpsub_s_d __builtin_msa_dpsub_s_d
+#define __msa_dpsub_u_h __builtin_msa_dpsub_u_h
+#define __msa_dpsub_u_w __builtin_msa_dpsub_u_w
+#define __msa_dpsub_u_d __builtin_msa_dpsub_u_d
+#define __msa_sld_b __builtin_msa_sld_b
+#define __msa_sld_h __builtin_msa_sld_h
+#define __msa_sld_w __builtin_msa_sld_w
+#define __msa_sld_d __builtin_msa_sld_d
+#define __msa_sldi_b __builtin_msa_sldi_b
+#define __msa_sldi_h __builtin_msa_sldi_h
+#define __msa_sldi_w __builtin_msa_sldi_w
+#define __msa_sldi_d __builtin_msa_sldi_d
+#define __msa_splat_b __builtin_msa_splat_b
+#define __msa_splat_h __builtin_msa_splat_h
+#define __msa_splat_w __builtin_msa_splat_w
+#define __msa_splat_d __builtin_msa_splat_d
+#define __msa_splati_b __builtin_msa_splati_b
+#define __msa_splati_h __builtin_msa_splati_h
+#define __msa_splati_w __builtin_msa_splati_w
+#define __msa_splati_d __builtin_msa_splati_d
+#define __msa_pckev_b __builtin_msa_pckev_b
+#define __msa_pckev_h __builtin_msa_pckev_h
+#define __msa_pckev_w __builtin_msa_pckev_w
+#define __msa_pckev_d __builtin_msa_pckev_d
+#define __msa_pckod_b __builtin_msa_pckod_b
+#define __msa_pckod_h __builtin_msa_pckod_h
+#define __msa_pckod_w __builtin_msa_pckod_w
+#define __msa_pckod_d __builtin_msa_pckod_d
+#define __msa_ilvl_b __builtin_msa_ilvl_b
+#define __msa_ilvl_h __builtin_msa_ilvl_h
+#define __msa_ilvl_w __builtin_msa_ilvl_w
+#define __msa_ilvl_d __builtin_msa_ilvl_d
+#define __msa_ilvr_b __builtin_msa_ilvr_b
+#define __msa_ilvr_h __builtin_msa_ilvr_h
+#define __msa_ilvr_w __builtin_msa_ilvr_w
+#define __msa_ilvr_d __builtin_msa_ilvr_d
+#define __msa_ilvev_b __builtin_msa_ilvev_b
+#define __msa_ilvev_h __builtin_msa_ilvev_h
+#define __msa_ilvev_w __builtin_msa_ilvev_w
+#define __msa_ilvev_d __builtin_msa_ilvev_d
+#define __msa_ilvod_b __builtin_msa_ilvod_b
+#define __msa_ilvod_h __builtin_msa_ilvod_h
+#define __msa_ilvod_w __builtin_msa_ilvod_w
+#define __msa_ilvod_d __builtin_msa_ilvod_d
+#define __msa_vshf_b __builtin_msa_vshf_b
+#define __msa_vshf_h __builtin_msa_vshf_h
+#define __msa_vshf_w __builtin_msa_vshf_w
+#define __msa_vshf_d __builtin_msa_vshf_d
+#define __msa_and_v __builtin_msa_and_v
+#define __msa_andi_b __builtin_msa_andi_b
+#define __msa_or_v __builtin_msa_or_v
+#define __msa_ori_b __builtin_msa_ori_b
+#define __msa_nor_v __builtin_msa_nor_v
+#define __msa_nori_b __builtin_msa_nori_b
+#define __msa_xor_v __builtin_msa_xor_v
+#define __msa_xori_b __builtin_msa_xori_b
+#define __msa_bmnz_v __builtin_msa_bmnz_v
+#define __msa_bmnzi_b __builtin_msa_bmnzi_b
+#define __msa_bmz_v __builtin_msa_bmz_v
+#define __msa_bmzi_b __builtin_msa_bmzi_b
+#define __msa_bsel_v __builtin_msa_bsel_v
+#define __msa_bseli_b __builtin_msa_bseli_b
+#define __msa_shf_b __builtin_msa_shf_b
+#define __msa_shf_h __builtin_msa_shf_h
+#define __msa_shf_w __builtin_msa_shf_w
+#define __msa_test_bnz_v __builtin_msa_bnz_v
+#define __msa_test_bz_v __builtin_msa_bz_v
+#define __msa_fill_b __builtin_msa_fill_b
+#define __msa_fill_h __builtin_msa_fill_h
+#define __msa_fill_w __builtin_msa_fill_w
+#define __msa_fill_d __builtin_msa_fill_d
+#define __msa_pcnt_b __builtin_msa_pcnt_b
+#define __msa_pcnt_h __builtin_msa_pcnt_h
+#define __msa_pcnt_w __builtin_msa_pcnt_w
+#define __msa_pcnt_d __builtin_msa_pcnt_d
+#define __msa_nloc_b __builtin_msa_nloc_b
+#define __msa_nloc_h __builtin_msa_nloc_h
+#define __msa_nloc_w __builtin_msa_nloc_w
+#define __msa_nloc_d __builtin_msa_nloc_d
+#define __msa_nlzc_b __builtin_msa_nlzc_b
+#define __msa_nlzc_h __builtin_msa_nlzc_h
+#define __msa_nlzc_w __builtin_msa_nlzc_w
+#define __msa_nlzc_d __builtin_msa_nlzc_d
+#define __msa_copy_s_b __builtin_msa_copy_s_b
+#define __msa_copy_s_h __builtin_msa_copy_s_h
+#define __msa_copy_s_w __builtin_msa_copy_s_w
+#define __msa_copy_s_d __builtin_msa_copy_s_d
+#define __msa_copy_u_b __builtin_msa_copy_u_b
+#define __msa_copy_u_h __builtin_msa_copy_u_h
+#define __msa_copy_u_w __builtin_msa_copy_u_w
+#define __msa_copy_u_d __builtin_msa_copy_u_d
+#define __msa_insert_b __builtin_msa_insert_b
+#define __msa_insert_h __builtin_msa_insert_h
+#define __msa_insert_w __builtin_msa_insert_w
+#define __msa_insert_d __builtin_msa_insert_d
+#define __msa_insve_b __builtin_msa_insve_b
+#define __msa_insve_h __builtin_msa_insve_h
+#define __msa_insve_w __builtin_msa_insve_w
+#define __msa_insve_d __builtin_msa_insve_d
+#define __msa_test_bnz_b __builtin_msa_bnz_b
+#define __msa_test_bnz_h __builtin_msa_bnz_h
+#define __msa_test_bnz_w __builtin_msa_bnz_w
+#define __msa_test_bnz_d __builtin_msa_bnz_d
+#define __msa_test_bz_b __builtin_msa_bz_b
+#define __msa_test_bz_h __builtin_msa_bz_h
+#define __msa_test_bz_w __builtin_msa_bz_w
+#define __msa_test_bz_d __builtin_msa_bz_d
+#define __msa_ldi_b __builtin_msa_ldi_b
+#define __msa_ldi_h __builtin_msa_ldi_h
+#define __msa_ldi_w __builtin_msa_ldi_w
+#define __msa_ldi_d __builtin_msa_ldi_d
+#define __msa_fcaf_w __builtin_msa_fcaf_w
+#define __msa_fcaf_d __builtin_msa_fcaf_d
+#define __msa_fcor_w __builtin_msa_fcor_w
+#define __msa_fcor_d __builtin_msa_fcor_d
+#define __msa_fcun_w __builtin_msa_fcun_w
+#define __msa_fcun_d __builtin_msa_fcun_d
+#define __msa_fcune_w __builtin_msa_fcune_w
+#define __msa_fcune_d __builtin_msa_fcune_d
+#define __msa_fcueq_w __builtin_msa_fcueq_w
+#define __msa_fcueq_d __builtin_msa_fcueq_d
+#define __msa_fceq_w __builtin_msa_fceq_w
+#define __msa_fceq_d __builtin_msa_fceq_d
+#define __msa_fcne_w __builtin_msa_fcne_w
+#define __msa_fcne_d __builtin_msa_fcne_d
+#define __msa_fclt_w __builtin_msa_fclt_w
+#define __msa_fclt_d __builtin_msa_fclt_d
+#define __msa_fcult_w __builtin_msa_fcult_w
+#define __msa_fcult_d __builtin_msa_fcult_d
+#define __msa_fcle_w __builtin_msa_fcle_w
+#define __msa_fcle_d __builtin_msa_fcle_d
+#define __msa_fcule_w __builtin_msa_fcule_w
+#define __msa_fcule_d __builtin_msa_fcule_d
+#define __msa_fsaf_w __builtin_msa_fsaf_w
+#define __msa_fsaf_d __builtin_msa_fsaf_d
+#define __msa_fsor_w __builtin_msa_fsor_w
+#define __msa_fsor_d __builtin_msa_fsor_d
+#define __msa_fsun_w __builtin_msa_fsun_w
+#define __msa_fsun_d __builtin_msa_fsun_d
+#define __msa_fsune_w __builtin_msa_fsune_w
+#define __msa_fsune_d __builtin_msa_fsune_d
+#define __msa_fsueq_w __builtin_msa_fsueq_w
+#define __msa_fsueq_d __builtin_msa_fsueq_d
+#define __msa_fseq_w __builtin_msa_fseq_w
+#define __msa_fseq_d __builtin_msa_fseq_d
+#define __msa_fsne_w __builtin_msa_fsne_w
+#define __msa_fsne_d __builtin_msa_fsne_d
+#define __msa_fslt_w __builtin_msa_fslt_w
+#define __msa_fslt_d __builtin_msa_fslt_d
+#define __msa_fsult_w __builtin_msa_fsult_w
+#define __msa_fsult_d __builtin_msa_fsult_d
+#define __msa_fsle_w __builtin_msa_fsle_w
+#define __msa_fsle_d __builtin_msa_fsle_d
+#define __msa_fsule_w __builtin_msa_fsule_w
+#define __msa_fsule_d __builtin_msa_fsule_d
+#define __msa_fadd_w __builtin_msa_fadd_w
+#define __msa_fadd_d __builtin_msa_fadd_d
+#define __msa_fsub_w __builtin_msa_fsub_w
+#define __msa_fsub_d __builtin_msa_fsub_d
+#define __msa_fmul_w __builtin_msa_fmul_w
+#define __msa_fmul_d __builtin_msa_fmul_d
+#define __msa_fdiv_w __builtin_msa_fdiv_w
+#define __msa_fdiv_d __builtin_msa_fdiv_d
+#define __msa_fmadd_w __builtin_msa_fmadd_w
+#define __msa_fmadd_d __builtin_msa_fmadd_d
+#define __msa_fmsub_w __builtin_msa_fmsub_w
+#define __msa_fmsub_d __builtin_msa_fmsub_d
+#define __msa_fexp2_w __builtin_msa_fexp2_w
+#define __msa_fexp2_d __builtin_msa_fexp2_d
+#define __msa_fexdo_h __builtin_msa_fexdo_h
+#define __msa_fexdo_w __builtin_msa_fexdo_w
+#define __msa_ftq_h __builtin_msa_ftq_h
+#define __msa_ftq_w __builtin_msa_ftq_w
+#define __msa_fmin_w __builtin_msa_fmin_w
+#define __msa_fmin_d __builtin_msa_fmin_d
+#define __msa_fmin_a_w __builtin_msa_fmin_a_w
+#define __msa_fmin_a_d __builtin_msa_fmin_a_d
+#define __msa_fmax_w __builtin_msa_fmax_w
+#define __msa_fmax_d __builtin_msa_fmax_d
+#define __msa_fmax_a_w __builtin_msa_fmax_a_w
+#define __msa_fmax_a_d __builtin_msa_fmax_a_d
+#define __msa_mul_q_h __builtin_msa_mul_q_h
+#define __msa_mul_q_w __builtin_msa_mul_q_w
+#define __msa_mulr_q_h __builtin_msa_mulr_q_h
+#define __msa_mulr_q_w __builtin_msa_mulr_q_w
+#define __msa_madd_q_h __builtin_msa_madd_q_h
+#define __msa_madd_q_w __builtin_msa_madd_q_w
+#define __msa_maddr_q_h __builtin_msa_maddr_q_h
+#define __msa_maddr_q_w __builtin_msa_maddr_q_w
+#define __msa_msub_q_h __builtin_msa_msub_q_h
+#define __msa_msub_q_w __builtin_msa_msub_q_w
+#define __msa_msubr_q_h __builtin_msa_msubr_q_h
+#define __msa_msubr_q_w __builtin_msa_msubr_q_w
+#define __msa_fclass_w __builtin_msa_fclass_w
+#define __msa_fclass_d __builtin_msa_fclass_d
+#define __msa_fsqrt_w __builtin_msa_fsqrt_w
+#define __msa_fsqrt_d __builtin_msa_fsqrt_d
+#define __msa_frcp_w __builtin_msa_frcp_w
+#define __msa_frcp_d __builtin_msa_frcp_d
+#define __msa_frint_w __builtin_msa_frint_w
+#define __msa_frint_d __builtin_msa_frint_d
+#define __msa_frsqrt_w __builtin_msa_frsqrt_w
+#define __msa_frsqrt_d __builtin_msa_frsqrt_d
+#define __msa_flog2_w __builtin_msa_flog2_w
+#define __msa_flog2_d __builtin_msa_flog2_d
+#define __msa_fexupl_w __builtin_msa_fexupl_w
+#define __msa_fexupl_d __builtin_msa_fexupl_d
+#define __msa_fexupr_w __builtin_msa_fexupr_w
+#define __msa_fexupr_d __builtin_msa_fexupr_d
+#define __msa_ffql_w __builtin_msa_ffql_w
+#define __msa_ffql_d __builtin_msa_ffql_d
+#define __msa_ffqr_w __builtin_msa_ffqr_w
+#define __msa_ffqr_d __builtin_msa_ffqr_d
+#define __msa_ftint_s_w __builtin_msa_ftint_s_w
+#define __msa_ftint_s_d __builtin_msa_ftint_s_d
+#define __msa_ftint_u_w __builtin_msa_ftint_u_w
+#define __msa_ftint_u_d __builtin_msa_ftint_u_d
+#define __msa_ftrunc_s_w __builtin_msa_ftrunc_s_w
+#define __msa_ftrunc_s_d __builtin_msa_ftrunc_s_d
+#define __msa_ftrunc_u_w __builtin_msa_ftrunc_u_w
+#define __msa_ftrunc_u_d __builtin_msa_ftrunc_u_d
+#define __msa_ffint_s_w __builtin_msa_ffint_s_w
+#define __msa_ffint_s_d __builtin_msa_ffint_s_d
+#define __msa_ffint_u_w __builtin_msa_ffint_u_w
+#define __msa_ffint_u_d __builtin_msa_ffint_u_d
+#define __msa_cfcmsa __builtin_msa_cfcmsa
+#define __msa_move_v __builtin_msa_move_v
+#define __msa_cast_to_vector_float __builtin_msa_cast_to_vector_float
+#define __msa_cast_to_vector_double __builtin_msa_cast_to_vector_double
+#define __msa_cast_to_scalar_float __builtin_msa_cast_to_scalar_float
+#define __msa_cast_to_scalar_double __builtin_msa_cast_to_scalar_double
+#endif /* defined(__mips_msa) */
+#endif /* _MSA_H */
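Editor's note: a minimal usage sketch (not part of the header) showing the vector types and builtin aliases defined above, assuming a MIPS target compiled with clang and -mmsa:

    #include <msa.h>

    /* Broadcast scalars into v4i32 lanes, add element-wise, extract lane 0. */
    int sum_lane0(void) {
      v4i32 a = __msa_fill_w(3);      /* {3, 3, 3, 3} */
      v4i32 b = __msa_fill_w(4);      /* {4, 4, 4, 4} */
      v4i32 c = __msa_addv_w(a, b);   /* {7, 7, 7, 7} */
      return __msa_copy_s_w(c, 0);    /* lane index must be a constant; returns 7 */
    }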

+ 47 - 0
demo/include/mwaitxintrin.h

@@ -0,0 +1,47 @@
+/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MWAITXINTRIN_H
+#define _MWAITXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitorx((void *)__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
+{
+  __builtin_ia32_mwaitx(__extensions, __hints, __clock);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MWAITXINTRIN_H */
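Editor's note: a hedged sketch of the intended MONITORX/MWAITX pairing (assuming an AMD CPU with the mwaitx feature and compilation with -mmwaitx); the waiter arms a monitor on a flag and sleeps until the monitored line is written:

    #include <x86intrin.h>

    volatile int flag;                              /* set by another thread */

    void wait_for_flag(void) {
      while (!flag) {
        _mm_monitorx((const void *)&flag, 0, 0);    /* arm address monitor */
        if (flag)                                   /* avoid a lost-wakeup race */
          break;
        _mm_mwaitx(0, 0, 0);                        /* sleep until &flag is written */
      }
    }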

+ 30 - 0
demo/include/nmmintrin.h

@@ -0,0 +1,30 @@
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _NMMINTRIN_H
+#define _NMMINTRIN_H
+
+/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
+   just include it now then.  */
+#include <smmintrin.h>
+#endif /* _NMMINTRIN_H */
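Editor's note: a small illustration of what this indirection provides. SSE4.2 intrinsics such as the CRC-32C helpers (assumed here to be _mm_crc32_u32/_mm_crc32_u8 from smmintrin.h) become visible through either header; compile with -msse4.2:

    #include <nmmintrin.h>
    #include <string.h>

    /* Accumulate a CRC-32C checksum over a byte buffer, 4 bytes at a time. */
    unsigned int crc32c(const unsigned char *p, size_t n) {
      unsigned int crc = 0xFFFFFFFFu;
      while (n >= 4) {
        unsigned int w;
        memcpy(&w, p, 4);
        crc = _mm_crc32_u32(crc, w);
        p += 4; n -= 4;
      }
      while (n--)
        crc = _mm_crc32_u8(crc, *p++);
      return crc ^ 0xFFFFFFFFu;
    }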

File diff is too large to display
+ 16391 - 0
demo/include/opencl-c.h


+ 48 - 0
demo/include/pkuintrin.h

@@ -0,0 +1,48 @@
+/*===------------- pkuintrin.h - PKU intrinsics ------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __PKUINTRIN_H
+#define __PKUINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rdpkru_u32(void)
+{
+  return __builtin_ia32_rdpkru();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_wrpkru(unsigned int __val)
+{
+  return __builtin_ia32_wrpkru(__val);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

+ 304 - 0
demo/include/pmmintrin.h

@@ -0,0 +1,304 @@
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+
+/// \brief Loads data from an unaligned memory location to elements in a 128-bit
+///    vector.
+///
+///    If the address of the data is not 16-byte aligned, the instruction may
+///    read two adjacent aligned blocks of memory to retrieve the requested
+///    data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit integer vector containing integer values.
+/// \returns A 128-bit vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally adds the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal sums of the values are stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the lower
+///    bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source operands.
+///    The horizontal differences between the values are stored in the upper
+///    bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
+///    of [4 x float] to float values stored in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. \n
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
+}
+
+/// \brief Duplicates even-indexed values from a 128-bit vector of
+///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] \n
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination. \n
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+///    values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
+}
+
+/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+///    two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source operand.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums
+///    and differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally adds the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the lower bits of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal sum of the values is stored in the upper bits of the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+///    both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+///    vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the lower bits of
+///    the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the source operands.
+///    The horizontal difference of the values is stored in the upper bits of
+///    the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal
+///    differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_loaddup_pd(double const * dp);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param dp
+///    A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
+
+/// \brief Moves and duplicates the double-precision value in the lower bits of
+///    a 128-bit vector of [2 x double] to double-precision values stored in a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
+///    [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+///    duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+/// \brief Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+}
+
+/// \brief Used with the MONITOR instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which may vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which may vary by processor.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+  __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
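Editor's note: an illustrative use of the horizontal-add intrinsic documented above (compile with -msse3). _mm_cvtss_f32 is assumed from xmmintrin.h, which is pulled in transitively:

    #include <pmmintrin.h>

    /* Reduce a 128-bit vector of [4 x float] to a scalar sum with two
       horizontal adds: {a,b,c,d} -> {a+b, c+d, a+b, c+d} -> {a+b+c+d, ...}. */
    float sum4(__m128 v) {
      __m128 t = _mm_hadd_ps(v, v);
      t = _mm_hadd_ps(t, t);
      return _mm_cvtss_f32(t);
    }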

+ 98 - 0
demo/include/popcntintrin.h

@@ -0,0 +1,98 @@
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _POPCNTINTRIN_H
+#define _POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_popcnt_u32(unsigned int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ int __DEFAULT_FN_ATTRS
+_popcnt32(int __A)
+{
+  return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_popcnt_u64(unsigned long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+///    A signed 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+///    source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_popcnt64(long long __A)
+{
+  return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _POPCNTINTRIN_H */
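Editor's note: a trivial sketch of the population-count intrinsic above (compile with -mpopcnt):

    #include <popcntintrin.h>

    /* Count the bits set in a mask; e.g. returns 4 for 0xF0. */
    int set_bits_in(unsigned int mask) {
      return _mm_popcnt_u32(mask);
    }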

+ 71 - 0
demo/include/prfchwintrin.h

@@ -0,0 +1,71 @@
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+#if defined(__PRFCHW__) || defined(__3dNOW__)
+/// \brief Loads a memory sequence containing the specified memory address into
+///    all data cache levels. The cache-coherency state is set to exclusive.
+///    Data can be read from and written to the cache line without additional
+///    delay.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+/// \brief Loads a memory sequence containing the specified memory address into
+///    the L1 data cache and sets the cache-coherency to modified. This
+///    provides a hint to the processor that the cache line will be modified.
+///    It is intended for use when the cache line will be written to shortly
+///    after the prefetch is performed.
+///
+///    Note that the effect of this intrinsic is dependent on the processor
+///    implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHW instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+#endif
+
+#endif /* __PRFCHWINTRIN_H */
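Editor's note: a sketch of prefetch-for-write ahead of a store-heavy loop (assuming a CPU that reports PRFCHW and compilation with -mprfchw; the lookahead distance of 16 is an arbitrary illustration):

    #include <x86intrin.h>

    void scale_in_place(float *a, int n, float k) {
      for (int i = 0; i < n; ++i) {
        if (i + 16 < n)
          _m_prefetchw(&a[i + 16]);   /* hint: this element will be written soon */
        a[i] *= k;
      }
    }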

+ 56 - 0
demo/include/rdseedintrin.h

@@ -0,0 +1,56 @@
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+  return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+  return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
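Editor's note: the step functions return 1 on success and 0 when the entropy source is temporarily exhausted, so callers normally retry; a hedged sketch (compile with -mrdseed):

    #include <x86intrin.h>

    /* Returns 1 and stores a hardware-seeded value, or 0 after too many retries. */
    int get_seed32(unsigned int *out) {
      for (int tries = 0; tries < 10; ++tries) {
        if (_rdseed32_step(out))
          return 1;
      }
      return 0;
    }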

+ 59 - 0
demo/include/rtmintrin.h

@@ -0,0 +1,59 @@
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED   (~0u)
+#define _XABORT_EXPLICIT  (1 << 0)
+#define _XABORT_RETRY     (1 << 1)
+#define _XABORT_CONFLICT  (1 << 2)
+#define _XABORT_CAPACITY  (1 << 3)
+#define _XABORT_DEBUG     (1 << 4)
+#define _XABORT_NESTED    (1 << 5)
+#define _XABORT_CODE(x)   (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+  return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+  __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
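Editor's note: a minimal transactional-region sketch using the constants above (assuming TSX-capable hardware and -mrtm); a fallback path is mandatory because any transaction may abort, and take_lock/drop_lock are hypothetical placeholders for a real lock:

    #include <immintrin.h>

    extern void take_lock(void);
    extern void drop_lock(void);

    void increment(long *counter) {
      unsigned int status = _xbegin();
      if (status == _XBEGIN_STARTED) {
        ++*counter;                /* speculative update */
        _xend();                   /* commit the transaction */
      } else {
        take_lock();               /* abort path: fall back to a real lock */
        ++*counter;
        drop_lock();
      }
    }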

+ 39 - 0
demo/include/s390intrin.h

@@ -0,0 +1,39 @@
+/*===---- s390intrin.h - SystemZ intrinsics --------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __S390INTRIN_H
+#define __S390INTRIN_H
+
+#ifndef __s390__
+#error "<s390intrin.h> is for s390 only"
+#endif
+
+#ifdef __HTM__
+#include <htmintrin.h>
+#endif
+
+#ifdef __VEC__
+#include <vecintrin.h>
+#endif
+
+#endif /* __S390INTRIN_H*/

+ 90 - 0
demo/include/sanitizer/allocator_interface.h

@@ -0,0 +1,90 @@
+//===-- allocator_interface.h ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface header for allocator used in sanitizers (ASan/TSan/MSan).
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ALLOCATOR_INTERFACE_H
+#define SANITIZER_ALLOCATOR_INTERFACE_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Returns the estimated number of bytes that will be reserved by allocator
+     for request of "size" bytes. If allocator can't allocate that much
+     memory, returns the maximal possible allocation size, otherwise returns
+     "size". */
+  size_t __sanitizer_get_estimated_allocated_size(size_t size);
+
+  /* Returns true if p was returned by the allocator and
+     is not yet freed. */
+  int __sanitizer_get_ownership(const volatile void *p);
+
+  /* Returns the number of bytes reserved for the pointer p.
+     Requires (get_ownership(p) == true) or (p == 0). */
+  size_t __sanitizer_get_allocated_size(const volatile void *p);
+
+  /* Number of bytes, allocated and not yet freed by the application. */
+  size_t __sanitizer_get_current_allocated_bytes(void);
+
+  /* Number of bytes, mmaped by the allocator to fulfill allocation requests.
+     Generally, for request of X bytes, allocator can reserve and add to free
+     lists a large number of chunks of size X to use them for future requests.
+     All these chunks count toward the heap size. Currently, allocator never
+     releases memory to OS (instead, it just puts freed chunks to free
+     lists). */
+  size_t __sanitizer_get_heap_size(void);
+
+  /* Number of bytes, mmaped by the allocator, which can be used to fulfill
+     allocation requests. When a user program frees memory chunk, it can first
+     fall into quarantine and will count toward __sanitizer_get_free_bytes()
+     later. */
+  size_t __sanitizer_get_free_bytes(void);
+
+  /* Number of bytes in unmapped pages, that are released to OS. Currently,
+     always returns 0. */
+  size_t __sanitizer_get_unmapped_bytes(void);
+
+  /* Malloc hooks that may be optionally provided by user.
+     __sanitizer_malloc_hook(ptr, size) is called immediately after
+       allocation of "size" bytes, which returned "ptr".
+     __sanitizer_free_hook(ptr) is called immediately before
+       deallocation of "ptr". */
+  void __sanitizer_malloc_hook(const volatile void *ptr, size_t size);
+  void __sanitizer_free_hook(const volatile void *ptr);
+
+  /* Installs a pair of hooks for malloc/free.
+     Several (currently, 5) hook pairs may be installed, they are executed
+     in the order they were installed and after calling
+     __sanitizer_malloc_hook/__sanitizer_free_hook.
+     Unlike __sanitizer_malloc_hook/__sanitizer_free_hook these hooks can be
+     chained and do not rely on weak symbols working on the platform, but
+     require __sanitizer_install_malloc_and_free_hooks to be called at startup
+     and thus will not be called on malloc/free very early in the process.
+     Returns the number of hooks currently installed or 0 on failure.
+     Not thread-safe, should be called in the main thread before starting
+     other threads.
+  */
+  int __sanitizer_install_malloc_and_free_hooks(
+      void (*malloc_hook)(const volatile void *, size_t),
+      void (*free_hook)(const volatile void *));
+
+  /* Drains allocator quarantines (calling thread's and global ones), returns
+     freed memory back to OS and releases other non-essential internal allocator
+     resources in attempt to reduce process RSS.
+     Currently available with ASan only.
+  */
+  void __sanitizer_purge_allocator(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif

+ 155 - 0
demo/include/sanitizer/asan_interface.h

@@ -0,0 +1,155 @@
+//===-- sanitizer/asan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ASAN_INTERFACE_H
+#define SANITIZER_ASAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Marks memory region [addr, addr+size) as unaddressable.
+  // This memory must be previously allocated by the user program. Accessing
+  // addresses in this region from instrumented code is forbidden until
+  // this region is unpoisoned. This function is not guaranteed to poison
+  // the whole region - it may poison only a subregion of [addr, addr+size)
+  // to ASan alignment restrictions.
+  // Method is NOT thread-safe in the sense that no two threads can
+  // (un)poison memory in the same memory region simultaneously.
+  void __asan_poison_memory_region(void const volatile *addr, size_t size);
+  // Marks memory region [addr, addr+size) as addressable.
+  // This memory must be previously allocated by the user program. Accessing
+  // addresses in this region is allowed until this region is poisoned again.
+  // This function may unpoison a superregion of [addr, addr+size) due to
+  // ASan alignment restrictions.
+  // Method is NOT thread-safe in the sense that no two threads can
+  // (un)poison memory in the same memory region simultaneously.
+  void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+
+// User code should use macros instead of functions.
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  __asan_poison_memory_region((addr), (size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  __asan_unpoison_memory_region((addr), (size))
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) \
+  ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \
+  ((void)(addr), (void)(size))
+#endif
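A minimal sketch of how a custom region allocator might use these macros to keep its unused tail unaddressable; the pool layout and function names are illustrative, not part of the interface.

    #include <sanitizer/asan_interface.h>

    static char pool[1024];
    static size_t pool_used;

    /* Hand out n bytes; no overflow check in this sketch. */
    void *pool_alloc(size_t n) {
      void *p = pool + pool_used;
      pool_used += n;
      ASAN_UNPOISON_MEMORY_REGION(p, n);
      /* Keep the unused tail unaddressable. */
      ASAN_POISON_MEMORY_REGION(pool + pool_used, sizeof(pool) - pool_used);
      return p;
    }

    /* Reset the pool; everything is unaddressable until reallocated. */
    void pool_reset(void) {
      pool_used = 0;
      ASAN_POISON_MEMORY_REGION(pool, sizeof(pool));
    }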
+
+  // Returns 1 if addr is poisoned (i.e. 1-byte read/write access to this
+  // address will result in error report from AddressSanitizer).
+  // Otherwise returns 0.
+  int __asan_address_is_poisoned(void const volatile *addr);
+
+  // If at least one byte in [beg, beg+size) is poisoned, return the address
+  // of the first such byte. Otherwise return 0.
+  void *__asan_region_is_poisoned(void *beg, size_t size);
+
+  // Print the description of addr (useful when debugging in gdb).
+  void __asan_describe_address(void *addr);
+
+  // Useful for calling from a debugger to get information about an ASan error.
+  // Returns 1 if an error has been (or is being) reported, otherwise returns 0.
+  int __asan_report_present(void);
+
+  // Useful for calling from a debugger to get information about an ASan error.
+  // If an error has been (or is being) reported, the following functions return
+  // the pc, bp, sp, address, access type (0 = read, 1 = write), access size and
+  // bug description (e.g. "heap-use-after-free"). Otherwise they return 0.
+  void *__asan_get_report_pc(void);
+  void *__asan_get_report_bp(void);
+  void *__asan_get_report_sp(void);
+  void *__asan_get_report_address(void);
+  int __asan_get_report_access_type(void);
+  size_t __asan_get_report_access_size(void);
+  const char *__asan_get_report_description(void);
+
+  // Useful for calling from the debugger to get information about a pointer.
+  // Returns the category of the given pointer as a constant string.
+  // Possible return values are "global", "stack", "stack-fake", "heap",
+  // "heap-invalid", "shadow-low", "shadow-gap", "shadow-high", "unknown".
+  // If global or stack, tries to also return the variable name, address and
+  // size. If heap, tries to return the chunk address and size. 'name' should
+  // point to an allocated buffer of size 'name_size'.
+  const char *__asan_locate_address(void *addr, char *name, size_t name_size,
+                                    void **region_address, size_t *region_size);
+
+  // Useful for calling from the debugger to get the allocation stack trace
+  // and thread ID for a heap address. Stores up to 'size' frames into 'trace',
+  // returns the number of stored frames or 0 on error.
+  size_t __asan_get_alloc_stack(void *addr, void **trace, size_t size,
+                                int *thread_id);
+
+  // Useful for calling from the debugger to get the free stack trace
+  // and thread ID for a heap address. Stores up to 'size' frames into 'trace',
+  // returns the number of stored frames or 0 on error.
+  size_t __asan_get_free_stack(void *addr, void **trace, size_t size,
+                               int *thread_id);
+
+  // Useful for calling from the debugger to get the current shadow memory
+  // mapping.
+  void __asan_get_shadow_mapping(size_t *shadow_scale, size_t *shadow_offset);
+
+  // This is an internal function that is called to report an error.
+  // However, it is still part of the interface because users may want to
+  // set a breakpoint on this function in a debugger.
+  void __asan_report_error(void *pc, void *bp, void *sp,
+                           void *addr, int is_write, size_t access_size);
+
+  // Deprecated. Call __sanitizer_set_death_callback instead.
+  void __asan_set_death_callback(void (*callback)(void));
+
+  void __asan_set_error_report_callback(void (*callback)(const char*));
+
+  // The user may provide a function that will be called right when ASan
+  // detects an error. This can be used to notice cases when ASan detects an
+  // error but the program crashes before the ASan report is printed.
+  void __asan_on_error(void);
+
+  // Prints accumulated stats to stderr. Used for debugging.
+  void __asan_print_accumulated_stats(void);
+
+  // This function may be optionally provided by the user and should return
+  // a string containing ASan runtime options. See asan_flags.h for details.
+  const char* __asan_default_options(void);
+
+  // The following 2 functions facilitate garbage collection in presence of
+  // asan's fake stack.
+
+  // Returns an opaque handler to be used later in __asan_addr_is_in_fake_stack.
+  // Returns NULL if the current thread does not have a fake stack.
+  void *__asan_get_current_fake_stack(void);
+
+  // If fake_stack is non-NULL and addr belongs to a fake frame in
+  // fake_stack, returns the address on real stack that corresponds to
+  // the fake frame and sets beg/end to the boundaries of this fake frame.
+  // Otherwise returns NULL and does not touch beg/end.
+  // If beg/end are NULL, they are not touched.
+  // This function may be called from a thread other than the owner of
+  // fake_stack, but the owner thread needs to be alive.
+  void *__asan_addr_is_in_fake_stack(void *fake_stack, void *addr, void **beg,
+                                     void **end);
+
+  // Performs cleanup before a [[noreturn]] function.  Must be called
+  // before things like _exit and execl to avoid false positives on stack.
+  void __asan_handle_no_return(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_ASAN_INTERFACE_H

+ 198 - 0
demo/include/sanitizer/common_interface_defs.h

@@ -0,0 +1,198 @@
+//===-- sanitizer/common_interface_defs.h -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common part of the public sanitizer interface.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_COMMON_INTERFACE_DEFS_H
+#define SANITIZER_COMMON_INTERFACE_DEFS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+// GCC does not understand __has_feature.
+#if !defined(__has_feature)
+# define __has_feature(x) 0
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Arguments for __sanitizer_sandbox_on_notify() below.
+  typedef struct {
+    // Enable sandbox support in sanitizer coverage.
+    int coverage_sandboxed;
+    // File descriptor to write coverage data to. If -1 is passed, a file will
+    // be pre-opened by __sanitizer_sandbox_on_notify(). This field has no
+    // effect if coverage_sandboxed == 0.
+    intptr_t coverage_fd;
+    // If non-zero, split the coverage data into well-formed blocks. This is
+    // useful when coverage_fd is a socket descriptor. Each block will contain
+    // a header, allowing data from multiple processes to be sent over the same
+    // socket.
+    unsigned int coverage_max_block_size;
+  } __sanitizer_sandbox_arguments;
+
+  // Tell the tools to write their reports to "path.<pid>" instead of stderr.
+  void __sanitizer_set_report_path(const char *path);
+  // Tell the tools to write their reports to the provided file descriptor
+  // (cast to void *).
+  void __sanitizer_set_report_fd(void *fd);
+
+  // Notify the tools that the sandbox is going to be turned on. The reserved
+  // parameter will be used in the future to hold a structure with functions
+  // that the tools may call to bypass the sandbox.
+  void __sanitizer_sandbox_on_notify(__sanitizer_sandbox_arguments *args);
+
+  // This function is called by the tool when it has just finished reporting
+  // an error. 'error_summary' is a one-line string that summarizes
+  // the error message. This function can be overridden by the client.
+  void __sanitizer_report_error_summary(const char *error_summary);
+
+  // Some of the sanitizers (e.g. asan/tsan) may miss bugs that happen
+  // in unaligned loads/stores. To find such bugs reliably, one needs
+  // to replace plain unaligned loads/stores with these calls.
+  uint16_t __sanitizer_unaligned_load16(const void *p);
+  uint32_t __sanitizer_unaligned_load32(const void *p);
+  uint64_t __sanitizer_unaligned_load64(const void *p);
+  void __sanitizer_unaligned_store16(void *p, uint16_t x);
+  void __sanitizer_unaligned_store32(void *p, uint32_t x);
+  void __sanitizer_unaligned_store64(void *p, uint64_t x);
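A minimal sketch of routing a possibly-unaligned 32-bit access through these helpers instead of a raw pointer cast; the wrapper names are illustrative.

    #include <sanitizer/common_interface_defs.h>

    /* Read/write a u32 that may sit at any byte offset in a packed buffer. */
    uint32_t read_u32(const char *buf, size_t offset) {
      return __sanitizer_unaligned_load32(buf + offset);
    }

    void write_u32(char *buf, size_t offset, uint32_t value) {
      __sanitizer_unaligned_store32(buf + offset, value);
    }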
+
+  // Annotate the current state of a contiguous container, such as
+  // std::vector, std::string or similar.
+  // A contiguous container is a container that keeps all of its elements
+  // in a contiguous region of memory. The container owns the region of memory
+  // [beg, end); the memory [beg, mid) is used to store the current elements
+  // and the memory [mid, end) is reserved for future elements;
+  // beg <= mid <= end. For example, in "std::vector<> v"
+  //   beg = &v[0];
+  //   end = beg + v.capacity() * sizeof(v[0]);
+  //   mid = beg + v.size()     * sizeof(v[0]);
+  //
+  // This annotation tells the Sanitizer tool about the current state of the
+  // container so that the tool can report errors when memory from [mid, end)
+  // is accessed. Insert this annotation into methods like push_back/pop_back.
+  // Supply the old and the new values of mid (old_mid/new_mid).
+  // In the initial state mid == end and so should be the final
+  // state when the container is destroyed or when it reallocates the storage.
+  //
+  // Use with caution and don't use for anything other than vector-like classes.
+  //
+  // For AddressSanitizer, 'beg' should be 8-aligned and 'end' should
+  // be either 8-aligned or it should point to the end of a separate heap-,
+  // stack-, or global- allocated buffer. I.e. the following will not work:
+  //   int64_t x[2];  // 16 bytes, 8-aligned.
+  //   char *beg = (char *)&x[0];
+  //   char *end = beg + 12;  // Not 8 aligned, not the end of the buffer.
+  // This however will work fine:
+  //   int32_t x[3];  // 12 bytes, but 8-aligned under AddressSanitizer.
+  //   char *beg = (char*)&x[0];
+  //   char *end = beg + 12;  // Not 8-aligned, but is the end of the buffer.
+  void __sanitizer_annotate_contiguous_container(const void *beg,
+                                                 const void *end,
+                                                 const void *old_mid,
+                                                 const void *new_mid);
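A minimal sketch of annotating a vector-like byte buffer on append, assuming spare capacity is available; the struct and its fields are illustrative only.

    #include <sanitizer/common_interface_defs.h>

    struct byte_vec {
      char *beg;       /* start of owned storage             */
      size_t size;     /* bytes currently in use (mid)       */
      size_t capacity; /* bytes owned (beg + capacity = end) */
    };

    /* Append one byte; assumes v->size < v->capacity (no growth here). */
    static void byte_vec_push(struct byte_vec *v, char c) {
      const char *end = v->beg + v->capacity;
      const char *old_mid = v->beg + v->size;
      /* Unpoison the new element before writing into it. */
      __sanitizer_annotate_contiguous_container(v->beg, end, old_mid,
                                                old_mid + 1);
      v->beg[v->size++] = c;
    }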
+  // Returns true if the contiguous container [beg, end) is properly poisoned
+  // (e.g. with __sanitizer_annotate_contiguous_container), i.e. if
+  //  - [beg, mid) is addressable,
+  //  - [mid, end) is unaddressable.
+  // Full verification requires O(end-beg) time; this function tries to avoid
+  // such complexity by touching only parts of the container around beg/mid/end.
+  int __sanitizer_verify_contiguous_container(const void *beg, const void *mid,
+                                              const void *end);
+
+  // Similar to __sanitizer_verify_contiguous_container, but returns the
+  // address of the first improperly poisoned byte. Returns null if the area
+  // is poisoned properly.
+  const void *__sanitizer_contiguous_container_find_bad_address(
+      const void *beg, const void *mid, const void *end);
+
+  // Print the stack trace leading to this call. Useful for debugging user code.
+  void __sanitizer_print_stack_trace(void);
+
+  // Symbolizes the supplied 'pc' using the format string 'fmt'.
+  // Outputs at most 'out_buf_size' bytes into 'out_buf'.
+  // The format syntax is described in
+  // lib/sanitizer_common/sanitizer_stacktrace_printer.h.
+  void __sanitizer_symbolize_pc(void *pc, const char *fmt, char *out_buf,
+                                size_t out_buf_size);
+  // Same as __sanitizer_symbolize_pc, but for data section (i.e. globals).
+  void __sanitizer_symbolize_global(void *data_ptr, const char *fmt,
+                                    char *out_buf, size_t out_buf_size);
+
+  // Sets the callback to be called right before death on error.
+  // Passing 0 will unset the callback.
+  void __sanitizer_set_death_callback(void (*callback)(void));
+
+  // Interceptor hooks.
+  // Whenever a libc function interceptor is called, it checks whether the
+  // corresponding weak hook is defined and, if so, calls it.
+  // The primary use case is data-flow-guided fuzzing, where the fuzzer needs
+  // to know what is being passed to libc functions, e.g. memcmp.
+  // FIXME: implement more hooks.
+  void __sanitizer_weak_hook_memcmp(void *called_pc, const void *s1,
+                                    const void *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strncmp(void *called_pc, const char *s1,
+                                    const char *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1,
+                                         const char *s2, size_t n, int result);
+  void __sanitizer_weak_hook_strcmp(void *called_pc, const char *s1,
+                                    const char *s2, int result);
+  void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1,
+                                        const char *s2, int result);
+  void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1,
+                                    const char *s2, char *result);
+  void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1,
+                                        const char *s2, char *result);
+  void __sanitizer_weak_hook_memmem(void *called_pc,
+                                    const void *s1, size_t len1,
+                                    const void *s2, size_t len2, void *result);
+
+  // Prints stack traces for all live heap allocations ordered by total
+  // allocation size until `top_percent` of total live heap is shown.
+  // `top_percent` should be between 1 and 100.
+  // At most `max_number_of_contexts` contexts (stack traces) are printed.
+  // Experimental feature currently available only with asan on Linux/x86_64.
+  void __sanitizer_print_memory_profile(size_t top_percent,
+                                        size_t max_number_of_contexts);
+
+  // Fiber annotation interface.
+  // Before switching to a different stack, one must call
+  // __sanitizer_start_switch_fiber with a pointer to the bottom of the
+  // destination stack and its size. When code starts running on the new stack,
+  // it must call __sanitizer_finish_switch_fiber to finalize the switch.
+  // The start_switch function takes a void** to store the current fake stack if
+  // there is one (it is needed when detect_stack_use_after_return is enabled).
+  // When restoring a stack, this pointer must be given to the finish_switch
+  // function. In most cases, this void* can be stored on the stack just before
+  // switching. When leaving a fiber permanently, null must be passed as the
+  // first argument to the start_switch function so that the fake stack is
+  // destroyed.
+  // If you do not want support for stack use-after-return detection, you can
+  // always pass null to these two functions.
+  // Note that the fake stack mechanism is disabled during fiber switch, so if a
+  // signal callback runs during the switch, it will not benefit from the stack
+  // use-after-return detection.
+  void __sanitizer_start_switch_fiber(void **fake_stack_save,
+                                      const void *bottom, size_t size);
+  void __sanitizer_finish_switch_fiber(void *fake_stack_save,
+                                       const void **bottom_old,
+                                       size_t *size_old);
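A minimal sketch of annotating a ucontext-based fiber switch; error handling is omitted and the stack size is arbitrary.

    #include <sanitizer/common_interface_defs.h>
    #include <ucontext.h>

    static ucontext_t main_ctx, fiber_ctx;
    static char fiber_stack[64 * 1024];
    static void *main_fake_stack;

    static void fiber_entry(void) {
      const void *bottom_old;
      size_t size_old;
      /* First thing on the new stack: finalize the switch. */
      __sanitizer_finish_switch_fiber(NULL, &bottom_old, &size_old);
      /* ... fiber work ... */
      /* Leave this fiber permanently: NULL destroys its fake stack. */
      __sanitizer_start_switch_fiber(NULL, bottom_old, size_old);
      swapcontext(&fiber_ctx, &main_ctx);
    }

    int main(void) {
      getcontext(&fiber_ctx);
      fiber_ctx.uc_stack.ss_sp = fiber_stack;
      fiber_ctx.uc_stack.ss_size = sizeof(fiber_stack);
      fiber_ctx.uc_link = &main_ctx;
      makecontext(&fiber_ctx, fiber_entry, 0);

      /* Announce the destination stack before switching to it. */
      __sanitizer_start_switch_fiber(&main_fake_stack, fiber_stack,
                                     sizeof(fiber_stack));
      swapcontext(&main_ctx, &fiber_ctx);
      /* Back on the original stack: finish the switch. */
      __sanitizer_finish_switch_fiber(main_fake_stack, NULL, NULL);
      return 0;
    }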
+
+  // Get full module name and calculate pc offset within it.
+  // Returns 1 if pc belongs to some module, 0 if module was not found.
+  int __sanitizer_get_module_and_offset_for_pc(void *pc, char *module_path,
+                                               size_t module_path_len,
+                                               void **pc_offset);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_COMMON_INTERFACE_DEFS_H

+ 36 - 0
demo/include/sanitizer/coverage_interface.h

@@ -0,0 +1,36 @@
+//===-- sanitizer/coverage_interface.h --------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Public interface for sanitizer coverage.
+//===----------------------------------------------------------------------===//
+
+#ifndef SANITIZER_COVERAGE_INTERFACE_H
+#define SANITIZER_COVERAGE_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  // Record and dump coverage info.
+  void __sanitizer_cov_dump(void);
+
+  // Clear collected coverage info.
+  void __sanitizer_cov_reset(void);
+
+  // Dump collected coverage info. Sorts pcs by module into individual .sancov
+  // files.
+  void __sanitizer_dump_coverage(const uintptr_t *pcs, uintptr_t len);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_COVERAGE_INTERFACE_H

+ 116 - 0
demo/include/sanitizer/dfsan_interface.h

@@ -0,0 +1,116 @@
+//===-- dfsan_interface.h -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of DataFlowSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef DFSAN_INTERFACE_H
+#define DFSAN_INTERFACE_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t dfsan_label;
+
+/// Stores information associated with a specific label identifier.  A label
+/// may be a base label created using dfsan_create_label, with associated
+/// text description and user data, or an automatically created union label,
+/// which represents the union of two label identifiers (which may themselves
+/// be base or union labels).
+struct dfsan_label_info {
+  // Fields for union labels, set to 0 for base labels.
+  dfsan_label l1;
+  dfsan_label l2;
+
+  // Fields for base labels.
+  const char *desc;
+  void *userdata;
+};
+
+/// Signature of the callback argument to dfsan_set_write_callback().
+typedef void (*dfsan_write_callback_t)(int fd, const void *buf, size_t count);
+
+/// Computes the union of \c l1 and \c l2, possibly creating a union label in
+/// the process.
+dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2);
+
+/// Creates and returns a base label with the given description and user data.
+dfsan_label dfsan_create_label(const char *desc, void *userdata);
+
+/// Sets the label for each address in [addr,addr+size) to \c label.
+void dfsan_set_label(dfsan_label label, void *addr, size_t size);
+
+/// Sets the label for each address in [addr,addr+size) to the union of the
+/// current label for that address and \c label.
+void dfsan_add_label(dfsan_label label, void *addr, size_t size);
+
+/// Retrieves the label associated with the given data.
+///
+/// The type of 'data' is arbitrary.  The function accepts a value of any type,
+/// which can be truncated or extended (implicitly or explicitly) as necessary.
+/// The truncation/extension operations will preserve the label of the original
+/// value.
+dfsan_label dfsan_get_label(long data);
+
+/// Retrieves the label associated with the data at the given address.
+dfsan_label dfsan_read_label(const void *addr, size_t size);
+
+/// Retrieves a pointer to the dfsan_label_info struct for the given label.
+const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label);
+
+/// Returns whether the given label \c label contains the label \c elem.
+int dfsan_has_label(dfsan_label label, dfsan_label elem);
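A minimal sketch of tagging an input and checking that the label propagates through a computation; this only does anything when the program is built with -fsanitize=dataflow.

    #include <sanitizer/dfsan_interface.h>
    #include <assert.h>

    int main(void) {
      int x = 1, y = 2;
      dfsan_label lx = dfsan_create_label("x", NULL);
      dfsan_set_label(lx, &x, sizeof(x));
      int z = x + y;  /* the label propagates through the addition */
      assert(dfsan_has_label(dfsan_get_label(z), lx));
      return 0;
    }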
+
+/// If the given label \c label contains a label with the description \c desc,
+/// returns that label; otherwise returns 0.
+dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc);
+
+/// Returns the number of labels allocated.
+size_t dfsan_get_label_count(void);
+
+/// Sets a callback to be invoked on calls to write().  The callback is invoked
+/// before the write is done.  The write is not guaranteed to succeed when the
+/// callback executes.  Pass in NULL to remove any callback.
+void dfsan_set_write_callback(dfsan_write_callback_t labeled_write_callback);
+
+/// Writes the labels currently used by the program to the given file
+/// descriptor. The lines of the output have the following format:
+///
+/// <label> <parent label 1> <parent label 2> <label description if any>
+void dfsan_dump_labels(int fd);
+
+/// Interceptor hooks.
+/// Whenever one of dfsan's custom functions is called, the corresponding
+/// hook is called if it is non-zero. The hooks should be defined by the user.
+/// The primary use case is taint-guided fuzzing, where the fuzzer
+/// needs to see the parameters of the function and the labels.
+/// FIXME: implement more hooks.
+void dfsan_weak_hook_memcmp(void *caller_pc, const void *s1, const void *s2,
+                            size_t n, dfsan_label s1_label,
+                            dfsan_label s2_label, dfsan_label n_label);
+void dfsan_weak_hook_strncmp(void *caller_pc, const char *s1, const char *s2,
+                             size_t n, dfsan_label s1_label,
+                             dfsan_label s2_label, dfsan_label n_label);
+#ifdef __cplusplus
+}  // extern "C"
+
+template <typename T>
+void dfsan_set_label(dfsan_label label, T &data) {  // NOLINT
+  dfsan_set_label(label, (void *)&data, sizeof(T));
+}
+
+#endif
+
+#endif  // DFSAN_INTERFACE_H

+ 50 - 0
demo/include/sanitizer/esan_interface.h

@@ -0,0 +1,50 @@
+//===-- sanitizer/esan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of EfficiencySanitizer, a family of performance tuners.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_ESAN_INTERFACE_H
+#define SANITIZER_ESAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+// We declare our interface routines as weak to allow the user to avoid
+// ifdefs and instead use this pattern, which allows building the same
+// sources with and without our runtime library:
+//     if (__esan_report)
+//       __esan_report();
+#ifdef _MSC_VER
+/* selectany is as close to weak as we'll get. */
+#define COMPILER_RT_WEAK __declspec(selectany)
+#elif __GNUC__
+#define COMPILER_RT_WEAK __attribute__((weak))
+#else
+#define COMPILER_RT_WEAK
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// This function can be called mid-run (or at the end of a run for
+// a server process that doesn't shut down normally) to request that
+// data for that point in the run be reported from the tool.
+void COMPILER_RT_WEAK __esan_report(void);
+
+// This function returns the number of samples that the esan tool has collected
+// to this point.  This is useful for testing.
+unsigned int COMPILER_RT_WEAK __esan_get_sample_count(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // SANITIZER_ESAN_INTERFACE_H

+ 33 - 0
demo/include/sanitizer/hwasan_interface.h

@@ -0,0 +1,33 @@
+//===-- sanitizer/hwasan_interface.h ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of HWAddressSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_HWASAN_INTERFACE_H
+#define SANITIZER_HWASAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // This function may be optionally provided by the user and should return
+  // a string containing HWASan runtime options. See asan_flags.h for details.
+  const char* __hwasan_default_options(void);
+
+  void __hwasan_enable_allocator_tagging(void);
+  void __hwasan_disable_allocator_tagging(void);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_HWASAN_INTERFACE_H

File diff is too large to display
+ 3083 - 0
demo/include/sanitizer/linux_syscall_hooks.h


+ 90 - 0
demo/include/sanitizer/lsan_interface.h

@@ -0,0 +1,90 @@
+//===-- sanitizer/lsan_interface.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of LeakSanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_LSAN_INTERFACE_H
+#define SANITIZER_LSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // Allocations made between calls to __lsan_disable() and __lsan_enable() will
+  // be treated as non-leaks. Disable/enable pairs may be nested.
+  void __lsan_disable(void);
+  void __lsan_enable(void);
+
+  // The heap object into which p points will be treated as a non-leak.
+  void __lsan_ignore_object(const void *p);
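A minimal sketch of excluding an intentionally never-freed singleton from the leak report; the helper name is illustrative.

    #include <sanitizer/lsan_interface.h>
    #include <stdlib.h>

    static void *make_singleton(void) {
      void *p = malloc(128);    /* never freed on purpose */
      __lsan_ignore_object(p);  /* do not report it as a leak */
      return p;
    }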
+
+  // Memory regions registered through this interface will be treated as sources
+  // of live pointers during leak checking. Useful if you store pointers in
+  // mapped memory.
+  // Points of note:
+  // - __lsan_unregister_root_region() must be called with the same pointer and
+  // size that have earlier been passed to __lsan_register_root_region()
+  // - LSan will skip any inaccessible memory when scanning a root region. E.g.,
+  // if you map memory within a larger region that you have mprotect'ed, you can
+  // register the entire large region.
+  // - the implementation is not optimized for performance. This interface is
+  // intended to be used for a small number of relatively static regions.
+  void __lsan_register_root_region(const void *p, size_t size);
+  void __lsan_unregister_root_region(const void *p, size_t size);
+
+  // Check for leaks now. This function behaves identically to the default
+  // end-of-process leak check. In particular, it will terminate the process if
+  // leaks are found and the exitcode runtime flag is non-zero.
+  // Subsequent calls to this function will have no effect and end-of-process
+  // leak check will not run. Effectively, end-of-process leak check is moved to
+  // the time of first invocation of this function.
+  // By calling this function early during process shutdown, you can instruct
+  // LSan to ignore shutdown-only leaks which happen later on.
+  void __lsan_do_leak_check(void);
+
+  // Check for leaks now. Returns zero if no leaks have been found or if leak
+  // detection is disabled, non-zero otherwise.
+  // This function may be called repeatedly, e.g. to periodically check a
+  // long-running process. It prints a leak report if appropriate, but does not
+  // terminate the process. It does not affect the behavior of
+  // __lsan_do_leak_check() or the end-of-process leak check, and is not
+  // affected by them.
+  int __lsan_do_recoverable_leak_check(void);
+
+  // The user may optionally provide this function to disallow leak checking
+  // for the program it is linked into (if the return value is non-zero). This
+  // function must be defined as returning a constant value; any behavior beyond
+  // that is unsupported.
+  // To avoid dead stripping, you may need to define this function with
+  // __attribute__((used))
+  int __lsan_is_turned_off(void);
+
+  // This function may be optionally provided by user and should return
+  // a string containing LSan runtime options. See lsan_flags.inc for details.
+  const char *__lsan_default_options(void);
+
+  // This function may be optionally provided by the user and should return
+  // a string containing LSan suppressions.
+  const char *__lsan_default_suppressions(void);
+#ifdef __cplusplus
+}  // extern "C"
+
+namespace __lsan {
+class ScopedDisabler {
+ public:
+  ScopedDisabler() { __lsan_disable(); }
+  ~ScopedDisabler() { __lsan_enable(); }
+};
+}  // namespace __lsan
+#endif
+
+#endif  // SANITIZER_LSAN_INTERFACE_H

+ 111 - 0
demo/include/sanitizer/msan_interface.h

@@ -0,0 +1,111 @@
+//===-- msan_interface.h --------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of MemorySanitizer.
+//
+// Public interface header.
+//===----------------------------------------------------------------------===//
+#ifndef MSAN_INTERFACE_H
+#define MSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /* Set raw origin for the memory range. */
+  void __msan_set_origin(const volatile void *a, size_t size, uint32_t origin);
+
+  /* Get raw origin for an address. */
+  uint32_t __msan_get_origin(const volatile void *a);
+
+  /* Test that this_id is a descendant of prev_id (or they are simply equal).
+   * "descendant" here means they are part of the same chain, created with
+   * __msan_chain_origin. */
+  int __msan_origin_is_descendant_or_same(uint32_t this_id, uint32_t prev_id);
+
+  /* Returns non-zero if tracking origins. */
+  int __msan_get_track_origins(void);
+
+  /* Returns the origin id of the latest UMR in the calling thread. */
+  uint32_t __msan_get_umr_origin(void);
+
+  /* Make memory region fully initialized (without changing its contents). */
+  void __msan_unpoison(const volatile void *a, size_t size);
+
+  /* Make a null-terminated string fully initialized (without changing its
+     contents). */
+  void __msan_unpoison_string(const volatile char *a);
+
+  /* Make memory region fully uninitialized (without changing its contents).
+     This is a legacy interface that does not update origin information. Use
+     __msan_allocated_memory() instead. */
+  void __msan_poison(const volatile void *a, size_t size);
+
+  /* Make memory region partially uninitialized (without changing its contents).
+   */
+  void __msan_partial_poison(const volatile void *data, void *shadow,
+                             size_t size);
+
+  /* Returns the offset of the first (at least partially) poisoned byte in the
+     memory range, or -1 if the whole range is good. */
+  intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+
+  /* Checks that memory range is fully initialized, and reports an error if it
+   * is not. */
+  void __msan_check_mem_is_initialized(const volatile void *x, size_t size);
+
+  /* For testing:
+     __msan_set_expect_umr(1);
+     ... some buggy code ...
+     __msan_set_expect_umr(0);
+     The last line will verify that a UMR happened. */
+  void __msan_set_expect_umr(int expect_umr);
+
+  /* Change the value of the keep_going flag. A non-zero value means don't
+     terminate program execution when an error is detected. This will not
+     affect errors in modules that were compiled without the corresponding
+     compiler flag. */
+  void __msan_set_keep_going(int keep_going);
+
+  /* Print shadow and origin for the memory range to stderr in a human-readable
+     format. */
+  void __msan_print_shadow(const volatile void *x, size_t size);
+
+  /* Print shadow for the memory range to stderr in a minimalistic
+     human-readable format. */
+  void __msan_dump_shadow(const volatile void *x, size_t size);
+
+  /* Returns true if running under a dynamic tool (DynamoRio-based). */
+  int  __msan_has_dynamic_component(void);
+
+  /* Tell MSan about newly allocated memory (e.g. from a custom allocator).
+     Memory will be marked uninitialized, with origin at the call site. */
+  void __msan_allocated_memory(const volatile void* data, size_t size);
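A minimal sketch of a bump allocator telling MSan that freshly handed-out memory is uninitialized; the arena and function names are illustrative.

    #include <sanitizer/msan_interface.h>

    static char arena[4096];
    static size_t arena_used;

    /* Hand out n bytes; no overflow check in this sketch. */
    void *arena_alloc(size_t n) {
      void *p = arena + arena_used;
      arena_used += n;
      /* Reads from *p before a write should now be reported as UMRs. */
      __msan_allocated_memory(p, n);
      return p;
    }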
+
+  /* Tell MSan about newly destroyed memory. Mark memory as uninitialized. */
+  void __sanitizer_dtor_callback(const volatile void* data, size_t size);
+
+  /* This function may be optionally provided by the user and should return
+     a string containing MSan runtime options. See msan_flags.h for details. */
+  const char* __msan_default_options(void);
+
+  /* Deprecated. Call __sanitizer_set_death_callback instead. */
+  void __msan_set_death_callback(void (*callback)(void));
+
+  /* Update shadow for the application copy of size bytes from src to dst.
+     Src and dst are application addresses. This function does not copy the
+     actual application memory; it only updates shadow and origin for such a
+     copy. Source and destination regions can overlap. */
+  void __msan_copy_shadow(const volatile void *dst, const volatile void *src,
+                          size_t size);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif

+ 34 - 0
demo/include/sanitizer/scudo_interface.h

@@ -0,0 +1,34 @@
+//===-- sanitizer/scudo_interface.h -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Public Scudo interface header.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_SCUDO_INTERFACE_H_
+#define SANITIZER_SCUDO_INTERFACE_H_
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  // This function may be optionally provided by a user and should return
+  // a string containing Scudo runtime options. See scudo_flags.h for details.
+  const char* __scudo_default_options(void);
+
+  // This function allows setting the RSS limit at runtime. This can be either
+  // the hard limit (HardLimit=1) or the soft limit (HardLimit=0). The limit
+  // can be removed by setting LimitMb to 0. This function's parameters should
+  // be fully trusted to avoid security mishaps.
+  void __scudo_set_rss_limit(unsigned long LimitMb, int HardLimit);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_SCUDO_INTERFACE_H_

+ 144 - 0
demo/include/sanitizer/tsan_interface.h

@@ -0,0 +1,144 @@
+//===-- tsan_interface.h ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Public interface header for TSan.
+//===----------------------------------------------------------------------===//
+#ifndef SANITIZER_TSAN_INTERFACE_H
+#define SANITIZER_TSAN_INTERFACE_H
+
+#include <sanitizer/common_interface_defs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// __tsan_acquire establishes a happens-before relation with a preceding
+// __tsan_release on the same address.
+void __tsan_acquire(void *addr);
+void __tsan_release(void *addr);
+
+// Annotations for custom mutexes.
+// The annotations allow TSan to produce better reports (with sets of locked
+// mutexes), detect more types of bugs (e.g. mutex misuses, races between
+// lock/unlock and destruction, and potential deadlocks), and improve
+// precision and performance
+// (by ignoring individual atomic operations in mutex code). However, the
+// downside is that annotated mutex code itself is not checked for correctness.
+
+// Mutex creation flags are passed to __tsan_mutex_create annotation.
+// If the mutex has no constructor and __tsan_mutex_create is not called,
+// the flags may be passed to __tsan_mutex_pre_lock/__tsan_mutex_post_lock
+// annotations.
+
+// The mutex has static storage duration and a no-op constructor and
+// destructor. This effectively makes TSan ignore the destroy annotation.
+const unsigned __tsan_mutex_linker_init      = 1 << 0;
+// Mutex is write reentrant.
+const unsigned __tsan_mutex_write_reentrant  = 1 << 1;
+// Mutex is read reentrant.
+const unsigned __tsan_mutex_read_reentrant   = 1 << 2;
+// Mutex does not have static storage duration, and must not be used after
+// its destructor runs.  The opposite of __tsan_mutex_linker_init.
+// If this flag is passed to __tsan_mutex_destroy, then the destruction
+// is ignored unless this flag was previously set on the mutex.
+const unsigned __tsan_mutex_not_static       = 1 << 8;
+
+// Mutex operation flags:
+
+// Denotes read lock operation.
+const unsigned __tsan_mutex_read_lock        = 1 << 3;
+// Denotes try lock operation.
+const unsigned __tsan_mutex_try_lock         = 1 << 4;
+// Denotes that a try lock operation has failed to acquire the mutex.
+const unsigned __tsan_mutex_try_lock_failed  = 1 << 5;
+// Denotes that the lock operation acquires multiple recursion levels.
+// The number of levels is passed in the recursion parameter.
+// This is useful for annotation of e.g. Java builtin monitors,
+// for which wait operation releases all recursive acquisitions of the mutex.
+const unsigned __tsan_mutex_recursive_lock   = 1 << 6;
+// Denotes that the unlock operation releases all recursion levels.
+// The number of released levels is returned and must later be passed to
+// the corresponding __tsan_mutex_post_lock annotation.
+const unsigned __tsan_mutex_recursive_unlock = 1 << 7;
+
+// Annotate creation of a mutex.
+// Supported flags: mutex creation flags.
+void __tsan_mutex_create(void *addr, unsigned flags);
+
+// Annotate destruction of a mutex.
+// Supported flags:
+//   - __tsan_mutex_linker_init
+//   - __tsan_mutex_not_static
+void __tsan_mutex_destroy(void *addr, unsigned flags);
+
+// Annotate start of lock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock
+//   - __tsan_mutex_try_lock
+//   - all mutex creation flags
+void __tsan_mutex_pre_lock(void *addr, unsigned flags);
+
+// Annotate end of lock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock (must match __tsan_mutex_pre_lock)
+//   - __tsan_mutex_try_lock (must match __tsan_mutex_pre_lock)
+//   - __tsan_mutex_try_lock_failed
+//   - __tsan_mutex_recursive_lock
+//   - all mutex creation flags
+void __tsan_mutex_post_lock(void *addr, unsigned flags, int recursion);
+
+// Annotate start of unlock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock
+//   - __tsan_mutex_recursive_unlock
+int __tsan_mutex_pre_unlock(void *addr, unsigned flags);
+
+// Annotate end of unlock operation.
+// Supported flags:
+//   - __tsan_mutex_read_lock (must match __tsan_mutex_pre_unlock)
+void __tsan_mutex_post_unlock(void *addr, unsigned flags);
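A minimal sketch of annotating a hand-rolled spinlock (built on GCC/Clang __atomic builtins) so TSan treats it as a mutex; the spinlock type and function names are illustrative.

    #include <sanitizer/tsan_interface.h>

    typedef struct { volatile int held; } spinlock_t;

    void spin_init(spinlock_t *l) {
      l->held = 0;
      __tsan_mutex_create(l, 0);
    }

    void spin_lock(spinlock_t *l) {
      __tsan_mutex_pre_lock(l, 0);
      while (__atomic_exchange_n(&l->held, 1, __ATOMIC_ACQUIRE))
        ;  /* spin */
      __tsan_mutex_post_lock(l, 0, 0);
    }

    void spin_unlock(spinlock_t *l) {
      __tsan_mutex_pre_unlock(l, 0);
      __atomic_store_n(&l->held, 0, __ATOMIC_RELEASE);
      __tsan_mutex_post_unlock(l, 0);
    }

    void spin_destroy(spinlock_t *l) {
      __tsan_mutex_destroy(l, 0);
    }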
+
+// Annotate start/end of notify/signal/broadcast operation.
+// Supported flags: none.
+void __tsan_mutex_pre_signal(void *addr, unsigned flags);
+void __tsan_mutex_post_signal(void *addr, unsigned flags);
+
+// Annotate start/end of a region of code where a lock/unlock/signal operation
+// diverts to do something else unrelated to the mutex. This can be used to
+// annotate, for example, calls into cooperative scheduler or contention
+// profiling code.
+// These annotations must be called only from within
+// __tsan_mutex_pre/post_lock, __tsan_mutex_pre/post_unlock,
+// __tsan_mutex_pre/post_signal regions.
+// Supported flags: none.
+void __tsan_mutex_pre_divert(void *addr, unsigned flags);
+void __tsan_mutex_post_divert(void *addr, unsigned flags);
+
+// External race detection API.
+// Can be used by non-instrumented libraries to detect when their objects are
+// being used in an unsafe manner.
+//   - __tsan_external_read/__tsan_external_write annotates the logical reads
+//       and writes of the object at the specified address. 'caller_pc' should
+//       be the PC of the library user, which the library can obtain with e.g.
+//       `__builtin_return_address(0)`.
+//   - __tsan_external_register_tag registers a 'tag' with the specified name,
+//       which is later used in read/write annotations to denote the object type
+//   - __tsan_external_assign_tag can optionally mark a heap object with a tag
+void *__tsan_external_register_tag(const char *object_type);
+void __tsan_external_register_header(void *tag, const char *header);
+void __tsan_external_assign_tag(void *addr, void *tag);
+void __tsan_external_read(void *addr, void *caller_pc, void *tag);
+void __tsan_external_write(void *addr, void *caller_pc, void *tag);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // SANITIZER_TSAN_INTERFACE_H

+ 222 - 0
demo/include/sanitizer/tsan_interface_atomic.h

@@ -0,0 +1,222 @@
+//===-- tsan_interface_atomic.h ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+// Public interface header for TSan atomics.
+//===----------------------------------------------------------------------===//
+#ifndef TSAN_INTERFACE_ATOMIC_H
+#define TSAN_INTERFACE_ATOMIC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef char     __tsan_atomic8;
+typedef short    __tsan_atomic16;  // NOLINT
+typedef int      __tsan_atomic32;
+typedef long     __tsan_atomic64;  // NOLINT
+#if defined(__SIZEOF_INT128__) \
+    || (__clang_major__ * 100 + __clang_minor__ >= 302)
+__extension__ typedef __int128 __tsan_atomic128;
+# define __TSAN_HAS_INT128 1
+#else
+# define __TSAN_HAS_INT128 0
+#endif
+
+// Part of ABI, do not change.
+// http://llvm.org/viewvc/llvm-project/libcxx/trunk/include/atomic?view=markup
+typedef enum {
+  __tsan_memory_order_relaxed,
+  __tsan_memory_order_consume,
+  __tsan_memory_order_acquire,
+  __tsan_memory_order_release,
+  __tsan_memory_order_acq_rel,
+  __tsan_memory_order_seq_cst
+} __tsan_memory_order;
+
+__tsan_atomic8 __tsan_atomic8_load(const volatile __tsan_atomic8 *a,
+    __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_load(const volatile __tsan_atomic16 *a,
+    __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_load(const volatile __tsan_atomic32 *a,
+    __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_load(const volatile __tsan_atomic64 *a,
+    __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_load(const volatile __tsan_atomic128 *a,
+    __tsan_memory_order mo);
+#endif
+
+void __tsan_atomic8_store(volatile __tsan_atomic8 *a, __tsan_atomic8 v,
+    __tsan_memory_order mo);
+void __tsan_atomic16_store(volatile __tsan_atomic16 *a, __tsan_atomic16 v,
+    __tsan_memory_order mo);
+void __tsan_atomic32_store(volatile __tsan_atomic32 *a, __tsan_atomic32 v,
+    __tsan_memory_order mo);
+void __tsan_atomic64_store(volatile __tsan_atomic64 *a, __tsan_atomic64 v,
+    __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+void __tsan_atomic128_store(volatile __tsan_atomic128 *a, __tsan_atomic128 v,
+    __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_exchange(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_exchange(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_exchange(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_exchange(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_exchange(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_add(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_add(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_add(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_add(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_add(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_sub(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_sub(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_sub(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_sub(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_sub(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_and(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_and(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_and(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_and(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_and(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_or(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_or(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_or(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_or(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_or(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_xor(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_xor(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_xor(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_xor(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_xor(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_fetch_nand(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 v, __tsan_memory_order mo);
+__tsan_atomic16 __tsan_atomic16_fetch_nand(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 v, __tsan_memory_order mo);
+__tsan_atomic32 __tsan_atomic32_fetch_nand(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 v, __tsan_memory_order mo);
+__tsan_atomic64 __tsan_atomic64_fetch_nand(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 v, __tsan_memory_order mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_fetch_nand(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 v, __tsan_memory_order mo);
+#endif
+
+int __tsan_atomic8_compare_exchange_weak(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 *c, __tsan_atomic8 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic16_compare_exchange_weak(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 *c, __tsan_atomic16 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic32_compare_exchange_weak(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 *c, __tsan_atomic32 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic64_compare_exchange_weak(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 *c, __tsan_atomic64 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+int __tsan_atomic128_compare_exchange_weak(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 *c, __tsan_atomic128 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#endif
+
+int __tsan_atomic8_compare_exchange_strong(volatile __tsan_atomic8 *a,
+    __tsan_atomic8 *c, __tsan_atomic8 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic16_compare_exchange_strong(volatile __tsan_atomic16 *a,
+    __tsan_atomic16 *c, __tsan_atomic16 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic32_compare_exchange_strong(volatile __tsan_atomic32 *a,
+    __tsan_atomic32 *c, __tsan_atomic32 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+int __tsan_atomic64_compare_exchange_strong(volatile __tsan_atomic64 *a,
+    __tsan_atomic64 *c, __tsan_atomic64 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+int __tsan_atomic128_compare_exchange_strong(volatile __tsan_atomic128 *a,
+    __tsan_atomic128 *c, __tsan_atomic128 v, __tsan_memory_order mo,
+    __tsan_memory_order fail_mo);
+#endif
+
+__tsan_atomic8 __tsan_atomic8_compare_exchange_val(
+    volatile __tsan_atomic8 *a, __tsan_atomic8 c, __tsan_atomic8 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic16 __tsan_atomic16_compare_exchange_val(
+    volatile __tsan_atomic16 *a, __tsan_atomic16 c, __tsan_atomic16 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic32 __tsan_atomic32_compare_exchange_val(
+    volatile __tsan_atomic32 *a, __tsan_atomic32 c, __tsan_atomic32 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+__tsan_atomic64 __tsan_atomic64_compare_exchange_val(
+    volatile __tsan_atomic64 *a, __tsan_atomic64 c, __tsan_atomic64 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+#if __TSAN_HAS_INT128
+__tsan_atomic128 __tsan_atomic128_compare_exchange_val(
+    volatile __tsan_atomic128 *a, __tsan_atomic128 c, __tsan_atomic128 v,
+    __tsan_memory_order mo, __tsan_memory_order fail_mo);
+#endif
+
+void __tsan_atomic_thread_fence(__tsan_memory_order mo);
+void __tsan_atomic_signal_fence(__tsan_memory_order mo);
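A minimal sketch of a TSan-visible atomic counter using the calls above. These symbols are provided by the TSan runtime, so this only links when built with -fsanitize=thread; normally the compiler emits such calls for C/C++ atomics itself.

    #include <sanitizer/tsan_interface_atomic.h>

    static volatile __tsan_atomic32 counter;

    /* Atomically increment and return the new value. */
    int bump(void) {
      return __tsan_atomic32_fetch_add(&counter, 1,
                                       __tsan_memory_order_seq_cst) + 1;
    }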
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // TSAN_INTERFACE_ATOMIC_H

+ 75 - 0
demo/include/shaintrin.h

@@ -0,0 +1,75 @@
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+  return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
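A minimal sketch of calling one of these intrinsics; the translation unit must be compiled with SHA support enabled (e.g. -msha on x86-64), and this is a single schedule step, not a complete SHA-1 implementation.

    #include <immintrin.h>

    /* One SHA-1 message-schedule step: combine message words w0..w3 (in w0)
       with w4..w7 (in w1). */
    __m128i sha1_schedule_step(__m128i w0, __m128i w1) {
      return _mm_sha1msg1_epu32(w0, w1);
    }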

File diff is too large to display
+ 2465 - 0
demo/include/smmintrin.h


+ 35 - 0
demo/include/stdalign.h

@@ -0,0 +1,35 @@
+/*===---- stdalign.h - Standard header for alignment ------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDALIGN_H
+#define __STDALIGN_H
+
+#ifndef __cplusplus
+#define alignas _Alignas
+#define alignof _Alignof
+#endif
+
+#define __alignas_is_defined 1
+#define __alignof_is_defined 1
+
+#endif /* __STDALIGN_H */
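A minimal sketch of the C11 spellings this header provides:

    #include <stdalign.h>
    #include <stdio.h>

    int main(void) {
      alignas(32) char buf[64];  /* 32-byte aligned local buffer */
      printf("alignof(double) = %zu, buf at %p\n",
             alignof(double), (void *)buf);
      return 0;
    }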

+ 51 - 0
demo/include/stdarg.h

@@ -0,0 +1,51 @@
+/*===---- stdarg.h - Variable argument handling ----------------------------===
+ *
+ * Copyright (c) 2008 Eli Friedman
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDARG_H
+#define __STDARG_H
+
+#ifndef _VA_LIST
+typedef __builtin_va_list va_list;
+#define _VA_LIST
+#endif
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#define va_end(ap)          __builtin_va_end(ap)
+#define va_arg(ap, type)    __builtin_va_arg(ap, type)
+
+/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
+ * or -ansi is not specified, since it was not part of C90.
+ */
+#define __va_copy(d,s) __builtin_va_copy(d,s)
+
+#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#endif
+
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST 1
+typedef __builtin_va_list __gnuc_va_list;
+#endif
+
+#endif /* __STDARG_H */
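A minimal sketch of the classic variadic-function pattern these macros support:

    #include <stdarg.h>
    #include <stdio.h>

    /* Sum `count` ints passed as variadic arguments. */
    static int sum_ints(int count, ...) {
      va_list ap;
      int total = 0;
      va_start(ap, count);
      for (int i = 0; i < count; ++i)
        total += va_arg(ap, int);
      va_end(ap);
      return total;
    }

    int main(void) {
      printf("%d\n", sum_ints(3, 1, 2, 3));  /* prints 6 */
      return 0;
    }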

+ 0 - 0
demo/include/stdatomic.h


Some files are not shown because too many files changed in this diff