/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512VLDQINTRIN_H
#define __AVX512VLDQINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq")))
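
/* Multiply packed 64-bit integers and keep the low 64 bits of each product
   (VPMULLQ), with merge-masked (_mask_) and zero-masked (_maskz_) variants. */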
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
  return (__m256i) ((__v4du) __A * (__v4du) __B);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
        (__v4di)_mm256_mullo_epi64(__A, __B),
        (__v4di)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
        (__v4di)_mm256_mullo_epi64(__A, __B),
        (__v4di)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi64 (__m128i __A, __m128i __B) {
  return (__m128i) ((__v2du) __A * (__v2du) __B);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
        (__v2di)_mm_mullo_epi64(__A, __B),
        (__v2di)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
        (__v2di)_mm_mullo_epi64(__A, __B),
        (__v2di)_mm_setzero_si128());
}
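
/* Merge- and zero-masked forms of the packed floating-point bitwise logic
   operations (ANDNOT, AND, XOR, OR) on 128-bit and 256-bit vectors. */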
static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_andnot_pd(__A, __B),
        (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_andnot_pd(__A, __B),
        (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_andnot_pd(__A, __B),
        (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_andnot_pd(__A, __B),
        (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_andnot_ps(__A, __B),
        (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_andnot_ps(__A, __B),
        (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_andnot_ps(__A, __B),
        (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_andnot_ps(__A, __B),
        (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_and_pd(__A, __B),
        (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_and_pd(__A, __B),
        (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_and_pd(__A, __B),
        (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_and_pd(__A, __B),
        (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_and_ps(__A, __B),
        (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_and_ps(__A, __B),
        (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_and_ps(__A, __B),
        (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_and_ps(__A, __B),
        (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_xor_pd(__A, __B),
        (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_xor_pd(__A, __B),
        (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_xor_pd(__A, __B),
        (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_xor_pd(__A, __B),
        (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_xor_ps(__A, __B),
        (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_xor_ps(__A, __B),
        (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_xor_ps(__A, __B),
        (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_xor_ps(__A, __B),
        (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_or_pd(__A, __B),
        (__v4df)__W);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
        (__v4df)_mm256_or_pd(__A, __B),
        (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_or_pd(__A, __B),
        (__v2df)__W);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
        (__v2df)_mm_or_pd(__A, __B),
        (__v2df)_mm_setzero_pd());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_or_ps(__A, __B),
        (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
        (__v8sf)_mm256_or_ps(__A, __B),
        (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_or_ps(__A, __B),
        (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
        (__v4sf)_mm_or_ps(__A, __B),
        (__v4sf)_mm_setzero_ps());
}
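
/* Convert packed double-precision values to packed signed 64-bit integers
   (VCVTPD2QQ), rounding according to the current rounding mode. */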
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvtpd_epi64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
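
/* Convert packed double-precision values to packed unsigned 64-bit integers
   (VCVTPD2UQQ). */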
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epu64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvtpd_epu64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
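
/* Convert packed single-precision values to packed signed 64-bit integers
   (VCVTPS2QQ); the 256-bit forms take the four floats of a __m128 source. */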
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvtps_epi64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
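
/* Convert packed single-precision values to packed unsigned 64-bit integers
   (VCVTPS2UQQ). */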
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epu64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvtps_epu64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
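
/* Convert packed signed 64-bit integers to packed double-precision values
   (VCVTQQ2PD). */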
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi64_pd (__m128i __A) {
  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
        (__v2df) _mm_setzero_pd(),
        (__mmask8) -1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
        (__v2df) __W,
        (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
        (__v2df) _mm_setzero_pd(),
        (__mmask8) __U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepi64_pd (__m256i __A) {
  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
        (__v4df) _mm256_setzero_pd(),
        (__mmask8) -1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
        (__v4df) __W,
        (__mmask8) __U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
        (__v4df) _mm256_setzero_pd(),
        (__mmask8) __U);
}
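
/* Convert packed signed 64-bit integers to packed single-precision values
   (VCVTQQ2PS); the result of the 256-bit forms is a four-element __m128. */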
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi64_ps (__m128i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
        (__v4sf) __W,
        (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm256_cvtepi64_ps (__m256i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
        (__v4sf) __W,
        (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) __U);
}
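
/* Convert packed double-precision values to packed signed 64-bit integers
   with truncation (VCVTTPD2QQ). */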
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvttpd_epi64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
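
/* Convert packed double-precision values to packed unsigned 64-bit integers
   with truncation (VCVTTPD2UQQ). */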
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epu64 (__m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvttpd_epu64 (__m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
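
/* Convert packed single-precision values to packed signed 64-bit integers
   with truncation (VCVTTPS2QQ). */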
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvttps_epi64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
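
/* Convert packed single-precision values to packed unsigned 64-bit integers
   with truncation (VCVTTPS2UQQ). */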
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epu64 (__m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) -1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
        (__v2di) __W,
        (__mmask8) __U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
        (__v2di) _mm_setzero_si128(),
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_cvttps_epu64 (__m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) -1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
        (__v4di) __W,
        (__mmask8) __U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
        (__v4di) _mm256_setzero_si256(),
        (__mmask8) __U);
}
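
/* Convert packed unsigned 64-bit integers to packed double-precision values
   (VCVTUQQ2PD). */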
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepu64_pd (__m128i __A) {
  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
        (__v2df) _mm_setzero_pd(),
        (__mmask8) -1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
        (__v2df) __W,
        (__mmask8) __U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
        (__v2df) _mm_setzero_pd(),
        (__mmask8) __U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_cvtepu64_pd (__m256i __A) {
  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
        (__v4df) _mm256_setzero_pd(),
        (__mmask8) -1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
        (__v4df) __W,
        (__mmask8) __U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
        (__v4df) _mm256_setzero_pd(),
        (__mmask8) __U);
}
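
/* Convert packed unsigned 64-bit integers to packed single-precision values
   (VCVTUQQ2PS). */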
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepu64_ps (__m128i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
        (__v4sf) __W,
        (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm256_cvtepu64_ps (__m256i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) -1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
        (__v4sf) __W,
        (__mmask8) __U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
        (__v4sf) _mm_setzero_ps(),
        (__mmask8) __U);
}
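
/* VRANGEPD/VRANGEPS: for each element pair, select the minimum, maximum,
   absolute minimum, or absolute maximum, with sign control, as chosen by the
   immediate operand C. */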
#define _mm_range_pd(A, B, C) __extension__ ({ \
  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
        (__v2df)(__m128d)(B), (int)(C), \
        (__v2df)_mm_setzero_pd(), \
        (__mmask8)-1); })

#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({ \
  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
        (__v2df)(__m128d)(B), (int)(C), \
        (__v2df)(__m128d)(W), \
        (__mmask8)(U)); })

#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({ \
  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
        (__v2df)(__m128d)(B), (int)(C), \
        (__v2df)_mm_setzero_pd(), \
        (__mmask8)(U)); })

#define _mm256_range_pd(A, B, C) __extension__ ({ \
  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
        (__v4df)(__m256d)(B), (int)(C), \
        (__v4df)_mm256_setzero_pd(), \
        (__mmask8)-1); })

#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({ \
  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
        (__v4df)(__m256d)(B), (int)(C), \
        (__v4df)(__m256d)(W), \
        (__mmask8)(U)); })

#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({ \
  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
        (__v4df)(__m256d)(B), (int)(C), \
        (__v4df)_mm256_setzero_pd(), \
        (__mmask8)(U)); })

#define _mm_range_ps(A, B, C) __extension__ ({ \
  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
        (__v4sf)(__m128)(B), (int)(C), \
        (__v4sf)_mm_setzero_ps(), \
        (__mmask8)-1); })

#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({ \
  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
        (__v4sf)(__m128)(B), (int)(C), \
        (__v4sf)(__m128)(W), (__mmask8)(U)); })

#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({ \
  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
        (__v4sf)(__m128)(B), (int)(C), \
        (__v4sf)_mm_setzero_ps(), \
        (__mmask8)(U)); })

#define _mm256_range_ps(A, B, C) __extension__ ({ \
  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
        (__v8sf)(__m256)(B), (int)(C), \
        (__v8sf)_mm256_setzero_ps(), \
        (__mmask8)-1); })

#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({ \
  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
        (__v8sf)(__m256)(B), (int)(C), \
        (__v8sf)(__m256)(W), (__mmask8)(U)); })

#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({ \
  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
        (__v8sf)(__m256)(B), (int)(C), \
        (__v8sf)_mm256_setzero_ps(), \
        (__mmask8)(U)); })
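
/* VREDUCEPD/VREDUCEPS: compute the reduced argument of each element, i.e. the
   difference between the element and its value rounded to the number of
   fraction bits given in the immediate operand B. */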
#define _mm_reduce_pd(A, B) __extension__ ({ \
  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
        (__v2df)_mm_setzero_pd(), \
        (__mmask8)-1); })

#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \
  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
        (__v2df)(__m128d)(W), \
        (__mmask8)(U)); })

#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({ \
  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
        (__v2df)_mm_setzero_pd(), \
        (__mmask8)(U)); })

#define _mm256_reduce_pd(A, B) __extension__ ({ \
  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
        (__v4df)_mm256_setzero_pd(), \
        (__mmask8)-1); })

#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \
  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
        (__v4df)(__m256d)(W), \
        (__mmask8)(U)); })

#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({ \
  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
        (__v4df)_mm256_setzero_pd(), \
        (__mmask8)(U)); })

#define _mm_reduce_ps(A, B) __extension__ ({ \
  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
        (__v4sf)_mm_setzero_ps(), \
        (__mmask8)-1); })

#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({ \
  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
        (__v4sf)(__m128)(W), \
        (__mmask8)(U)); })

#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({ \
  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
        (__v4sf)_mm_setzero_ps(), \
        (__mmask8)(U)); })

#define _mm256_reduce_ps(A, B) __extension__ ({ \
  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
        (__v8sf)_mm256_setzero_ps(), \
        (__mmask8)-1); })

#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \
  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
        (__v8sf)(__m256)(W), \
        (__mmask8)(U)); })

#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({ \
  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
        (__v8sf)_mm256_setzero_ps(), \
        (__mmask8)(U)); })
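
/* Convert between vectors and mask registers: VPMOVD2M/VPMOVQ2M copy the sign
   bit of each element into a mask bit, and VPMOVM2D/VPMOVM2Q broadcast each
   mask bit into an all-ones or all-zeros element. */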
static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm_movepi32_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm256_movepi32_mask (__m256i __A)
{
  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movm_epi32 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_movm_epi32 (__mmask8 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movm_epi64 (__mmask8 __A)
{
  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_movm_epi64 (__mmask8 __A)
{
  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm_movepi64_mask (__m128i __A)
{
  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
}

static __inline__ __mmask8 __DEFAULT_FN_ATTRS
_mm256_movepi64_mask (__m256i __A)
{
  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
}
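
/* Broadcast the low two 32-bit elements, or the low 128-bit lane, of the
   source across the destination vector (implemented with vector shuffles). */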
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_f32x2 (__m128 __A)
{
  return (__m256)__builtin_shufflevector((__v4sf)__A,
        (__v4sf)_mm_undefined_ps(),
        0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
{
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
        (__v8sf)_mm256_broadcast_f32x2(__A),
        (__v8sf)__O);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
{
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
        (__v8sf)_mm256_broadcast_f32x2(__A),
        (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_broadcast_f64x2(__m128d __A)
{
  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
        0, 1, 0, 1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
{
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
        (__v4df)_mm256_broadcast_f64x2(__A),
        (__v4df)__O);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
{
  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
        (__v4df)_mm256_broadcast_f64x2(__A),
        (__v4df)_mm256_setzero_pd());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcast_i32x2 (__m128i __A)
{
  return (__m128i)__builtin_shufflevector((__v4si)__A,
        (__v4si)_mm_undefined_si128(),
        0, 1, 0, 1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
        (__v4si)_mm_broadcast_i32x2(__A),
        (__v4si)__O);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
        (__v4si)_mm_broadcast_i32x2(__A),
        (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i32x2 (__m128i __A)
{
  return (__m256i)__builtin_shufflevector((__v4si)__A,
        (__v4si)_mm_undefined_si128(),
        0, 1, 0, 1, 0, 1, 0, 1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
        (__v8si)_mm256_broadcast_i32x2(__A),
        (__v8si)__O);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
        (__v8si)_mm256_broadcast_i32x2(__A),
        (__v8si)_mm256_setzero_si256());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i64x2(__m128i __A)
{
  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
        0, 1, 0, 1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
        (__v4di)_mm256_broadcast_i64x2(__A),
        (__v4di)__O);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
{
  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
        (__v4di)_mm256_broadcast_i64x2(__A),
        (__v4di)_mm256_setzero_si256());
}
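
/* Extract or insert a selected 128-bit lane of a 256-bit vector. These are
   macros built on shuffles, so the lane selector must be a compile-time
   constant. */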
#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \
  (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \
        (__v4df)_mm256_undefined_pd(), \
        ((imm) & 1) ? 2 : 0, \
        ((imm) & 1) ? 3 : 1); })

#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \
  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
        (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
        (__v2df)(W)); })

#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \
  (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
        (__v2df)_mm256_extractf64x2_pd((A), (imm)), \
        (__v2df)_mm_setzero_pd()); })

#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \
        (__v4di)_mm256_undefined_si256(), \
        ((imm) & 1) ? 2 : 0, \
        ((imm) & 1) ? 3 : 1); })

#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \
  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
        (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
        (__v2di)(W)); })

#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \
  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
        (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \
        (__v2di)_mm_setzero_di()); })

#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \
  (__m256d)__builtin_shufflevector((__v4df)(A), \
        (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \
        ((imm) & 0x1) ? 0 : 4, \
        ((imm) & 0x1) ? 1 : 5, \
        ((imm) & 0x1) ? 4 : 2, \
        ((imm) & 0x1) ? 5 : 3); })

#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \
  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
        (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
        (__v4df)(W)); })

#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \
  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
        (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
        (__v4df)_mm256_setzero_pd()); })

#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \
  (__m256i)__builtin_shufflevector((__v4di)(A), \
        (__v4di)_mm256_castsi128_si256((__m128i)(B)), \
        ((imm) & 0x1) ? 0 : 4, \
        ((imm) & 0x1) ? 1 : 5, \
        ((imm) & 0x1) ? 4 : 2, \
        ((imm) & 0x1) ? 5 : 3); })

#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \
  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
        (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
        (__v4di)(W)); })

#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \
  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
        (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
        (__v4di)_mm256_setzero_si256()); })
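
/* VFPCLASSPD/VFPCLASSPS: test each element for the floating-point categories
   selected by the immediate operand (e.g. NaN, infinity, denormal, zero,
   negative) and return the result as a mask. */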
#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
        (__mmask8)(U)); })

#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
        (__mmask8)-1); })

#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
        (__mmask8)(U)); })

#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
        (__mmask8)-1); })

#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
        (__mmask8)(U)); })

#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
        (__mmask8)-1); })

#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
        (__mmask8)(U)); })

#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \
  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
        (__mmask8)-1); })

#undef __DEFAULT_FN_ATTRS

#endif