emmintrin.h 183 KB


  1. /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
  2. *
  3. * Permission is hereby granted, free of charge, to any person obtaining a copy
  4. * of this software and associated documentation files (the "Software"), to deal
  5. * in the Software without restriction, including without limitation the rights
  6. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. * copies of the Software, and to permit persons to whom the Software is
  8. * furnished to do so, subject to the following conditions:
  9. *
  10. * The above copyright notice and this permission notice shall be included in
  11. * all copies or substantial portions of the Software.
  12. *
  13. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. * THE SOFTWARE.
  20. *
  21. *===-----------------------------------------------------------------------===
  22. */
  23. #ifndef __EMMINTRIN_H
  24. #define __EMMINTRIN_H
  25. #include <xmmintrin.h>
  /* Public SSE2 vector types; each one is 16 bytes wide. */
  typedef double __m128d __attribute__((__vector_size__(16)));
  typedef long long __m128i __attribute__((__vector_size__(16)));

  /* Element-typed views used inside this header (signed / floating point). */
  typedef double __v2df __attribute__((__vector_size__(16)));
  typedef long long __v2di __attribute__((__vector_size__(16)));
  typedef short __v8hi __attribute__((__vector_size__(16)));
  typedef char __v16qi __attribute__((__vector_size__(16)));

  /* Unsigned element-typed views. */
  typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
  typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
  typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

  /* Plain char has implementation-defined signedness, so an explicitly signed
   * byte vector is provided as well. It is an internal helper and should not
   * show up in the public interface. */
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
  40. #include <f16cintrin.h>
  /* Attribute set applied to every intrinsic defined in this file. */
  #define __DEFAULT_FN_ATTRS \
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
  43. /// \brief Adds lower double-precision values in both operands and returns the
  44. /// sum in the lower 64 bits of the result. The upper 64 bits of the result
  45. /// are copied from the upper double-precision value of the first operand.
  46. ///
  47. /// \headerfile <x86intrin.h>
  48. ///
  49. /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
  50. ///
  51. /// \param __a
  52. /// A 128-bit vector of [2 x double] containing one of the source operands.
  53. /// \param __b
  54. /// A 128-bit vector of [2 x double] containing one of the source operands.
  55. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  56. /// sum of the lower 64 bits of both operands. The upper 64 bits are copied
  57. /// from the upper 64 bits of the first source operand.
  58. static __inline__ __m128d __DEFAULT_FN_ATTRS
  59. _mm_add_sd(__m128d __a, __m128d __b)
  60. {
  61. __a[0] += __b[0];
  62. return __a;
  63. }
  64. /// \brief Adds two 128-bit vectors of [2 x double].
  65. ///
  66. /// \headerfile <x86intrin.h>
  67. ///
  68. /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
  69. ///
  70. /// \param __a
  71. /// A 128-bit vector of [2 x double] containing one of the source operands.
  72. /// \param __b
  73. /// A 128-bit vector of [2 x double] containing one of the source operands.
  74. /// \returns A 128-bit vector of [2 x double] containing the sums of both
  75. /// operands.
  76. static __inline__ __m128d __DEFAULT_FN_ATTRS
  77. _mm_add_pd(__m128d __a, __m128d __b)
  78. {
  79. return (__m128d)((__v2df)__a + (__v2df)__b);
  80. }
  81. /// \brief Subtracts the lower double-precision value of the second operand
  82. /// from the lower double-precision value of the first operand and returns
  83. /// the difference in the lower 64 bits of the result. The upper 64 bits of
  84. /// the result are copied from the upper double-precision value of the first
  85. /// operand.
  86. ///
  87. /// \headerfile <x86intrin.h>
  88. ///
  89. /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
  90. ///
  91. /// \param __a
  92. /// A 128-bit vector of [2 x double] containing the minuend.
  93. /// \param __b
  94. /// A 128-bit vector of [2 x double] containing the subtrahend.
  95. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  96. /// difference of the lower 64 bits of both operands. The upper 64 bits are
  97. /// copied from the upper 64 bits of the first source operand.
  98. static __inline__ __m128d __DEFAULT_FN_ATTRS
  99. _mm_sub_sd(__m128d __a, __m128d __b)
  100. {
  101. __a[0] -= __b[0];
  102. return __a;
  103. }
  104. /// \brief Subtracts two 128-bit vectors of [2 x double].
  105. ///
  106. /// \headerfile <x86intrin.h>
  107. ///
  108. /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
  109. ///
  110. /// \param __a
  111. /// A 128-bit vector of [2 x double] containing the minuend.
  112. /// \param __b
  113. /// A 128-bit vector of [2 x double] containing the subtrahend.
  114. /// \returns A 128-bit vector of [2 x double] containing the differences between
  115. /// both operands.
  116. static __inline__ __m128d __DEFAULT_FN_ATTRS
  117. _mm_sub_pd(__m128d __a, __m128d __b)
  118. {
  119. return (__m128d)((__v2df)__a - (__v2df)__b);
  120. }
  121. /// \brief Multiplies lower double-precision values in both operands and returns
  122. /// the product in the lower 64 bits of the result. The upper 64 bits of the
  123. /// result are copied from the upper double-precision value of the first
  124. /// operand.
  125. ///
  126. /// \headerfile <x86intrin.h>
  127. ///
  128. /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
  129. ///
  130. /// \param __a
  131. /// A 128-bit vector of [2 x double] containing one of the source operands.
  132. /// \param __b
  133. /// A 128-bit vector of [2 x double] containing one of the source operands.
  134. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  135. /// product of the lower 64 bits of both operands. The upper 64 bits are
  136. /// copied from the upper 64 bits of the first source operand.
  137. static __inline__ __m128d __DEFAULT_FN_ATTRS
  138. _mm_mul_sd(__m128d __a, __m128d __b)
  139. {
  140. __a[0] *= __b[0];
  141. return __a;
  142. }
  143. /// \brief Multiplies two 128-bit vectors of [2 x double].
  144. ///
  145. /// \headerfile <x86intrin.h>
  146. ///
  147. /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
  148. ///
  149. /// \param __a
  150. /// A 128-bit vector of [2 x double] containing one of the operands.
  151. /// \param __b
  152. /// A 128-bit vector of [2 x double] containing one of the operands.
  153. /// \returns A 128-bit vector of [2 x double] containing the products of both
  154. /// operands.
  155. static __inline__ __m128d __DEFAULT_FN_ATTRS
  156. _mm_mul_pd(__m128d __a, __m128d __b)
  157. {
  158. return (__m128d)((__v2df)__a * (__v2df)__b);
  159. }
  160. /// \brief Divides the lower double-precision value of the first operand by the
  161. /// lower double-precision value of the second operand and returns the
  162. /// quotient in the lower 64 bits of the result. The upper 64 bits of the
  163. /// result are copied from the upper double-precision value of the first
  164. /// operand.
  165. ///
  166. /// \headerfile <x86intrin.h>
  167. ///
  168. /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
  169. ///
  170. /// \param __a
  171. /// A 128-bit vector of [2 x double] containing the dividend.
  172. /// \param __b
  173. /// A 128-bit vector of [2 x double] containing divisor.
  174. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  175. /// quotient of the lower 64 bits of both operands. The upper 64 bits are
  176. /// copied from the upper 64 bits of the first source operand.
  177. static __inline__ __m128d __DEFAULT_FN_ATTRS
  178. _mm_div_sd(__m128d __a, __m128d __b)
  179. {
  180. __a[0] /= __b[0];
  181. return __a;
  182. }
  183. /// \brief Performs an element-by-element division of two 128-bit vectors of
  184. /// [2 x double].
  185. ///
  186. /// \headerfile <x86intrin.h>
  187. ///
  188. /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
  189. ///
  190. /// \param __a
  191. /// A 128-bit vector of [2 x double] containing the dividend.
  192. /// \param __b
  193. /// A 128-bit vector of [2 x double] containing the divisor.
  194. /// \returns A 128-bit vector of [2 x double] containing the quotients of both
  195. /// operands.
  196. static __inline__ __m128d __DEFAULT_FN_ATTRS
  197. _mm_div_pd(__m128d __a, __m128d __b)
  198. {
  199. return (__m128d)((__v2df)__a / (__v2df)__b);
  200. }
  201. /// \brief Calculates the square root of the lower double-precision value of
  202. /// the second operand and returns it in the lower 64 bits of the result.
  203. /// The upper 64 bits of the result are copied from the upper
  204. /// double-precision value of the first operand.
  205. ///
  206. /// \headerfile <x86intrin.h>
  207. ///
  208. /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
  209. ///
  210. /// \param __a
  211. /// A 128-bit vector of [2 x double] containing one of the operands. The
  212. /// upper 64 bits of this operand are copied to the upper 64 bits of the
  213. /// result.
  214. /// \param __b
  215. /// A 128-bit vector of [2 x double] containing one of the operands. The
  216. /// square root is calculated using the lower 64 bits of this operand.
  217. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  218. /// square root of the lower 64 bits of operand \a __b, and whose upper 64
  219. /// bits are copied from the upper 64 bits of operand \a __a.
  220. static __inline__ __m128d __DEFAULT_FN_ATTRS
  221. _mm_sqrt_sd(__m128d __a, __m128d __b)
  222. {
  223. __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
  224. return (__m128d) { __c[0], __a[1] };
  225. }
  226. /// \brief Calculates the square root of the each of two values stored in a
  227. /// 128-bit vector of [2 x double].
  228. ///
  229. /// \headerfile <x86intrin.h>
  230. ///
  231. /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
  232. ///
  233. /// \param __a
  234. /// A 128-bit vector of [2 x double].
  235. /// \returns A 128-bit vector of [2 x double] containing the square roots of the
  236. /// values in the operand.
  237. static __inline__ __m128d __DEFAULT_FN_ATTRS
  238. _mm_sqrt_pd(__m128d __a)
  239. {
  240. return __builtin_ia32_sqrtpd((__v2df)__a);
  241. }
  242. /// \brief Compares lower 64-bit double-precision values of both operands, and
  243. /// returns the lesser of the pair of values in the lower 64-bits of the
  244. /// result. The upper 64 bits of the result are copied from the upper
  245. /// double-precision value of the first operand.
  246. ///
  247. /// \headerfile <x86intrin.h>
  248. ///
  249. /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
  250. ///
  251. /// \param __a
  252. /// A 128-bit vector of [2 x double] containing one of the operands. The
  253. /// lower 64 bits of this operand are used in the comparison.
  254. /// \param __b
  255. /// A 128-bit vector of [2 x double] containing one of the operands. The
  256. /// lower 64 bits of this operand are used in the comparison.
  257. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  258. /// minimum value between both operands. The upper 64 bits are copied from
  259. /// the upper 64 bits of the first source operand.
  260. static __inline__ __m128d __DEFAULT_FN_ATTRS
  261. _mm_min_sd(__m128d __a, __m128d __b)
  262. {
  263. return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
  264. }
  265. /// \brief Performs element-by-element comparison of the two 128-bit vectors of
  266. /// [2 x double] and returns the vector containing the lesser of each pair of
  267. /// values.
  268. ///
  269. /// \headerfile <x86intrin.h>
  270. ///
  271. /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
  272. ///
  273. /// \param __a
  274. /// A 128-bit vector of [2 x double] containing one of the operands.
  275. /// \param __b
  276. /// A 128-bit vector of [2 x double] containing one of the operands.
  277. /// \returns A 128-bit vector of [2 x double] containing the minimum values
  278. /// between both operands.
  279. static __inline__ __m128d __DEFAULT_FN_ATTRS
  280. _mm_min_pd(__m128d __a, __m128d __b)
  281. {
  282. return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
  283. }
  284. /// \brief Compares lower 64-bit double-precision values of both operands, and
  285. /// returns the greater of the pair of values in the lower 64-bits of the
  286. /// result. The upper 64 bits of the result are copied from the upper
  287. /// double-precision value of the first operand.
  288. ///
  289. /// \headerfile <x86intrin.h>
  290. ///
  291. /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
  292. ///
  293. /// \param __a
  294. /// A 128-bit vector of [2 x double] containing one of the operands. The
  295. /// lower 64 bits of this operand are used in the comparison.
  296. /// \param __b
  297. /// A 128-bit vector of [2 x double] containing one of the operands. The
  298. /// lower 64 bits of this operand are used in the comparison.
  299. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  300. /// maximum value between both operands. The upper 64 bits are copied from
  301. /// the upper 64 bits of the first source operand.
  302. static __inline__ __m128d __DEFAULT_FN_ATTRS
  303. _mm_max_sd(__m128d __a, __m128d __b)
  304. {
  305. return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
  306. }
  307. /// \brief Performs element-by-element comparison of the two 128-bit vectors of
  308. /// [2 x double] and returns the vector containing the greater of each pair
  309. /// of values.
  310. ///
  311. /// \headerfile <x86intrin.h>
  312. ///
  313. /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
  314. ///
  315. /// \param __a
  316. /// A 128-bit vector of [2 x double] containing one of the operands.
  317. /// \param __b
  318. /// A 128-bit vector of [2 x double] containing one of the operands.
  319. /// \returns A 128-bit vector of [2 x double] containing the maximum values
  320. /// between both operands.
  321. static __inline__ __m128d __DEFAULT_FN_ATTRS
  322. _mm_max_pd(__m128d __a, __m128d __b)
  323. {
  324. return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
  325. }
  326. /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double].
  327. ///
  328. /// \headerfile <x86intrin.h>
  329. ///
  330. /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
  331. ///
  332. /// \param __a
  333. /// A 128-bit vector of [2 x double] containing one of the source operands.
  334. /// \param __b
  335. /// A 128-bit vector of [2 x double] containing one of the source operands.
  336. /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
  337. /// values between both operands.
  338. static __inline__ __m128d __DEFAULT_FN_ATTRS
  339. _mm_and_pd(__m128d __a, __m128d __b)
  340. {
  341. return (__m128d)((__v2du)__a & (__v2du)__b);
  342. }
  343. /// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using
  344. /// the one's complement of the values contained in the first source operand.
  345. ///
  346. /// \headerfile <x86intrin.h>
  347. ///
  348. /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
  349. ///
  350. /// \param __a
  351. /// A 128-bit vector of [2 x double] containing the left source operand. The
  352. /// one's complement of this value is used in the bitwise AND.
  353. /// \param __b
  354. /// A 128-bit vector of [2 x double] containing the right source operand.
  355. /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
  356. /// values in the second operand and the one's complement of the first
  357. /// operand.
  358. static __inline__ __m128d __DEFAULT_FN_ATTRS
  359. _mm_andnot_pd(__m128d __a, __m128d __b)
  360. {
  361. return (__m128d)(~(__v2du)__a & (__v2du)__b);
  362. }
  363. /// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double].
  364. ///
  365. /// \headerfile <x86intrin.h>
  366. ///
  367. /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
  368. ///
  369. /// \param __a
  370. /// A 128-bit vector of [2 x double] containing one of the source operands.
  371. /// \param __b
  372. /// A 128-bit vector of [2 x double] containing one of the source operands.
  373. /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
  374. /// values between both operands.
  375. static __inline__ __m128d __DEFAULT_FN_ATTRS
  376. _mm_or_pd(__m128d __a, __m128d __b)
  377. {
  378. return (__m128d)((__v2du)__a | (__v2du)__b);
  379. }
  380. /// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double].
  381. ///
  382. /// \headerfile <x86intrin.h>
  383. ///
  384. /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
  385. ///
  386. /// \param __a
  387. /// A 128-bit vector of [2 x double] containing one of the source operands.
  388. /// \param __b
  389. /// A 128-bit vector of [2 x double] containing one of the source operands.
  390. /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
  391. /// values between both operands.
  392. static __inline__ __m128d __DEFAULT_FN_ATTRS
  393. _mm_xor_pd(__m128d __a, __m128d __b)
  394. {
  395. return (__m128d)((__v2du)__a ^ (__v2du)__b);
  396. }
  397. /// \brief Compares each of the corresponding double-precision values of the
  398. /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h
  399. /// for false, FFFFFFFFFFFFFFFFh for true.
  400. ///
  401. /// \headerfile <x86intrin.h>
  402. ///
  403. /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
  404. ///
  405. /// \param __a
  406. /// A 128-bit vector of [2 x double].
  407. /// \param __b
  408. /// A 128-bit vector of [2 x double].
  409. /// \returns A 128-bit vector containing the comparison results.
  410. static __inline__ __m128d __DEFAULT_FN_ATTRS
  411. _mm_cmpeq_pd(__m128d __a, __m128d __b)
  412. {
  413. return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
  414. }
  415. /// \brief Compares each of the corresponding double-precision values of the
  416. /// 128-bit vectors of [2 x double] to determine if the values in the first
  417. /// operand are less than those in the second operand. Each comparison
  418. /// yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  419. ///
  420. /// \headerfile <x86intrin.h>
  421. ///
  422. /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
  423. ///
  424. /// \param __a
  425. /// A 128-bit vector of [2 x double].
  426. /// \param __b
  427. /// A 128-bit vector of [2 x double].
  428. /// \returns A 128-bit vector containing the comparison results.
  429. static __inline__ __m128d __DEFAULT_FN_ATTRS
  430. _mm_cmplt_pd(__m128d __a, __m128d __b)
  431. {
  432. return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
  433. }
  434. /// \brief Compares each of the corresponding double-precision values of the
  435. /// 128-bit vectors of [2 x double] to determine if the values in the first
  436. /// operand are less than or equal to those in the second operand.
  437. ///
  438. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  439. ///
  440. /// \headerfile <x86intrin.h>
  441. ///
  442. /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
  443. ///
  444. /// \param __a
  445. /// A 128-bit vector of [2 x double].
  446. /// \param __b
  447. /// A 128-bit vector of [2 x double].
  448. /// \returns A 128-bit vector containing the comparison results.
  449. static __inline__ __m128d __DEFAULT_FN_ATTRS
  450. _mm_cmple_pd(__m128d __a, __m128d __b)
  451. {
  452. return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
  453. }
  454. /// \brief Compares each of the corresponding double-precision values of the
  455. /// 128-bit vectors of [2 x double] to determine if the values in the first
  456. /// operand are greater than those in the second operand.
  457. ///
  458. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  459. ///
  460. /// \headerfile <x86intrin.h>
  461. ///
  462. /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
  463. ///
  464. /// \param __a
  465. /// A 128-bit vector of [2 x double].
  466. /// \param __b
  467. /// A 128-bit vector of [2 x double].
  468. /// \returns A 128-bit vector containing the comparison results.
  469. static __inline__ __m128d __DEFAULT_FN_ATTRS
  470. _mm_cmpgt_pd(__m128d __a, __m128d __b)
  471. {
  472. return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
  473. }
  474. /// \brief Compares each of the corresponding double-precision values of the
  475. /// 128-bit vectors of [2 x double] to determine if the values in the first
  476. /// operand are greater than or equal to those in the second operand.
  477. ///
  478. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  479. ///
  480. /// \headerfile <x86intrin.h>
  481. ///
  482. /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
  483. ///
  484. /// \param __a
  485. /// A 128-bit vector of [2 x double].
  486. /// \param __b
  487. /// A 128-bit vector of [2 x double].
  488. /// \returns A 128-bit vector containing the comparison results.
  489. static __inline__ __m128d __DEFAULT_FN_ATTRS
  490. _mm_cmpge_pd(__m128d __a, __m128d __b)
  491. {
  492. return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
  493. }
  494. /// \brief Compares each of the corresponding double-precision values of the
  495. /// 128-bit vectors of [2 x double] to determine if the values in the first
  496. /// operand are ordered with respect to those in the second operand.
  497. ///
  498. /// A pair of double-precision values are "ordered" with respect to each
  499. /// other if neither value is a NaN. Each comparison yields 0h for false,
  500. /// FFFFFFFFFFFFFFFFh for true.
  501. ///
  502. /// \headerfile <x86intrin.h>
  503. ///
  504. /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
  505. ///
  506. /// \param __a
  507. /// A 128-bit vector of [2 x double].
  508. /// \param __b
  509. /// A 128-bit vector of [2 x double].
  510. /// \returns A 128-bit vector containing the comparison results.
  511. static __inline__ __m128d __DEFAULT_FN_ATTRS
  512. _mm_cmpord_pd(__m128d __a, __m128d __b)
  513. {
  514. return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
  515. }
  516. /// \brief Compares each of the corresponding double-precision values of the
  517. /// 128-bit vectors of [2 x double] to determine if the values in the first
  518. /// operand are unordered with respect to those in the second operand.
  519. ///
  520. /// A pair of double-precision values are "unordered" with respect to each
  521. /// other if one or both values are NaN. Each comparison yields 0h for false,
  522. /// FFFFFFFFFFFFFFFFh for true.
  523. ///
  524. /// \headerfile <x86intrin.h>
  525. ///
  526. /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
  527. /// instruction.
  528. ///
  529. /// \param __a
  530. /// A 128-bit vector of [2 x double].
  531. /// \param __b
  532. /// A 128-bit vector of [2 x double].
  533. /// \returns A 128-bit vector containing the comparison results.
  534. static __inline__ __m128d __DEFAULT_FN_ATTRS
  535. _mm_cmpunord_pd(__m128d __a, __m128d __b)
  536. {
  537. return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
  538. }
  539. /// \brief Compares each of the corresponding double-precision values of the
  540. /// 128-bit vectors of [2 x double] to determine if the values in the first
  541. /// operand are unequal to those in the second operand.
  542. ///
  543. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  544. ///
  545. /// \headerfile <x86intrin.h>
  546. ///
  547. /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
  548. ///
  549. /// \param __a
  550. /// A 128-bit vector of [2 x double].
  551. /// \param __b
  552. /// A 128-bit vector of [2 x double].
  553. /// \returns A 128-bit vector containing the comparison results.
  554. static __inline__ __m128d __DEFAULT_FN_ATTRS
  555. _mm_cmpneq_pd(__m128d __a, __m128d __b)
  556. {
  557. return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
  558. }
  559. /// \brief Compares each of the corresponding double-precision values of the
  560. /// 128-bit vectors of [2 x double] to determine if the values in the first
  561. /// operand are not less than those in the second operand.
  562. ///
  563. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  564. ///
  565. /// \headerfile <x86intrin.h>
  566. ///
  567. /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
  568. ///
  569. /// \param __a
  570. /// A 128-bit vector of [2 x double].
  571. /// \param __b
  572. /// A 128-bit vector of [2 x double].
  573. /// \returns A 128-bit vector containing the comparison results.
  574. static __inline__ __m128d __DEFAULT_FN_ATTRS
  575. _mm_cmpnlt_pd(__m128d __a, __m128d __b)
  576. {
  577. return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
  578. }
  579. /// \brief Compares each of the corresponding double-precision values of the
  580. /// 128-bit vectors of [2 x double] to determine if the values in the first
  581. /// operand are not less than or equal to those in the second operand.
  582. ///
  583. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  584. ///
  585. /// \headerfile <x86intrin.h>
  586. ///
  587. /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
  588. ///
  589. /// \param __a
  590. /// A 128-bit vector of [2 x double].
  591. /// \param __b
  592. /// A 128-bit vector of [2 x double].
  593. /// \returns A 128-bit vector containing the comparison results.
  594. static __inline__ __m128d __DEFAULT_FN_ATTRS
  595. _mm_cmpnle_pd(__m128d __a, __m128d __b)
  596. {
  597. return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
  598. }
  599. /// \brief Compares each of the corresponding double-precision values of the
  600. /// 128-bit vectors of [2 x double] to determine if the values in the first
  601. /// operand are not greater than those in the second operand.
  602. ///
  603. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  604. ///
  605. /// \headerfile <x86intrin.h>
  606. ///
  607. /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
  608. ///
  609. /// \param __a
  610. /// A 128-bit vector of [2 x double].
  611. /// \param __b
  612. /// A 128-bit vector of [2 x double].
  613. /// \returns A 128-bit vector containing the comparison results.
  614. static __inline__ __m128d __DEFAULT_FN_ATTRS
  615. _mm_cmpngt_pd(__m128d __a, __m128d __b)
  616. {
  617. return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
  618. }
  619. /// \brief Compares each of the corresponding double-precision values of the
  620. /// 128-bit vectors of [2 x double] to determine if the values in the first
  621. /// operand are not greater than or equal to those in the second operand.
  622. ///
  623. /// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  624. ///
  625. /// \headerfile <x86intrin.h>
  626. ///
  627. /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
  628. ///
  629. /// \param __a
  630. /// A 128-bit vector of [2 x double].
  631. /// \param __b
  632. /// A 128-bit vector of [2 x double].
  633. /// \returns A 128-bit vector containing the comparison results.
  634. static __inline__ __m128d __DEFAULT_FN_ATTRS
  635. _mm_cmpnge_pd(__m128d __a, __m128d __b)
  636. {
  637. return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
  638. }
  639. /// \brief Compares the lower double-precision floating-point values in each of
  640. /// the two 128-bit floating-point vectors of [2 x double] for equality.
  641. ///
  642. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  643. ///
  644. /// \headerfile <x86intrin.h>
  645. ///
  646. /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
  647. ///
  648. /// \param __a
  649. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  650. /// compared to the lower double-precision value of \a __b.
  651. /// \param __b
  652. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  653. /// compared to the lower double-precision value of \a __a.
  654. /// \returns A 128-bit vector. The lower 64 bits contains the comparison
  655. /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
  656. static __inline__ __m128d __DEFAULT_FN_ATTRS
  657. _mm_cmpeq_sd(__m128d __a, __m128d __b)
  658. {
  659. return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
  660. }
  661. /// \brief Compares the lower double-precision floating-point values in each of
  662. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  663. /// the value in the first parameter is less than the corresponding value in
  664. /// the second parameter.
  665. ///
  666. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  667. ///
  668. /// \headerfile <x86intrin.h>
  669. ///
  670. /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
  671. ///
  672. /// \param __a
  673. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  674. /// compared to the lower double-precision value of \a __b.
  675. /// \param __b
  676. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  677. /// compared to the lower double-precision value of \a __a.
  678. /// \returns A 128-bit vector. The lower 64 bits contains the comparison
  679. /// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
  680. static __inline__ __m128d __DEFAULT_FN_ATTRS
  681. _mm_cmplt_sd(__m128d __a, __m128d __b)
  682. {
  683. return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
  684. }
  685. /// \brief Compares the lower double-precision floating-point values in each of
  686. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  687. /// the value in the first parameter is less than or equal to the
  688. /// corresponding value in the second parameter.
  689. ///
  690. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  691. ///
  692. /// \headerfile <x86intrin.h>
  693. ///
  694. /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
  695. ///
  696. /// \param __a
  697. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  698. /// compared to the lower double-precision value of \a __b.
  699. /// \param __b
  700. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  701. /// compared to the lower double-precision value of \a __a.
  702. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  703. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  704. static __inline__ __m128d __DEFAULT_FN_ATTRS
  705. _mm_cmple_sd(__m128d __a, __m128d __b)
  706. {
  707. return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
  708. }
  709. /// \brief Compares the lower double-precision floating-point values in each of
  710. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  711. /// the value in the first parameter is greater than the corresponding value
  712. /// in the second parameter.
  713. ///
  714. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  715. ///
  716. /// \headerfile <x86intrin.h>
  717. ///
  718. /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
  719. ///
  720. /// \param __a
  721. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  722. /// compared to the lower double-precision value of \a __b.
  723. /// \param __b
  724. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  725. /// compared to the lower double-precision value of \a __a.
  726. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  727. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  728. static __inline__ __m128d __DEFAULT_FN_ATTRS
  729. _mm_cmpgt_sd(__m128d __a, __m128d __b)
  730. {
  731. __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); /* a > b is evaluated as b < a */
  732. return __extension__ (__m128d) { __c[0], __a[1] }; /* operands were swapped: take the upper element from __a */
  733. }
  734. /// \brief Compares the lower double-precision floating-point values in each of
  735. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  736. /// the value in the first parameter is greater than or equal to the
  737. /// corresponding value in the second parameter.
  738. ///
  739. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  740. ///
  741. /// \headerfile <x86intrin.h>
  742. ///
  743. /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
  744. ///
  745. /// \param __a
  746. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  747. /// compared to the lower double-precision value of \a __b.
  748. /// \param __b
  749. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  750. /// compared to the lower double-precision value of \a __a.
  751. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  752. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  753. static __inline__ __m128d __DEFAULT_FN_ATTRS
  754. _mm_cmpge_sd(__m128d __a, __m128d __b)
  755. {
  756. __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); /* a >= b is evaluated as b <= a */
  757. return __extension__ (__m128d) { __c[0], __a[1] }; /* operands were swapped: take the upper element from __a */
  758. }
  759. /// \brief Compares the lower double-precision floating-point values in each of
  760. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  761. /// the value in the first parameter is "ordered" with respect to the
  762. /// corresponding value in the second parameter.
  763. ///
  764. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
  765. /// double-precision values is "ordered" with respect to each other if
  766. /// neither value is a NaN.
  767. ///
  768. /// \headerfile <x86intrin.h>
  769. ///
  770. /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
  771. ///
  772. /// \param __a
  773. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  774. /// compared to the lower double-precision value of \a __b.
  775. /// \param __b
  776. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  777. /// compared to the lower double-precision value of \a __a.
  778. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  779. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  780. static __inline__ __m128d __DEFAULT_FN_ATTRS
  781. _mm_cmpord_sd(__m128d __a, __m128d __b)
  782. {
  783. return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
  784. }
  785. /// \brief Compares the lower double-precision floating-point values in each of
  786. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  787. /// the value in the first parameter is "unordered" with respect to the
  788. /// corresponding value in the second parameter.
  789. ///
  790. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
  791. /// double-precision values is "unordered" with respect to each other if one
  792. /// or both values are NaN.
  793. ///
  794. /// \headerfile <x86intrin.h>
  795. ///
  796. /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
  797. /// instruction.
  798. ///
  799. /// \param __a
  800. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  801. /// compared to the lower double-precision value of \a __b.
  802. /// \param __b
  803. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  804. /// compared to the lower double-precision value of \a __a.
  805. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  806. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  807. static __inline__ __m128d __DEFAULT_FN_ATTRS
  808. _mm_cmpunord_sd(__m128d __a, __m128d __b)
  809. {
  810. return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
  811. }
  812. /// \brief Compares the lower double-precision floating-point values in each of
  813. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  814. /// the value in the first parameter is unequal to the corresponding value in
  815. /// the second parameter.
  816. ///
  817. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  818. ///
  819. /// \headerfile <x86intrin.h>
  820. ///
  821. /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
  822. ///
  823. /// \param __a
  824. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  825. /// compared to the lower double-precision value of \a __b.
  826. /// \param __b
  827. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  828. /// compared to the lower double-precision value of \a __a.
  829. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  830. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  831. static __inline__ __m128d __DEFAULT_FN_ATTRS
  832. _mm_cmpneq_sd(__m128d __a, __m128d __b)
  833. {
  834. return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
  835. }
  836. /// \brief Compares the lower double-precision floating-point values in each of
  837. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  838. /// the value in the first parameter is not less than the corresponding
  839. /// value in the second parameter.
  840. ///
  841. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  842. ///
  843. /// \headerfile <x86intrin.h>
  844. ///
  845. /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
  846. ///
  847. /// \param __a
  848. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  849. /// compared to the lower double-precision value of \a __b.
  850. /// \param __b
  851. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  852. /// compared to the lower double-precision value of \a __a.
  853. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  854. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  855. static __inline__ __m128d __DEFAULT_FN_ATTRS
  856. _mm_cmpnlt_sd(__m128d __a, __m128d __b)
  857. {
  858. return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
  859. }
  860. /// \brief Compares the lower double-precision floating-point values in each of
  861. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  862. /// the value in the first parameter is not less than or equal to the
  863. /// corresponding value in the second parameter.
  864. ///
  865. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  866. ///
  867. /// \headerfile <x86intrin.h>
  868. ///
  869. /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
  870. ///
  871. /// \param __a
  872. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  873. /// compared to the lower double-precision value of \a __b.
  874. /// \param __b
  875. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  876. /// compared to the lower double-precision value of \a __a.
  877. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  878. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  879. static __inline__ __m128d __DEFAULT_FN_ATTRS
  880. _mm_cmpnle_sd(__m128d __a, __m128d __b)
  881. {
  882. return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
  883. }
  884. /// \brief Compares the lower double-precision floating-point values in each of
  885. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  886. /// the value in the first parameter is not greater than the corresponding
  887. /// value in the second parameter.
  888. ///
  889. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  890. ///
  891. /// \headerfile <x86intrin.h>
  892. ///
  893. /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
  894. ///
  895. /// \param __a
  896. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  897. /// compared to the lower double-precision value of \a __b.
  898. /// \param __b
  899. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  900. /// compared to the lower double-precision value of \a __a.
  901. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  902. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  903. static __inline__ __m128d __DEFAULT_FN_ATTRS
  904. _mm_cmpngt_sd(__m128d __a, __m128d __b)
  905. {
  906. __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); /* !(a > b) is evaluated as !(b < a) */
  907. return __extension__ (__m128d) { __c[0], __a[1] }; /* operands were swapped: take the upper element from __a */
  908. }
  909. /// \brief Compares the lower double-precision floating-point values in each of
  910. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  911. /// the value in the first parameter is not greater than or equal to the
  912. /// corresponding value in the second parameter.
  913. ///
  914. /// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
  915. ///
  916. /// \headerfile <x86intrin.h>
  917. ///
  918. /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
  919. ///
  920. /// \param __a
  921. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  922. /// compared to the lower double-precision value of \a __b.
  923. /// \param __b
  924. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  925. /// compared to the lower double-precision value of \a __a.
  926. /// \returns A 128-bit vector. The lower 64 bits contain the comparison
  927. /// result. The upper 64 bits are copied from the upper 64 bits of \a __a.
  928. static __inline__ __m128d __DEFAULT_FN_ATTRS
  929. _mm_cmpnge_sd(__m128d __a, __m128d __b)
  930. {
  931. __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); /* !(a >= b) is evaluated as !(b <= a) */
  932. return __extension__ (__m128d) { __c[0], __a[1] }; /* operands were swapped: take the upper element from __a */
  933. }
  934. /// \brief Compares the lower double-precision floating-point values in each of
  935. /// the two 128-bit floating-point vectors of [2 x double] for equality.
  936. ///
  937. /// The comparison yields 0 for false, 1 for true. If either of the two
  938. /// lower double-precision values is NaN, 0 is returned.
  939. ///
  940. /// \headerfile <x86intrin.h>
  941. ///
  942. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  943. ///
  944. /// \param __a
  945. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  946. /// compared to the lower double-precision value of \a __b.
  947. /// \param __b
  948. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  949. /// compared to the lower double-precision value of \a __a.
  950. /// \returns An integer (0 or 1) containing the comparison result. If either of
  951. /// the two lower double-precision values is NaN, 0 is returned.
  952. static __inline__ int __DEFAULT_FN_ATTRS
  953. _mm_comieq_sd(__m128d __a, __m128d __b)
  954. {
  955. return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
  956. }
  957. /// \brief Compares the lower double-precision floating-point values in each of
  958. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  959. /// the value in the first parameter is less than the corresponding value in
  960. /// the second parameter.
  961. ///
  962. /// The comparison yields 0 for false, 1 for true. If either of the two
  963. /// lower double-precision values is NaN, 0 is returned.
  964. ///
  965. /// \headerfile <x86intrin.h>
  966. ///
  967. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  968. ///
  969. /// \param __a
  970. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  971. /// compared to the lower double-precision value of \a __b.
  972. /// \param __b
  973. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  974. /// compared to the lower double-precision value of \a __a.
  975. /// \returns An integer (0 or 1) containing the comparison result. If either of
  976. /// the two lower double-precision values is NaN, 0 is returned.
  977. static __inline__ int __DEFAULT_FN_ATTRS
  978. _mm_comilt_sd(__m128d __a, __m128d __b)
  979. {
  980. return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
  981. }
  982. /// \brief Compares the lower double-precision floating-point values in each of
  983. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  984. /// the value in the first parameter is less than or equal to the
  985. /// corresponding value in the second parameter.
  986. ///
  987. /// The comparison yields 0 for false, 1 for true. If either of the two
  988. /// lower double-precision values is NaN, 0 is returned.
  989. ///
  990. /// \headerfile <x86intrin.h>
  991. ///
  992. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  993. ///
  994. /// \param __a
  995. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  996. /// compared to the lower double-precision value of \a __b.
  997. /// \param __b
  998. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  999. /// compared to the lower double-precision value of \a __a.
  1000. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1001. /// the two lower double-precision values is NaN, 0 is returned.
  1002. static __inline__ int __DEFAULT_FN_ATTRS
  1003. _mm_comile_sd(__m128d __a, __m128d __b)
  1004. {
  1005. return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
  1006. }
  1007. /// \brief Compares the lower double-precision floating-point values in each of
  1008. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1009. /// the value in the first parameter is greater than the corresponding value
  1010. /// in the second parameter.
  1011. ///
  1012. /// The comparison yields 0 for false, 1 for true. If either of the two
  1013. /// lower double-precision values is NaN, 0 is returned.
  1014. ///
  1015. /// \headerfile <x86intrin.h>
  1016. ///
  1017. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1018. ///
  1019. /// \param __a
  1020. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1021. /// compared to the lower double-precision value of \a __b.
  1022. /// \param __b
  1023. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1024. /// compared to the lower double-precision value of \a __a.
  1025. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1026. /// the two lower double-precision values is NaN, 0 is returned.
  1027. static __inline__ int __DEFAULT_FN_ATTRS
  1028. _mm_comigt_sd(__m128d __a, __m128d __b)
  1029. {
  1030. return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
  1031. }
  1032. /// \brief Compares the lower double-precision floating-point values in each of
  1033. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1034. /// the value in the first parameter is greater than or equal to the
  1035. /// corresponding value in the second parameter.
  1036. ///
  1037. /// The comparison yields 0 for false, 1 for true. If either of the two
  1038. /// lower double-precision values is NaN, 0 is returned.
  1039. ///
  1040. /// \headerfile <x86intrin.h>
  1041. ///
  1042. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1043. ///
  1044. /// \param __a
  1045. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1046. /// compared to the lower double-precision value of \a __b.
  1047. /// \param __b
  1048. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1049. /// compared to the lower double-precision value of \a __a.
  1050. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1051. /// the two lower double-precision values is NaN, 0 is returned.
  1052. static __inline__ int __DEFAULT_FN_ATTRS
  1053. _mm_comige_sd(__m128d __a, __m128d __b)
  1054. {
  1055. return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
  1056. }
  1057. /// \brief Compares the lower double-precision floating-point values in each of
  1058. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1059. /// the value in the first parameter is unequal to the corresponding value in
  1060. /// the second parameter.
  1061. ///
  1062. /// The comparison yields 0 for false, 1 for true. If either of the two
  1063. /// lower double-precision values is NaN, 1 is returned.
  1064. ///
  1065. /// \headerfile <x86intrin.h>
  1066. ///
  1067. /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
  1068. ///
  1069. /// \param __a
  1070. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1071. /// compared to the lower double-precision value of \a __b.
  1072. /// \param __b
  1073. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1074. /// compared to the lower double-precision value of \a __a.
  1075. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1076. /// the two lower double-precision values is NaN, 1 is returned.
  1077. static __inline__ int __DEFAULT_FN_ATTRS
  1078. _mm_comineq_sd(__m128d __a, __m128d __b)
  1079. {
  1080. return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
  1081. }
  1082. /// \brief Compares the lower double-precision floating-point values in each of
  1083. /// the two 128-bit floating-point vectors of [2 x double] for equality. The
  1084. /// comparison yields 0 for false, 1 for true.
  1085. ///
  1086. /// If either of the two lower double-precision values is NaN, 0 is returned.
  1087. ///
  1088. /// \headerfile <x86intrin.h>
  1089. ///
  1090. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1091. ///
  1092. /// \param __a
  1093. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1094. /// compared to the lower double-precision value of \a __b.
  1095. /// \param __b
  1096. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1097. /// compared to the lower double-precision value of \a __a.
  1098. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1099. /// the two lower double-precision values is NaN, 0 is returned.
  1100. static __inline__ int __DEFAULT_FN_ATTRS
  1101. _mm_ucomieq_sd(__m128d __a, __m128d __b)
  1102. {
  1103. return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
  1104. }
  1105. /// \brief Compares the lower double-precision floating-point values in each of
  1106. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1107. /// the value in the first parameter is less than the corresponding value in
  1108. /// the second parameter.
  1109. ///
  1110. /// The comparison yields 0 for false, 1 for true. If either of the two lower
  1111. /// double-precision values is NaN, 0 is returned.
  1112. ///
  1113. /// \headerfile <x86intrin.h>
  1114. ///
  1115. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1116. ///
  1117. /// \param __a
  1118. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1119. /// compared to the lower double-precision value of \a __b.
  1120. /// \param __b
  1121. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1122. /// compared to the lower double-precision value of \a __a.
  1123. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1124. /// the two lower double-precision values is NaN, 0 is returned.
  1125. static __inline__ int __DEFAULT_FN_ATTRS
  1126. _mm_ucomilt_sd(__m128d __a, __m128d __b)
  1127. {
  1128. return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
  1129. }
  1130. /// \brief Compares the lower double-precision floating-point values in each of
  1131. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1132. /// the value in the first parameter is less than or equal to the
  1133. /// corresponding value in the second parameter.
  1134. ///
  1135. /// The comparison yields 0 for false, 1 for true. If either of the two lower
  1136. /// double-precision values is NaN, 0 is returned.
  1137. ///
  1138. /// \headerfile <x86intrin.h>
  1139. ///
  1140. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1141. ///
  1142. /// \param __a
  1143. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1144. /// compared to the lower double-precision value of \a __b.
  1145. /// \param __b
  1146. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1147. /// compared to the lower double-precision value of \a __a.
  1148. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1149. /// the two lower double-precision values is NaN, 0 is returned.
  1150. static __inline__ int __DEFAULT_FN_ATTRS
  1151. _mm_ucomile_sd(__m128d __a, __m128d __b)
  1152. {
  1153. return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
  1154. }
  1155. /// \brief Compares the lower double-precision floating-point values in each of
  1156. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1157. /// the value in the first parameter is greater than the corresponding value
  1158. /// in the second parameter.
  1159. ///
  1160. /// The comparison yields 0 for false, 1 for true. If either of the two lower
  1161. /// double-precision values is NaN, 0 is returned.
  1162. ///
  1163. /// \headerfile <x86intrin.h>
  1164. ///
  1165. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1166. ///
  1167. /// \param __a
  1168. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1169. /// compared to the lower double-precision value of \a __b.
  1170. /// \param __b
  1171. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1172. /// compared to the lower double-precision value of \a __a.
  1173. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1174. /// the two lower double-precision values is NaN, 0 is returned.
  1175. static __inline__ int __DEFAULT_FN_ATTRS
  1176. _mm_ucomigt_sd(__m128d __a, __m128d __b)
  1177. {
  1178. return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
  1179. }
  1180. /// \brief Compares the lower double-precision floating-point values in each of
  1181. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1182. /// the value in the first parameter is greater than or equal to the
  1183. /// corresponding value in the second parameter.
  1184. ///
  1185. /// The comparison yields 0 for false, 1 for true. If either of the two
  1186. /// lower double-precision values is NaN, 0 is returned.
  1187. ///
  1188. /// \headerfile <x86intrin.h>
  1189. ///
  1190. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1191. ///
  1192. /// \param __a
  1193. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1194. /// compared to the lower double-precision value of \a __b.
  1195. /// \param __b
  1196. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1197. /// compared to the lower double-precision value of \a __a.
  1198. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1199. /// the two lower double-precision values is NaN, 0 is returned.
  1200. static __inline__ int __DEFAULT_FN_ATTRS
  1201. _mm_ucomige_sd(__m128d __a, __m128d __b)
  1202. {
  1203. return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
  1204. }
  1205. /// \brief Compares the lower double-precision floating-point values in each of
  1206. /// the two 128-bit floating-point vectors of [2 x double] to determine if
  1207. /// the value in the first parameter is unequal to the corresponding value in
  1208. /// the second parameter.
  1209. ///
  1210. /// The comparison yields 0 for false, 1 for true. If either of the two lower
  1211. /// double-precision values is NaN, 1 is returned.
  1212. ///
  1213. /// \headerfile <x86intrin.h>
  1214. ///
  1215. /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
  1216. ///
  1217. /// \param __a
  1218. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1219. /// compared to the lower double-precision value of \a __b.
  1220. /// \param __b
  1221. /// A 128-bit vector of [2 x double]. The lower double-precision value is
  1222. /// compared to the lower double-precision value of \a __a.
  1223. /// \returns An integer (0 or 1) containing the comparison result. If either of
  1224. /// the two lower double-precision values is NaN, 1 is returned.
  1225. static __inline__ int __DEFAULT_FN_ATTRS
  1226. _mm_ucomineq_sd(__m128d __a, __m128d __b)
  1227. {
  1228. return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
  1229. }
  1230. /// \brief Converts the two double-precision floating-point elements of a
  1231. /// 128-bit vector of [2 x double] into two single-precision floating-point
  1232. /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
  1233. /// The upper 64 bits of the result vector are set to zero.
  1234. ///
  1235. /// \headerfile <x86intrin.h>
  1236. ///
  1237. /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
  1238. ///
  1239. /// \param __a
  1240. /// A 128-bit vector of [2 x double]. Both elements are converted.
  1241. /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
  1242. /// converted values. The upper 64 bits are set to zero.
  1243. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1244. _mm_cvtpd_ps(__m128d __a)
  1245. {
  1246. return __builtin_ia32_cvtpd2ps((__v2df)__a);
  1247. }
  1248. /// \brief Converts the lower two single-precision floating-point elements of a
  1249. /// 128-bit vector of [4 x float] into two double-precision floating-point
  1250. /// values, returned in a 128-bit vector of [2 x double]. The upper two
  1251. /// elements of the input vector are unused.
  1252. ///
  1253. /// \headerfile <x86intrin.h>
  1254. ///
  1255. /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
  1256. ///
  1257. /// \param __a
  1258. /// A 128-bit vector of [4 x float]. The lower two single-precision
  1259. /// floating-point elements are converted to double-precision values. The
  1260. /// upper two elements are unused.
  1261. /// \returns A 128-bit vector of [2 x double] containing the converted values.
  1262. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1263. _mm_cvtps_pd(__m128 __a)
  1264. {
  1265. return (__m128d) __builtin_convertvector(
  1266. __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); /* select elements 0 and 1, then convert them to double */
  1267. }
  1268. /// \brief Converts the lower two integer elements of a 128-bit vector of
  1269. /// [4 x i32] into two double-precision floating-point values, returned in a
  1270. /// 128-bit vector of [2 x double].
  1271. ///
  1272. /// The upper two elements of the input vector are unused.
  1273. ///
  1274. /// \headerfile <x86intrin.h>
  1275. ///
  1276. /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
  1277. ///
  1278. /// \param __a
  1279. /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
  1280. /// converted to double-precision values.
  1281. ///
  1282. /// The upper two elements are unused.
  1283. /// \returns A 128-bit vector of [2 x double] containing the converted values.
  1284. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1285. _mm_cvtepi32_pd(__m128i __a)
  1286. {
  1287. return (__m128d) __builtin_convertvector(
  1288. __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); /* select elements 0 and 1, then convert them to double */
  1289. }
  1290. /// \brief Converts the two double-precision floating-point elements of a
  1291. /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
  1292. /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
  1293. /// 64 bits of the result vector are set to zero.
  1294. ///
  1295. /// \headerfile <x86intrin.h>
  1296. ///
  1297. /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
  1298. ///
  1299. /// \param __a
  1300. /// A 128-bit vector of [2 x double]. Both elements are converted.
  1301. /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
  1302. /// converted values. The upper 64 bits are set to zero.
  1303. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1304. _mm_cvtpd_epi32(__m128d __a)
  1305. {
  1306. return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)__a); /* explicit cast: do not rely on lax vector conversions */
  1307. }
  1308. /// \brief Converts the low-order element of a 128-bit vector of [2 x double]
  1309. /// into a 32-bit signed integer value.
  1310. ///
  1311. /// \headerfile <x86intrin.h>
  1312. ///
  1313. /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
  1314. ///
  1315. /// \param __a
  1316. /// A 128-bit vector of [2 x double]. Only the lower 64 bits are used in the
  1317. /// conversion.
  1318. /// \returns A 32-bit signed integer containing the converted value.
  1319. static __inline__ int __DEFAULT_FN_ATTRS
  1320. _mm_cvtsd_si32(__m128d __a)
  1321. {
  1322. return __builtin_ia32_cvtsd2si((__v2df)__a);
  1323. }
  1324. /// \brief Converts the lower double-precision floating-point element of a
  1325. /// 128-bit vector of [2 x double], in the second parameter, into a
  1326. /// single-precision floating-point value, returned in the lower 32 bits of a
  1327. /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
  1328. /// copied from the upper 96 bits of the first parameter.
  1329. ///
  1330. /// \headerfile <x86intrin.h>
  1331. ///
  1332. /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
  1333. ///
  1334. /// \param __a
  1335. /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
  1336. /// copied to the upper 96 bits of the result.
  1337. /// \param __b
  1338. /// A 128-bit vector of [2 x double]. The lower double-precision
  1339. /// floating-point element is used in the conversion.
  1340. /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
  1341. /// value converted from the lower element of \a __b. The upper 96 bits are
  1342. /// copied from the upper 96 bits of \a __a.
  1343. static __inline__ __m128 __DEFAULT_FN_ATTRS
  1344. _mm_cvtsd_ss(__m128 __a, __m128d __b)
  1345. {
  1346. return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
  1347. }
  1348. /// \brief Converts a 32-bit signed integer value, in the second parameter, into
  1349. /// a double-precision floating-point value, returned in the lower 64 bits of
  1350. /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
  1351. /// are copied from the upper 64 bits of the first parameter.
  1352. ///
  1353. /// \headerfile <x86intrin.h>
  1354. ///
  1355. /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
  1356. ///
  1357. /// \param __a
  1358. /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
  1359. /// copied to the upper 64 bits of the result.
  1360. /// \param __b
  1361. /// A 32-bit signed integer containing the value to be converted.
  1362. /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
  1363. /// converted value from the second parameter. The upper 64 bits are copied
  1364. /// from the upper 64 bits of the first parameter.
  1365. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1366. _mm_cvtsi32_sd(__m128d __a, int __b)
  1367. {
  1368. __a[0] = __b;
  1369. return __a;
  1370. }
  1371. /// \brief Converts the lower single-precision floating-point element of a
  1372. /// 128-bit vector of [4 x float], in the second parameter, into a
  1373. /// double-precision floating-point value, returned in the lower 64 bits of
  1374. /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
  1375. /// are copied from the upper 64 bits of the first parameter.
  1376. ///
  1377. /// \headerfile <x86intrin.h>
  1378. ///
  1379. /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
  1380. ///
  1381. /// \param __a
  1382. /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
  1383. /// copied to the upper 64 bits of the result.
  1384. /// \param __b
  1385. /// A 128-bit vector of [4 x float]. The lower single-precision
  1386. /// floating-point element is used in the conversion.
  1387. /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
  1388. /// converted value from the second parameter. The upper 64 bits are copied
  1389. /// from the upper 64 bits of the first parameter.
  1390. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1391. _mm_cvtss_sd(__m128d __a, __m128 __b)
  1392. {
  1393. __a[0] = __b[0];
  1394. return __a;
  1395. }
  1396. /// \brief Converts the two double-precision floating-point elements of a
  1397. /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
  1398. /// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
  1399. ///
  1400. /// If the result of either conversion is inexact, the result is truncated
  1401. /// (rounded towards zero) regardless of the current MXCSR setting. The upper
  1402. /// 64 bits of the result vector are set to zero.
  1403. ///
  1404. /// \headerfile <x86intrin.h>
  1405. ///
  1406. /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
  1407. /// instruction.
  1408. ///
  1409. /// \param __a
  1410. /// A 128-bit vector of [2 x double].
  1411. /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
  1412. /// converted values. The upper 64 bits are set to zero.
  1413. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1414. _mm_cvttpd_epi32(__m128d __a)
  1415. {
  1416. return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
  1417. }
  1418. /// \brief Converts the low-order element of a [2 x double] vector into a 32-bit
  1419. /// signed integer value, truncating the result when it is inexact.
  1420. ///
  1421. /// \headerfile <x86intrin.h>
  1422. ///
  1423. /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
  1424. /// instruction.
  1425. ///
  1426. /// \param __a
  1427. /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
  1428. /// conversion.
  1429. /// \returns A 32-bit signed integer containing the converted value.
  1430. static __inline__ int __DEFAULT_FN_ATTRS
  1431. _mm_cvttsd_si32(__m128d __a)
  1432. {
  1433. return __builtin_ia32_cvttsd2si((__v2df)__a);
  1434. }
  1435. /// \brief Converts the two double-precision floating-point elements of a
  1436. /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
  1437. /// returned in a 64-bit vector of [2 x i32].
  1438. ///
  1439. /// \headerfile <x86intrin.h>
  1440. ///
  1441. /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
  1442. ///
  1443. /// \param __a
  1444. /// A 128-bit vector of [2 x double].
  1445. /// \returns A 64-bit vector of [2 x i32] containing the converted values.
  1446. static __inline__ __m64 __DEFAULT_FN_ATTRS
  1447. _mm_cvtpd_pi32(__m128d __a)
  1448. {
  1449. return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
  1450. }
  1451. /// \brief Converts the two double-precision floating-point elements of a
  1452. /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
  1453. /// returned in a 64-bit vector of [2 x i32].
  1454. ///
  1455. /// If the result of either conversion is inexact, the result is truncated
  1456. /// (rounded towards zero) regardless of the current MXCSR setting.
  1457. ///
  1458. /// \headerfile <x86intrin.h>
  1459. ///
  1460. /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
  1461. ///
  1462. /// \param __a
  1463. /// A 128-bit vector of [2 x double].
  1464. /// \returns A 64-bit vector of [2 x i32] containing the converted values.
  1465. static __inline__ __m64 __DEFAULT_FN_ATTRS
  1466. _mm_cvttpd_pi32(__m128d __a)
  1467. {
  1468. return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
  1469. }
  1470. /// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of
  1471. /// [2 x i32] into two double-precision floating-point values, returned in a
  1472. /// 128-bit vector of [2 x double].
  1473. ///
  1474. /// \headerfile <x86intrin.h>
  1475. ///
  1476. /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
  1477. ///
  1478. /// \param __a
  1479. /// A 64-bit vector of [2 x i32].
  1480. /// \returns A 128-bit vector of [2 x double] containing the converted values.
  1481. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1482. _mm_cvtpi32_pd(__m64 __a)
  1483. {
  1484. return __builtin_ia32_cvtpi2pd((__v2si)__a);
  1485. }
  1486. /// \brief Returns the low-order element of a 128-bit vector of [2 x double] as
  1487. /// a double-precision floating-point value.
  1488. ///
  1489. /// \headerfile <x86intrin.h>
  1490. ///
  1491. /// This intrinsic has no corresponding instruction.
  1492. ///
  1493. /// \param __a
  1494. /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
  1495. /// \returns A double-precision floating-point value copied from the lower 64
  1496. /// bits of \a __a.
  1497. static __inline__ double __DEFAULT_FN_ATTRS
  1498. _mm_cvtsd_f64(__m128d __a)
  1499. {
  1500. return __a[0];
  1501. }
  1502. /// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned
  1503. /// memory location.
  1504. ///
  1505. /// \headerfile <x86intrin.h>
  1506. ///
  1507. /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
  1508. ///
  1509. /// \param __dp
  1510. /// A pointer to a 128-bit memory location. The address of the memory
  1511. /// location has to be 16-byte aligned.
  1512. /// \returns A 128-bit vector of [2 x double] containing the loaded values.
  1513. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1514. _mm_load_pd(double const *__dp)
  1515. {
  1516. return *(__m128d*)__dp;
  1517. }
  1518. /// \brief Loads a double-precision floating-point value from a specified memory
  1519. /// location and duplicates it to both vector elements of a 128-bit vector of
  1520. /// [2 x double].
  1521. ///
  1522. /// \headerfile <x86intrin.h>
  1523. ///
  1524. /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
  1525. ///
  1526. /// \param __dp
  1527. /// A pointer to a memory location containing a double-precision value.
  1528. /// \returns A 128-bit vector of [2 x double] containing the loaded and
  1529. /// duplicated values.
  1530. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1531. _mm_load1_pd(double const *__dp)
  1532. {
  1533. struct __mm_load1_pd_struct {
  1534. double __u;
  1535. } __attribute__((__packed__, __may_alias__));
  1536. double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
  1537. return (__m128d){ __u, __u };
  1538. }
  1539. #define _mm_load_pd1(dp) _mm_load1_pd(dp)
  1540. /// \brief Loads two double-precision values, in reverse order, from an aligned
  1541. /// memory location into a 128-bit vector of [2 x double].
  1542. ///
  1543. /// \headerfile <x86intrin.h>
  1544. ///
  1545. /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
  1546. /// needed shuffling instructions. In AVX mode, the shuffling may be combined
  1547. /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
  1548. ///
  1549. /// \param __dp
  1550. /// A 16-byte aligned pointer to an array of double-precision values to be
  1551. /// loaded in reverse order.
  1552. /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
  1553. /// values.
  1554. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1555. _mm_loadr_pd(double const *__dp)
  1556. {
  1557. __m128d __u = *(__m128d*)__dp;
  1558. return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
  1559. }
  1560. /// \brief Loads a 128-bit floating-point vector of [2 x double] from an
  1561. /// unaligned memory location.
  1562. ///
  1563. /// \headerfile <x86intrin.h>
  1564. ///
  1565. /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
  1566. ///
  1567. /// \param __dp
  1568. /// A pointer to a 128-bit memory location. The address of the memory
  1569. /// location does not have to be aligned.
  1570. /// \returns A 128-bit vector of [2 x double] containing the loaded values.
  1571. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1572. _mm_loadu_pd(double const *__dp)
  1573. {
  1574. struct __loadu_pd {
  1575. __m128d __v;
  1576. } __attribute__((__packed__, __may_alias__));
  1577. return ((struct __loadu_pd*)__dp)->__v;
  1578. }
  1579. /// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
  1580. /// vector and clears the upper element.
  1581. ///
  1582. /// \headerfile <x86intrin.h>
  1583. ///
  1584. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  1585. ///
  1586. /// \param __a
  1587. /// A pointer to a 64-bit memory location. The address of the memory
  1588. /// location does not have to be aligned.
  1589. /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
  1590. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1591. _mm_loadu_si64(void const *__a)
  1592. {
  1593. struct __loadu_si64 {
  1594. long long __v;
  1595. } __attribute__((__packed__, __may_alias__));
  1596. long long __u = ((struct __loadu_si64*)__a)->__v;
  1597. return (__m128i){__u, 0L};
  1598. }
  1599. /// \brief Loads a 64-bit double-precision value to the low element of a
  1600. /// 128-bit integer vector and clears the upper element.
  1601. ///
  1602. /// \headerfile <x86intrin.h>
  1603. ///
  1604. /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
  1605. ///
  1606. /// \param __dp
  1607. /// A pointer to a memory location containing a double-precision value.
  1608. /// The address of the memory location does not have to be aligned.
  1609. /// \returns A 128-bit vector of [2 x double] containing the loaded value.
  1610. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1611. _mm_load_sd(double const *__dp)
  1612. {
  1613. struct __mm_load_sd_struct {
  1614. double __u;
  1615. } __attribute__((__packed__, __may_alias__));
  1616. double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
  1617. return (__m128d){ __u, 0 };
  1618. }
  1619. /// \brief Loads a double-precision value into the high-order bits of a 128-bit
  1620. /// vector of [2 x double]. The low-order bits are copied from the low-order
  1621. /// bits of the first operand.
  1622. ///
  1623. /// \headerfile <x86intrin.h>
  1624. ///
  1625. /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
  1626. ///
  1627. /// \param __a
  1628. /// A 128-bit vector of [2 x double]. \n
  1629. /// Bits [63:0] are written to bits [63:0] of the result.
  1630. /// \param __dp
  1631. /// A pointer to a 64-bit memory location containing a double-precision
  1632. /// floating-point value that is loaded. The loaded value is written to bits
  1633. /// [127:64] of the result. The address of the memory location does not have
  1634. /// to be aligned.
  1635. /// \returns A 128-bit vector of [2 x double] containing the moved values.
  1636. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1637. _mm_loadh_pd(__m128d __a, double const *__dp)
  1638. {
  1639. struct __mm_loadh_pd_struct {
  1640. double __u;
  1641. } __attribute__((__packed__, __may_alias__));
  1642. double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
  1643. return (__m128d){ __a[0], __u };
  1644. }
  1645. /// \brief Loads a double-precision value into the low-order bits of a 128-bit
  1646. /// vector of [2 x double]. The high-order bits are copied from the
  1647. /// high-order bits of the first operand.
  1648. ///
  1649. /// \headerfile <x86intrin.h>
  1650. ///
  1651. /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
  1652. ///
  1653. /// \param __a
  1654. /// A 128-bit vector of [2 x double]. \n
  1655. /// Bits [127:64] are written to bits [127:64] of the result.
  1656. /// \param __dp
  1657. /// A pointer to a 64-bit memory location containing a double-precision
  1658. /// floating-point value that is loaded. The loaded value is written to bits
  1659. /// [63:0] of the result. The address of the memory location does not have to
  1660. /// be aligned.
  1661. /// \returns A 128-bit vector of [2 x double] containing the moved values.
  1662. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1663. _mm_loadl_pd(__m128d __a, double const *__dp)
  1664. {
  1665. struct __mm_loadl_pd_struct {
  1666. double __u;
  1667. } __attribute__((__packed__, __may_alias__));
  1668. double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
  1669. return (__m128d){ __u, __a[1] };
  1670. }
  1671. /// \brief Constructs a 128-bit floating-point vector of [2 x double] with
  1672. /// unspecified content. This could be used as an argument to another
  1673. /// intrinsic function where the argument is required but the value is not
  1674. /// actually used.
  1675. ///
  1676. /// \headerfile <x86intrin.h>
  1677. ///
  1678. /// This intrinsic has no corresponding instruction.
  1679. ///
  1680. /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
  1681. /// content.
  1682. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1683. _mm_undefined_pd(void)
  1684. {
  1685. return (__m128d)__builtin_ia32_undef128();
  1686. }
  1687. /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
  1688. /// 64 bits of the vector are initialized with the specified double-precision
  1689. /// floating-point value. The upper 64 bits are set to zero.
  1690. ///
  1691. /// \headerfile <x86intrin.h>
  1692. ///
  1693. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  1694. ///
  1695. /// \param __w
  1696. /// A double-precision floating-point value used to initialize the lower 64
  1697. /// bits of the result.
  1698. /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
  1699. /// lower 64 bits contain the value of the parameter. The upper 64 bits are
  1700. /// set to zero.
  1701. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1702. _mm_set_sd(double __w)
  1703. {
  1704. return (__m128d){ __w, 0 };
  1705. }
  1706. /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
  1707. /// of the two double-precision floating-point vector elements set to the
  1708. /// specified double-precision floating-point value.
  1709. ///
  1710. /// \headerfile <x86intrin.h>
  1711. ///
  1712. /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
  1713. ///
  1714. /// \param __w
  1715. /// A double-precision floating-point value used to initialize each vector
  1716. /// element of the result.
  1717. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1718. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1719. _mm_set1_pd(double __w)
  1720. {
  1721. return (__m128d){ __w, __w };
  1722. }
  1723. /// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
  1724. /// of the two double-precision floating-point vector elements set to the
  1725. /// specified double-precision floating-point value.
  1726. ///
  1727. /// \headerfile <x86intrin.h>
  1728. ///
  1729. /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
  1730. ///
  1731. /// \param __w
  1732. /// A double-precision floating-point value used to initialize each vector
  1733. /// element of the result.
  1734. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1735. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1736. _mm_set_pd1(double __w)
  1737. {
  1738. return _mm_set1_pd(__w);
  1739. }
  1740. /// \brief Constructs a 128-bit floating-point vector of [2 x double]
  1741. /// initialized with the specified double-precision floating-point values.
  1742. ///
  1743. /// \headerfile <x86intrin.h>
  1744. ///
  1745. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  1746. ///
  1747. /// \param __w
  1748. /// A double-precision floating-point value used to initialize the upper 64
  1749. /// bits of the result.
  1750. /// \param __x
  1751. /// A double-precision floating-point value used to initialize the lower 64
  1752. /// bits of the result.
  1753. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1754. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1755. _mm_set_pd(double __w, double __x)
  1756. {
  1757. return (__m128d){ __x, __w };
  1758. }
  1759. /// \brief Constructs a 128-bit floating-point vector of [2 x double],
  1760. /// initialized in reverse order with the specified double-precision
  1761. /// floating-point values.
  1762. ///
  1763. /// \headerfile <x86intrin.h>
  1764. ///
  1765. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  1766. ///
  1767. /// \param __w
  1768. /// A double-precision floating-point value used to initialize the lower 64
  1769. /// bits of the result.
  1770. /// \param __x
  1771. /// A double-precision floating-point value used to initialize the upper 64
  1772. /// bits of the result.
  1773. /// \returns An initialized 128-bit floating-point vector of [2 x double].
  1774. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1775. _mm_setr_pd(double __w, double __x)
  1776. {
  1777. return (__m128d){ __w, __x };
  1778. }
  1779. /// \brief Constructs a 128-bit floating-point vector of [2 x double]
  1780. /// initialized to zero.
  1781. ///
  1782. /// \headerfile <x86intrin.h>
  1783. ///
  1784. /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
  1785. ///
  1786. /// \returns An initialized 128-bit floating-point vector of [2 x double] with
  1787. /// all elements set to zero.
  1788. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1789. _mm_setzero_pd(void)
  1790. {
  1791. return (__m128d){ 0, 0 };
  1792. }
  1793. /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
  1794. /// 64 bits are set to the lower 64 bits of the second parameter. The upper
  1795. /// 64 bits are set to the upper 64 bits of the first parameter.
  1796. ///
  1797. /// \headerfile <x86intrin.h>
  1798. ///
  1799. /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
  1800. ///
  1801. /// \param __a
  1802. /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
  1803. /// upper 64 bits of the result.
  1804. /// \param __b
  1805. /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
  1806. /// lower 64 bits of the result.
  1807. /// \returns A 128-bit vector of [2 x double] containing the moved values.
  1808. static __inline__ __m128d __DEFAULT_FN_ATTRS
  1809. _mm_move_sd(__m128d __a, __m128d __b)
  1810. {
  1811. return (__m128d){ __b[0], __a[1] };
  1812. }
  1813. /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
  1814. /// memory location.
  1815. ///
  1816. /// \headerfile <x86intrin.h>
  1817. ///
  1818. /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
  1819. ///
  1820. /// \param __dp
  1821. /// A pointer to a 64-bit memory location.
  1822. /// \param __a
  1823. /// A 128-bit vector of [2 x double] containing the value to be stored.
  1824. static __inline__ void __DEFAULT_FN_ATTRS
  1825. _mm_store_sd(double *__dp, __m128d __a)
  1826. {
  1827. struct __mm_store_sd_struct {
  1828. double __u;
  1829. } __attribute__((__packed__, __may_alias__));
  1830. ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
  1831. }
  1832. /// \brief Moves packed double-precision values from a 128-bit vector of
  1833. /// [2 x double] to a memory location.
  1834. ///
  1835. /// \headerfile <x86intrin.h>
  1836. ///
  1837. /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
  1838. ///
  1839. /// \param __dp
  1840. /// A pointer to an aligned memory location that can store two
  1841. /// double-precision values.
  1842. /// \param __a
  1843. /// A packed 128-bit vector of [2 x double] containing the values to be
  1844. /// moved.
  1845. static __inline__ void __DEFAULT_FN_ATTRS
  1846. _mm_store_pd(double *__dp, __m128d __a)
  1847. {
  1848. *(__m128d*)__dp = __a;
  1849. }
  1850. /// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
  1851. /// the upper and lower 64 bits of a memory location.
  1852. ///
  1853. /// \headerfile <x86intrin.h>
  1854. ///
  1855. /// This intrinsic corresponds to the
  1856. /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
  1857. ///
  1858. /// \param __dp
  1859. /// A pointer to a memory location that can store two double-precision
  1860. /// values.
  1861. /// \param __a
  1862. /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
  1863. /// of the values in \a __dp.
  1864. static __inline__ void __DEFAULT_FN_ATTRS
  1865. _mm_store1_pd(double *__dp, __m128d __a)
  1866. {
  1867. __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
  1868. _mm_store_pd(__dp, __a);
  1869. }
  1870. /// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
  1871. /// the upper and lower 64 bits of a memory location.
  1872. ///
  1873. /// \headerfile <x86intrin.h>
  1874. ///
  1875. /// This intrinsic corresponds to the
  1876. /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
  1877. ///
  1878. /// \param __dp
  1879. /// A pointer to a memory location that can store two double-precision
  1880. /// values.
  1881. /// \param __a
  1882. /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
  1883. /// of the values in \a __dp.
  1884. static __inline__ void __DEFAULT_FN_ATTRS
  1885. _mm_store_pd1(double *__dp, __m128d __a)
  1886. {
  1887. return _mm_store1_pd(__dp, __a);
  1888. }
  1889. /// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory
  1890. /// location.
  1891. ///
  1892. /// \headerfile <x86intrin.h>
  1893. ///
  1894. /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
  1895. ///
  1896. /// \param __dp
  1897. /// A pointer to a 128-bit memory location. The address of the memory
  1898. /// location does not have to be aligned.
  1899. /// \param __a
  1900. /// A 128-bit vector of [2 x double] containing the values to be stored.
  1901. static __inline__ void __DEFAULT_FN_ATTRS
  1902. _mm_storeu_pd(double *__dp, __m128d __a)
  1903. {
  1904. struct __storeu_pd {
  1905. __m128d __v;
  1906. } __attribute__((__packed__, __may_alias__));
  1907. ((struct __storeu_pd*)__dp)->__v = __a;
  1908. }
  1909. /// \brief Stores two double-precision values, in reverse order, from a 128-bit
  1910. /// vector of [2 x double] to a 16-byte aligned memory location.
  1911. ///
  1912. /// \headerfile <x86intrin.h>
  1913. ///
  1914. /// This intrinsic corresponds to a shuffling instruction followed by a
  1915. /// <c> VMOVAPD / MOVAPD </c> instruction.
  1916. ///
  1917. /// \param __dp
  1918. /// A pointer to a 16-byte aligned memory location that can store two
  1919. /// double-precision values.
  1920. /// \param __a
  1921. /// A 128-bit vector of [2 x double] containing the values to be reversed and
  1922. /// stored.
  1923. static __inline__ void __DEFAULT_FN_ATTRS
  1924. _mm_storer_pd(double *__dp, __m128d __a)
  1925. {
  1926. __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
  1927. *(__m128d *)__dp = __a;
  1928. }
  1929. /// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
  1930. /// memory location.
  1931. ///
  1932. /// \headerfile <x86intrin.h>
  1933. ///
  1934. /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
  1935. ///
  1936. /// \param __dp
  1937. /// A pointer to a 64-bit memory location.
  1938. /// \param __a
  1939. /// A 128-bit vector of [2 x double] containing the value to be stored.
  1940. static __inline__ void __DEFAULT_FN_ATTRS
  1941. _mm_storeh_pd(double *__dp, __m128d __a)
  1942. {
  1943. struct __mm_storeh_pd_struct {
  1944. double __u;
  1945. } __attribute__((__packed__, __may_alias__));
  1946. ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
  1947. }
  1948. /// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
  1949. /// memory location.
  1950. ///
  1951. /// \headerfile <x86intrin.h>
  1952. ///
  1953. /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
  1954. ///
  1955. /// \param __dp
  1956. /// A pointer to a 64-bit memory location.
  1957. /// \param __a
  1958. /// A 128-bit vector of [2 x double] containing the value to be stored.
  1959. static __inline__ void __DEFAULT_FN_ATTRS
  1960. _mm_storel_pd(double *__dp, __m128d __a)
  1961. {
  1962. struct __mm_storeh_pd_struct {
  1963. double __u;
  1964. } __attribute__((__packed__, __may_alias__));
  1965. ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
  1966. }
  1967. /// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
  1968. /// saving the lower 8 bits of each sum in the corresponding element of a
  1969. /// 128-bit result vector of [16 x i8].
  1970. ///
  1971. /// The integer elements of both parameters can be either signed or unsigned.
  1972. ///
  1973. /// \headerfile <x86intrin.h>
  1974. ///
  1975. /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
  1976. ///
  1977. /// \param __a
  1978. /// A 128-bit vector of [16 x i8].
  1979. /// \param __b
  1980. /// A 128-bit vector of [16 x i8].
  1981. /// \returns A 128-bit vector of [16 x i8] containing the sums of both
  1982. /// parameters.
  1983. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1984. _mm_add_epi8(__m128i __a, __m128i __b)
  1985. {
  1986. return (__m128i)((__v16qu)__a + (__v16qu)__b);
  1987. }
  1988. /// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
  1989. /// saving the lower 16 bits of each sum in the corresponding element of a
  1990. /// 128-bit result vector of [8 x i16].
  1991. ///
  1992. /// The integer elements of both parameters can be either signed or unsigned.
  1993. ///
  1994. /// \headerfile <x86intrin.h>
  1995. ///
  1996. /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
  1997. ///
  1998. /// \param __a
  1999. /// A 128-bit vector of [8 x i16].
  2000. /// \param __b
  2001. /// A 128-bit vector of [8 x i16].
  2002. /// \returns A 128-bit vector of [8 x i16] containing the sums of both
  2003. /// parameters.
  2004. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2005. _mm_add_epi16(__m128i __a, __m128i __b)
  2006. {
  2007. return (__m128i)((__v8hu)__a + (__v8hu)__b);
  2008. }
  2009. /// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
  2010. /// saving the lower 32 bits of each sum in the corresponding element of a
  2011. /// 128-bit result vector of [4 x i32].
  2012. ///
  2013. /// The integer elements of both parameters can be either signed or unsigned.
  2014. ///
  2015. /// \headerfile <x86intrin.h>
  2016. ///
  2017. /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
  2018. ///
  2019. /// \param __a
  2020. /// A 128-bit vector of [4 x i32].
  2021. /// \param __b
  2022. /// A 128-bit vector of [4 x i32].
  2023. /// \returns A 128-bit vector of [4 x i32] containing the sums of both
  2024. /// parameters.
  2025. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2026. _mm_add_epi32(__m128i __a, __m128i __b)
  2027. {
  2028. return (__m128i)((__v4su)__a + (__v4su)__b);
  2029. }
  2030. /// \brief Adds two signed or unsigned 64-bit integer values, returning the
  2031. /// lower 64 bits of the sum.
  2032. ///
  2033. /// \headerfile <x86intrin.h>
  2034. ///
  2035. /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
  2036. ///
  2037. /// \param __a
  2038. /// A 64-bit integer.
  2039. /// \param __b
  2040. /// A 64-bit integer.
  2041. /// \returns A 64-bit integer containing the sum of both parameters.
  2042. static __inline__ __m64 __DEFAULT_FN_ATTRS
  2043. _mm_add_si64(__m64 __a, __m64 __b)
  2044. {
  2045. return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
  2046. }
  2047. /// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
  2048. /// saving the lower 64 bits of each sum in the corresponding element of a
  2049. /// 128-bit result vector of [2 x i64].
  2050. ///
  2051. /// The integer elements of both parameters can be either signed or unsigned.
  2052. ///
  2053. /// \headerfile <x86intrin.h>
  2054. ///
  2055. /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
  2056. ///
  2057. /// \param __a
  2058. /// A 128-bit vector of [2 x i64].
  2059. /// \param __b
  2060. /// A 128-bit vector of [2 x i64].
  2061. /// \returns A 128-bit vector of [2 x i64] containing the sums of both
  2062. /// parameters.
  2063. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2064. _mm_add_epi64(__m128i __a, __m128i __b)
  2065. {
  2066. return (__m128i)((__v2du)__a + (__v2du)__b);
  2067. }
  2068. /// \brief Adds, with saturation, the corresponding elements of two 128-bit
  2069. /// signed [16 x i8] vectors, saving each sum in the corresponding element of
  2070. /// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are
  2071. /// saturated to 7Fh. Negative sums less than 80h are saturated to 80h.
  2072. ///
  2073. /// \headerfile <x86intrin.h>
  2074. ///
  2075. /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
  2076. ///
  2077. /// \param __a
  2078. /// A 128-bit signed [16 x i8] vector.
  2079. /// \param __b
  2080. /// A 128-bit signed [16 x i8] vector.
  2081. /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
  2082. /// both parameters.
  2083. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2084. _mm_adds_epi8(__m128i __a, __m128i __b)
  2085. {
  2086. return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
  2087. }
  2088. /// \brief Adds, with saturation, the corresponding elements of two 128-bit
  2089. /// signed [8 x i16] vectors, saving each sum in the corresponding element of
  2090. /// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh
  2091. /// are saturated to 7FFFh. Negative sums less than 8000h are saturated to
  2092. /// 8000h.
  2093. ///
  2094. /// \headerfile <x86intrin.h>
  2095. ///
  2096. /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
  2097. ///
  2098. /// \param __a
  2099. /// A 128-bit signed [8 x i16] vector.
  2100. /// \param __b
  2101. /// A 128-bit signed [8 x i16] vector.
  2102. /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
  2103. /// both parameters.
  2104. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2105. _mm_adds_epi16(__m128i __a, __m128i __b)
  2106. {
  2107. return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
  2108. }
  2109. /// \brief Adds, with saturation, the corresponding elements of two 128-bit
  2110. /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
  2111. /// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh
  2112. /// are saturated to FFh. Negative sums are saturated to 00h.
  2113. ///
  2114. /// \headerfile <x86intrin.h>
  2115. ///
  2116. /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
  2117. ///
  2118. /// \param __a
  2119. /// A 128-bit unsigned [16 x i8] vector.
  2120. /// \param __b
  2121. /// A 128-bit unsigned [16 x i8] vector.
  2122. /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
  2123. /// of both parameters.
  2124. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2125. _mm_adds_epu8(__m128i __a, __m128i __b)
  2126. {
  2127. return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
  2128. }
  2129. /// \brief Adds, with saturation, the corresponding elements of two 128-bit
  2130. /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
  2131. /// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh
  2132. /// are saturated to FFFFh. Negative sums are saturated to 0000h.
  2133. ///
  2134. /// \headerfile <x86intrin.h>
  2135. ///
  2136. /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
  2137. ///
  2138. /// \param __a
  2139. /// A 128-bit unsigned [8 x i16] vector.
  2140. /// \param __b
  2141. /// A 128-bit unsigned [8 x i16] vector.
  2142. /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
  2143. /// of both parameters.
  2144. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2145. _mm_adds_epu16(__m128i __a, __m128i __b)
  2146. {
  2147. return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
  2148. }
  2149. /// \brief Computes the rounded avarages of corresponding elements of two
  2150. /// 128-bit unsigned [16 x i8] vectors, saving each result in the
  2151. /// corresponding element of a 128-bit result vector of [16 x i8].
  2152. ///
  2153. /// \headerfile <x86intrin.h>
  2154. ///
  2155. /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
  2156. ///
  2157. /// \param __a
  2158. /// A 128-bit unsigned [16 x i8] vector.
  2159. /// \param __b
  2160. /// A 128-bit unsigned [16 x i8] vector.
  2161. /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
  2162. /// averages of both parameters.
  2163. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2164. _mm_avg_epu8(__m128i __a, __m128i __b)
  2165. {
  2166. typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
  2167. return (__m128i)__builtin_convertvector(
  2168. ((__builtin_convertvector((__v16qu)__a, __v16hu) +
  2169. __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
  2170. >> 1, __v16qu);
  2171. }
  2172. /// \brief Computes the rounded avarages of corresponding elements of two
  2173. /// 128-bit unsigned [8 x i16] vectors, saving each result in the
  2174. /// corresponding element of a 128-bit result vector of [8 x i16].
  2175. ///
  2176. /// \headerfile <x86intrin.h>
  2177. ///
  2178. /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
  2179. ///
  2180. /// \param __a
  2181. /// A 128-bit unsigned [8 x i16] vector.
  2182. /// \param __b
  2183. /// A 128-bit unsigned [8 x i16] vector.
  2184. /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
  2185. /// averages of both parameters.
  2186. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2187. _mm_avg_epu16(__m128i __a, __m128i __b)
  2188. {
  2189. typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
  2190. return (__m128i)__builtin_convertvector(
  2191. ((__builtin_convertvector((__v8hu)__a, __v8su) +
  2192. __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
  2193. >> 1, __v8hu);
  2194. }
  2195. /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
  2196. /// vectors, producing eight intermediate 32-bit signed integer products, and
  2197. /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
  2198. /// [4 x i32] vector.
  2199. ///
  2200. /// For example, bits [15:0] of both parameters are multiplied producing a
  2201. /// 32-bit product, bits [31:16] of both parameters are multiplied producing
  2202. /// a 32-bit product, and the sum of those two products becomes bits [31:0]
  2203. /// of the result.
  2204. ///
  2205. /// \headerfile <x86intrin.h>
  2206. ///
  2207. /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
  2208. ///
  2209. /// \param __a
  2210. /// A 128-bit signed [8 x i16] vector.
  2211. /// \param __b
  2212. /// A 128-bit signed [8 x i16] vector.
  2213. /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
  2214. /// of both parameters.
  2215. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2216. _mm_madd_epi16(__m128i __a, __m128i __b)
  2217. {
  2218. return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
  2219. }
  2220. /// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
  2221. /// vectors, saving the greater value from each comparison in the
  2222. /// corresponding element of a 128-bit result vector of [8 x i16].
  2223. ///
  2224. /// \headerfile <x86intrin.h>
  2225. ///
  2226. /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
  2227. ///
  2228. /// \param __a
  2229. /// A 128-bit signed [8 x i16] vector.
  2230. /// \param __b
  2231. /// A 128-bit signed [8 x i16] vector.
  2232. /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
  2233. /// each comparison.
  2234. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2235. _mm_max_epi16(__m128i __a, __m128i __b)
  2236. {
  2237. return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
  2238. }
  2239. /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
  2240. /// vectors, saving the greater value from each comparison in the
  2241. /// corresponding element of a 128-bit result vector of [16 x i8].
  2242. ///
  2243. /// \headerfile <x86intrin.h>
  2244. ///
  2245. /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
  2246. ///
  2247. /// \param __a
  2248. /// A 128-bit unsigned [16 x i8] vector.
  2249. /// \param __b
  2250. /// A 128-bit unsigned [16 x i8] vector.
  2251. /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
  2252. /// each comparison.
  2253. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2254. _mm_max_epu8(__m128i __a, __m128i __b)
  2255. {
  2256. return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
  2257. }
  2258. /// \brief Compares corresponding elements of two 128-bit signed [8 x i16]
  2259. /// vectors, saving the smaller value from each comparison in the
  2260. /// corresponding element of a 128-bit result vector of [8 x i16].
  2261. ///
  2262. /// \headerfile <x86intrin.h>
  2263. ///
  2264. /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
  2265. ///
  2266. /// \param __a
  2267. /// A 128-bit signed [8 x i16] vector.
  2268. /// \param __b
  2269. /// A 128-bit signed [8 x i16] vector.
  2270. /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
  2271. /// each comparison.
  2272. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2273. _mm_min_epi16(__m128i __a, __m128i __b)
  2274. {
  2275. return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
  2276. }
  2277. /// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8]
  2278. /// vectors, saving the smaller value from each comparison in the
  2279. /// corresponding element of a 128-bit result vector of [16 x i8].
  2280. ///
  2281. /// \headerfile <x86intrin.h>
  2282. ///
  2283. /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
  2284. ///
  2285. /// \param __a
  2286. /// A 128-bit unsigned [16 x i8] vector.
  2287. /// \param __b
  2288. /// A 128-bit unsigned [16 x i8] vector.
  2289. /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
  2290. /// each comparison.
  2291. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2292. _mm_min_epu8(__m128i __a, __m128i __b)
  2293. {
  2294. return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
  2295. }
  2296. /// \brief Multiplies the corresponding elements of two signed [8 x i16]
  2297. /// vectors, saving the upper 16 bits of each 32-bit product in the
  2298. /// corresponding element of a 128-bit signed [8 x i16] result vector.
  2299. ///
  2300. /// \headerfile <x86intrin.h>
  2301. ///
  2302. /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
  2303. ///
  2304. /// \param __a
  2305. /// A 128-bit signed [8 x i16] vector.
  2306. /// \param __b
  2307. /// A 128-bit signed [8 x i16] vector.
  2308. /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
  2309. /// each of the eight 32-bit products.
  2310. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2311. _mm_mulhi_epi16(__m128i __a, __m128i __b)
  2312. {
  2313. return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
  2314. }
  2315. /// \brief Multiplies the corresponding elements of two unsigned [8 x i16]
  2316. /// vectors, saving the upper 16 bits of each 32-bit product in the
  2317. /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
  2318. ///
  2319. /// \headerfile <x86intrin.h>
  2320. ///
  2321. /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
  2322. ///
  2323. /// \param __a
  2324. /// A 128-bit unsigned [8 x i16] vector.
  2325. /// \param __b
  2326. /// A 128-bit unsigned [8 x i16] vector.
  2327. /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
  2328. /// of each of the eight 32-bit products.
  2329. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2330. _mm_mulhi_epu16(__m128i __a, __m128i __b)
  2331. {
  2332. return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
  2333. }
  2334. /// \brief Multiplies the corresponding elements of two signed [8 x i16]
  2335. /// vectors, saving the lower 16 bits of each 32-bit product in the
  2336. /// corresponding element of a 128-bit signed [8 x i16] result vector.
  2337. ///
  2338. /// \headerfile <x86intrin.h>
  2339. ///
  2340. /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
  2341. ///
  2342. /// \param __a
  2343. /// A 128-bit signed [8 x i16] vector.
  2344. /// \param __b
  2345. /// A 128-bit signed [8 x i16] vector.
  2346. /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
  2347. /// each of the eight 32-bit products.
  2348. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2349. _mm_mullo_epi16(__m128i __a, __m128i __b)
  2350. {
  2351. return (__m128i)((__v8hu)__a * (__v8hu)__b);
  2352. }
  2353. /// \brief Multiplies 32-bit unsigned integer values contained in the lower bits
  2354. /// of the two 64-bit integer vectors and returns the 64-bit unsigned
  2355. /// product.
  2356. ///
  2357. /// \headerfile <x86intrin.h>
  2358. ///
  2359. /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
  2360. ///
  2361. /// \param __a
  2362. /// A 64-bit integer containing one of the source operands.
  2363. /// \param __b
  2364. /// A 64-bit integer containing one of the source operands.
  2365. /// \returns A 64-bit integer vector containing the product of both operands.
  2366. static __inline__ __m64 __DEFAULT_FN_ATTRS
  2367. _mm_mul_su32(__m64 __a, __m64 __b)
  2368. {
  2369. return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
  2370. }
  2371. /// \brief Multiplies 32-bit unsigned integer values contained in the lower
  2372. /// bits of the corresponding elements of two [2 x i64] vectors, and returns
  2373. /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
  2374. ///
  2375. /// \headerfile <x86intrin.h>
  2376. ///
  2377. /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
  2378. ///
  2379. /// \param __a
  2380. /// A [2 x i64] vector containing one of the source operands.
  2381. /// \param __b
  2382. /// A [2 x i64] vector containing one of the source operands.
  2383. /// \returns A [2 x i64] vector containing the product of both operands.
  2384. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2385. _mm_mul_epu32(__m128i __a, __m128i __b)
  2386. {
  2387. return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
  2388. }
  2389. /// \brief Computes the absolute differences of corresponding 8-bit integer
  2390. /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
  2391. /// separately sums the second 8 absolute differences. Packs these two
  2392. /// unsigned 16-bit integer sums into the upper and lower elements of a
  2393. /// [2 x i64] vector.
  2394. ///
  2395. /// \headerfile <x86intrin.h>
  2396. ///
  2397. /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
  2398. ///
  2399. /// \param __a
  2400. /// A 128-bit integer vector containing one of the source operands.
  2401. /// \param __b
  2402. /// A 128-bit integer vector containing one of the source operands.
  2403. /// \returns A [2 x i64] vector containing the sums of the sets of absolute
  2404. /// differences between both operands.
  2405. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2406. _mm_sad_epu8(__m128i __a, __m128i __b)
  2407. {
  2408. return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
  2409. }
  2410. /// \brief Subtracts the corresponding 8-bit integer values in the operands.
  2411. ///
  2412. /// \headerfile <x86intrin.h>
  2413. ///
  2414. /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
  2415. ///
  2416. /// \param __a
  2417. /// A 128-bit integer vector containing the minuends.
  2418. /// \param __b
  2419. /// A 128-bit integer vector containing the subtrahends.
  2420. /// \returns A 128-bit integer vector containing the differences of the values
  2421. /// in the operands.
  2422. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2423. _mm_sub_epi8(__m128i __a, __m128i __b)
  2424. {
  2425. return (__m128i)((__v16qu)__a - (__v16qu)__b);
  2426. }
  2427. /// \brief Subtracts the corresponding 16-bit integer values in the operands.
  2428. ///
  2429. /// \headerfile <x86intrin.h>
  2430. ///
  2431. /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
  2432. ///
  2433. /// \param __a
  2434. /// A 128-bit integer vector containing the minuends.
  2435. /// \param __b
  2436. /// A 128-bit integer vector containing the subtrahends.
  2437. /// \returns A 128-bit integer vector containing the differences of the values
  2438. /// in the operands.
  2439. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2440. _mm_sub_epi16(__m128i __a, __m128i __b)
  2441. {
  2442. return (__m128i)((__v8hu)__a - (__v8hu)__b);
  2443. }
  2444. /// \brief Subtracts the corresponding 32-bit integer values in the operands.
  2445. ///
  2446. /// \headerfile <x86intrin.h>
  2447. ///
  2448. /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
  2449. ///
  2450. /// \param __a
  2451. /// A 128-bit integer vector containing the minuends.
  2452. /// \param __b
  2453. /// A 128-bit integer vector containing the subtrahends.
  2454. /// \returns A 128-bit integer vector containing the differences of the values
  2455. /// in the operands.
  2456. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2457. _mm_sub_epi32(__m128i __a, __m128i __b)
  2458. {
  2459. return (__m128i)((__v4su)__a - (__v4su)__b);
  2460. }
  2461. /// \brief Subtracts signed or unsigned 64-bit integer values and writes the
  2462. /// difference to the corresponding bits in the destination.
  2463. ///
  2464. /// \headerfile <x86intrin.h>
  2465. ///
  2466. /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
  2467. ///
  2468. /// \param __a
  2469. /// A 64-bit integer vector containing the minuend.
  2470. /// \param __b
  2471. /// A 64-bit integer vector containing the subtrahend.
  2472. /// \returns A 64-bit integer vector containing the difference of the values in
  2473. /// the operands.
  2474. static __inline__ __m64 __DEFAULT_FN_ATTRS
  2475. _mm_sub_si64(__m64 __a, __m64 __b)
  2476. {
  2477. return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
  2478. }
  2479. /// \brief Subtracts the corresponding elements of two [2 x i64] vectors.
  2480. ///
  2481. /// \headerfile <x86intrin.h>
  2482. ///
  2483. /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
  2484. ///
  2485. /// \param __a
  2486. /// A 128-bit integer vector containing the minuends.
  2487. /// \param __b
  2488. /// A 128-bit integer vector containing the subtrahends.
  2489. /// \returns A 128-bit integer vector containing the differences of the values
  2490. /// in the operands.
  2491. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2492. _mm_sub_epi64(__m128i __a, __m128i __b)
  2493. {
  2494. return (__m128i)((__v2du)__a - (__v2du)__b);
  2495. }
  2496. /// \brief Subtracts corresponding 8-bit signed integer values in the input and
  2497. /// returns the differences in the corresponding bytes in the destination.
  2498. /// Differences greater than 7Fh are saturated to 7Fh, and differences less
  2499. /// than 80h are saturated to 80h.
  2500. ///
  2501. /// \headerfile <x86intrin.h>
  2502. ///
  2503. /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
  2504. ///
  2505. /// \param __a
  2506. /// A 128-bit integer vector containing the minuends.
  2507. /// \param __b
  2508. /// A 128-bit integer vector containing the subtrahends.
  2509. /// \returns A 128-bit integer vector containing the differences of the values
  2510. /// in the operands.
  2511. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2512. _mm_subs_epi8(__m128i __a, __m128i __b)
  2513. {
  2514. return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
  2515. }
  2516. /// \brief Subtracts corresponding 16-bit signed integer values in the input and
  2517. /// returns the differences in the corresponding bytes in the destination.
  2518. /// Differences greater than 7FFFh are saturated to 7FFFh, and values less
  2519. /// than 8000h are saturated to 8000h.
  2520. ///
  2521. /// \headerfile <x86intrin.h>
  2522. ///
  2523. /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
  2524. ///
  2525. /// \param __a
  2526. /// A 128-bit integer vector containing the minuends.
  2527. /// \param __b
  2528. /// A 128-bit integer vector containing the subtrahends.
  2529. /// \returns A 128-bit integer vector containing the differences of the values
  2530. /// in the operands.
  2531. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2532. _mm_subs_epi16(__m128i __a, __m128i __b)
  2533. {
  2534. return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
  2535. }
  2536. /// \brief Subtracts corresponding 8-bit unsigned integer values in the input
  2537. /// and returns the differences in the corresponding bytes in the
  2538. /// destination. Differences less than 00h are saturated to 00h.
  2539. ///
  2540. /// \headerfile <x86intrin.h>
  2541. ///
  2542. /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
  2543. ///
  2544. /// \param __a
  2545. /// A 128-bit integer vector containing the minuends.
  2546. /// \param __b
  2547. /// A 128-bit integer vector containing the subtrahends.
  2548. /// \returns A 128-bit integer vector containing the unsigned integer
  2549. /// differences of the values in the operands.
  2550. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2551. _mm_subs_epu8(__m128i __a, __m128i __b)
  2552. {
  2553. return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
  2554. }
  2555. /// \brief Subtracts corresponding 16-bit unsigned integer values in the input
  2556. /// and returns the differences in the corresponding bytes in the
  2557. /// destination. Differences less than 0000h are saturated to 0000h.
  2558. ///
  2559. /// \headerfile <x86intrin.h>
  2560. ///
  2561. /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
  2562. ///
  2563. /// \param __a
  2564. /// A 128-bit integer vector containing the minuends.
  2565. /// \param __b
  2566. /// A 128-bit integer vector containing the subtrahends.
  2567. /// \returns A 128-bit integer vector containing the unsigned integer
  2568. /// differences of the values in the operands.
  2569. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2570. _mm_subs_epu16(__m128i __a, __m128i __b)
  2571. {
  2572. return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
  2573. }
  2574. /// \brief Performs a bitwise AND of two 128-bit integer vectors.
  2575. ///
  2576. /// \headerfile <x86intrin.h>
  2577. ///
  2578. /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
  2579. ///
  2580. /// \param __a
  2581. /// A 128-bit integer vector containing one of the source operands.
  2582. /// \param __b
  2583. /// A 128-bit integer vector containing one of the source operands.
  2584. /// \returns A 128-bit integer vector containing the bitwise AND of the values
  2585. /// in both operands.
  2586. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2587. _mm_and_si128(__m128i __a, __m128i __b)
  2588. {
  2589. return (__m128i)((__v2du)__a & (__v2du)__b);
  2590. }
  2591. /// \brief Performs a bitwise AND of two 128-bit integer vectors, using the
  2592. /// one's complement of the values contained in the first source operand.
  2593. ///
  2594. /// \headerfile <x86intrin.h>
  2595. ///
  2596. /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
  2597. ///
  2598. /// \param __a
  2599. /// A 128-bit vector containing the left source operand. The one's complement
  2600. /// of this value is used in the bitwise AND.
  2601. /// \param __b
  2602. /// A 128-bit vector containing the right source operand.
  2603. /// \returns A 128-bit integer vector containing the bitwise AND of the one's
  2604. /// complement of the first operand and the values in the second operand.
  2605. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2606. _mm_andnot_si128(__m128i __a, __m128i __b)
  2607. {
  2608. return (__m128i)(~(__v2du)__a & (__v2du)__b);
  2609. }
  2610. /// \brief Performs a bitwise OR of two 128-bit integer vectors.
  2611. ///
  2612. /// \headerfile <x86intrin.h>
  2613. ///
  2614. /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
  2615. ///
  2616. /// \param __a
  2617. /// A 128-bit integer vector containing one of the source operands.
  2618. /// \param __b
  2619. /// A 128-bit integer vector containing one of the source operands.
  2620. /// \returns A 128-bit integer vector containing the bitwise OR of the values
  2621. /// in both operands.
  2622. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2623. _mm_or_si128(__m128i __a, __m128i __b)
  2624. {
  2625. return (__m128i)((__v2du)__a | (__v2du)__b);
  2626. }
  2627. /// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors.
  2628. ///
  2629. /// \headerfile <x86intrin.h>
  2630. ///
  2631. /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
  2632. ///
  2633. /// \param __a
  2634. /// A 128-bit integer vector containing one of the source operands.
  2635. /// \param __b
  2636. /// A 128-bit integer vector containing one of the source operands.
  2637. /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
  2638. /// values in both operands.
  2639. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2640. _mm_xor_si128(__m128i __a, __m128i __b)
  2641. {
  2642. return (__m128i)((__v2du)__a ^ (__v2du)__b);
  2643. }
  /* Shuffle index feeding result byte n of _mm_slli_si128. Indices 0-15 pick
     bytes of the all-zero first shuffle operand, indices 16-31 pick bytes of
     the source vector. When (char)imm has any bit of 0xF0 set, every index
     stays within 0-15. */
  #define __mm_slli_si128_idx(imm, n) \
    (((char)(imm)&0xF0) ? (n) : 16 + (n) - (char)(imm))

  /// \brief Shifts the whole 128-bit integer vector operand left by the given
  ///    number of bytes. The vacated low-order bytes are cleared.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_slli_si128(__m128i a, const int imm);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
  ///
  /// \param a
  ///    A 128-bit integer vector containing the source operand.
  /// \param imm
  ///    An immediate value giving the number of bytes by which operand \a a is
  ///    shifted left. The value is converted to char; if that has any bit of
  ///    0xF0 set, the result is all zeros.
  /// \returns A 128-bit integer vector containing the left-shifted value.
  #define _mm_slli_si128(a, imm) __extension__ ({ \
    (__m128i)__builtin_shufflevector( \
        (__v16qi)_mm_setzero_si128(), \
        (__v16qi)(__m128i)(a), \
        __mm_slli_si128_idx(imm, 0), \
        __mm_slli_si128_idx(imm, 1), \
        __mm_slli_si128_idx(imm, 2), \
        __mm_slli_si128_idx(imm, 3), \
        __mm_slli_si128_idx(imm, 4), \
        __mm_slli_si128_idx(imm, 5), \
        __mm_slli_si128_idx(imm, 6), \
        __mm_slli_si128_idx(imm, 7), \
        __mm_slli_si128_idx(imm, 8), \
        __mm_slli_si128_idx(imm, 9), \
        __mm_slli_si128_idx(imm, 10), \
        __mm_slli_si128_idx(imm, 11), \
        __mm_slli_si128_idx(imm, 12), \
        __mm_slli_si128_idx(imm, 13), \
        __mm_slli_si128_idx(imm, 14), \
        __mm_slli_si128_idx(imm, 15)); })
  /// \brief Alternative spelling of _mm_slli_si128: both arguments are
  ///    forwarded to it unchanged.
  #define _mm_bslli_si128(a, imm) _mm_slli_si128((a), (imm))
  2683. /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
  2684. /// by the specified number of bits. Low-order bits are cleared.
  2685. ///
  2686. /// \headerfile <x86intrin.h>
  2687. ///
  2688. /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
  2689. ///
  2690. /// \param __a
  2691. /// A 128-bit integer vector containing the source operand.
  2692. /// \param __count
  2693. /// An integer value specifying the number of bits to left-shift each value
  2694. /// in operand \a __a.
  2695. /// \returns A 128-bit integer vector containing the left-shifted values.
  2696. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2697. _mm_slli_epi16(__m128i __a, int __count)
  2698. {
  2699. return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
  2700. }
  2701. /// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand
  2702. /// by the specified number of bits. Low-order bits are cleared.
  2703. ///
  2704. /// \headerfile <x86intrin.h>
  2705. ///
  2706. /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
  2707. ///
  2708. /// \param __a
  2709. /// A 128-bit integer vector containing the source operand.
  2710. /// \param __count
  2711. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2712. /// to left-shift each value in operand \a __a.
  2713. /// \returns A 128-bit integer vector containing the left-shifted values.
  2714. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2715. _mm_sll_epi16(__m128i __a, __m128i __count)
  2716. {
  2717. return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
  2718. }
  2719. /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
  2720. /// by the specified number of bits. Low-order bits are cleared.
  2721. ///
  2722. /// \headerfile <x86intrin.h>
  2723. ///
  2724. /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
  2725. ///
  2726. /// \param __a
  2727. /// A 128-bit integer vector containing the source operand.
  2728. /// \param __count
  2729. /// An integer value specifying the number of bits to left-shift each value
  2730. /// in operand \a __a.
  2731. /// \returns A 128-bit integer vector containing the left-shifted values.
  2732. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2733. _mm_slli_epi32(__m128i __a, int __count)
  2734. {
  2735. return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
  2736. }
  2737. /// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand
  2738. /// by the specified number of bits. Low-order bits are cleared.
  2739. ///
  2740. /// \headerfile <x86intrin.h>
  2741. ///
  2742. /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
  2743. ///
  2744. /// \param __a
  2745. /// A 128-bit integer vector containing the source operand.
  2746. /// \param __count
  2747. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2748. /// to left-shift each value in operand \a __a.
  2749. /// \returns A 128-bit integer vector containing the left-shifted values.
  2750. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2751. _mm_sll_epi32(__m128i __a, __m128i __count)
  2752. {
  2753. return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
  2754. }
  2755. /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
  2756. /// by the specified number of bits. Low-order bits are cleared.
  2757. ///
  2758. /// \headerfile <x86intrin.h>
  2759. ///
  2760. /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
  2761. ///
  2762. /// \param __a
  2763. /// A 128-bit integer vector containing the source operand.
  2764. /// \param __count
  2765. /// An integer value specifying the number of bits to left-shift each value
  2766. /// in operand \a __a.
  2767. /// \returns A 128-bit integer vector containing the left-shifted values.
  2768. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2769. _mm_slli_epi64(__m128i __a, int __count)
  2770. {
  2771. return __builtin_ia32_psllqi128((__v2di)__a, __count);
  2772. }
  2773. /// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand
  2774. /// by the specified number of bits. Low-order bits are cleared.
  2775. ///
  2776. /// \headerfile <x86intrin.h>
  2777. ///
  2778. /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
  2779. ///
  2780. /// \param __a
  2781. /// A 128-bit integer vector containing the source operand.
  2782. /// \param __count
  2783. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2784. /// to left-shift each value in operand \a __a.
  2785. /// \returns A 128-bit integer vector containing the left-shifted values.
  2786. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2787. _mm_sll_epi64(__m128i __a, __m128i __count)
  2788. {
  2789. return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
  2790. }
  2791. /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
  2792. /// by the specified number of bits. High-order bits are filled with the sign
  2793. /// bit of the initial value.
  2794. ///
  2795. /// \headerfile <x86intrin.h>
  2796. ///
  2797. /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
  2798. ///
  2799. /// \param __a
  2800. /// A 128-bit integer vector containing the source operand.
  2801. /// \param __count
  2802. /// An integer value specifying the number of bits to right-shift each value
  2803. /// in operand \a __a.
  2804. /// \returns A 128-bit integer vector containing the right-shifted values.
  2805. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2806. _mm_srai_epi16(__m128i __a, int __count)
  2807. {
  2808. return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
  2809. }
  2810. /// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
  2811. /// by the specified number of bits. High-order bits are filled with the sign
  2812. /// bit of the initial value.
  2813. ///
  2814. /// \headerfile <x86intrin.h>
  2815. ///
  2816. /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
  2817. ///
  2818. /// \param __a
  2819. /// A 128-bit integer vector containing the source operand.
  2820. /// \param __count
  2821. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2822. /// to right-shift each value in operand \a __a.
  2823. /// \returns A 128-bit integer vector containing the right-shifted values.
  2824. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2825. _mm_sra_epi16(__m128i __a, __m128i __count)
  2826. {
  2827. return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
  2828. }
  2829. /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
  2830. /// by the specified number of bits. High-order bits are filled with the sign
  2831. /// bit of the initial value.
  2832. ///
  2833. /// \headerfile <x86intrin.h>
  2834. ///
  2835. /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
  2836. ///
  2837. /// \param __a
  2838. /// A 128-bit integer vector containing the source operand.
  2839. /// \param __count
  2840. /// An integer value specifying the number of bits to right-shift each value
  2841. /// in operand \a __a.
  2842. /// \returns A 128-bit integer vector containing the right-shifted values.
  2843. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2844. _mm_srai_epi32(__m128i __a, int __count)
  2845. {
  2846. return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
  2847. }
  2848. /// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
  2849. /// by the specified number of bits. High-order bits are filled with the sign
  2850. /// bit of the initial value.
  2851. ///
  2852. /// \headerfile <x86intrin.h>
  2853. ///
  2854. /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
  2855. ///
  2856. /// \param __a
  2857. /// A 128-bit integer vector containing the source operand.
  2858. /// \param __count
  2859. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2860. /// to right-shift each value in operand \a __a.
  2861. /// \returns A 128-bit integer vector containing the right-shifted values.
  2862. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2863. _mm_sra_epi32(__m128i __a, __m128i __count)
  2864. {
  2865. return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
  2866. }
  /// \brief Shifts the whole 128-bit integer vector operand to the right by
  ///    the given number of bytes. Vacated high-order bits are cleared.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_srli_si128(__m128i a, const int imm);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
  ///
  /// \param a
  ///    A 128-bit integer vector containing the source operand.
  /// \param imm
  ///    An immediate value giving the number of bytes by which operand \a a is
  ///    right-shifted.
  /// \returns A 128-bit integer vector containing the right-shifted value.
  /* Implemented as a byte shuffle of (a) followed by an all-zero vector:
     shuffle indices 0..15 pick bytes of (a), indices 16..31 pick zero bytes.
     When any of bits [7:4] of (char)(imm) is set, every result byte is taken
     from the zero vector. */
  #define _mm_srli_si128(a, imm) __extension__ ({ \
    (__m128i)__builtin_shufflevector( \
        (__v16qi)(__m128i)(a), \
        (__v16qi)_mm_setzero_si128(), \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 0  : 16, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 1  : 17, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 2  : 18, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 3  : 19, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 4  : 20, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 5  : 21, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 6  : 22, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 7  : 23, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 8  : 24, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 9  : 25, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 10 : 26, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 11 : 27, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 12 : 28, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 13 : 29, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 14 : 30, \
        (((char)(imm) & 0xF0) == 0) ? (char)(imm) + 15 : 31); })
  /// \brief Alternate spelling of _mm_srli_si128; expands to that macro with
  ///    the same two arguments.
  ///
  /// \param a
  ///    Forwarded unchanged as the first argument of _mm_srli_si128.
  /// \param imm
  ///    Forwarded unchanged as the second argument of _mm_srli_si128.
  #define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
  2906. /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
  2907. /// operand by the specified number of bits. High-order bits are cleared.
  2908. ///
  2909. /// \headerfile <x86intrin.h>
  2910. ///
  2911. /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
  2912. ///
  2913. /// \param __a
  2914. /// A 128-bit integer vector containing the source operand.
  2915. /// \param __count
  2916. /// An integer value specifying the number of bits to right-shift each value
  2917. /// in operand \a __a.
  2918. /// \returns A 128-bit integer vector containing the right-shifted values.
  2919. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2920. _mm_srli_epi16(__m128i __a, int __count)
  2921. {
  2922. return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
  2923. }
  2924. /// \brief Right-shifts each of 16-bit values in the 128-bit integer vector
  2925. /// operand by the specified number of bits. High-order bits are cleared.
  2926. ///
  2927. /// \headerfile <x86intrin.h>
  2928. ///
  2929. /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
  2930. ///
  2931. /// \param __a
  2932. /// A 128-bit integer vector containing the source operand.
  2933. /// \param __count
  2934. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2935. /// to right-shift each value in operand \a __a.
  2936. /// \returns A 128-bit integer vector containing the right-shifted values.
  2937. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2938. _mm_srl_epi16(__m128i __a, __m128i __count)
  2939. {
  2940. return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
  2941. }
  2942. /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
  2943. /// operand by the specified number of bits. High-order bits are cleared.
  2944. ///
  2945. /// \headerfile <x86intrin.h>
  2946. ///
  2947. /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
  2948. ///
  2949. /// \param __a
  2950. /// A 128-bit integer vector containing the source operand.
  2951. /// \param __count
  2952. /// An integer value specifying the number of bits to right-shift each value
  2953. /// in operand \a __a.
  2954. /// \returns A 128-bit integer vector containing the right-shifted values.
  2955. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2956. _mm_srli_epi32(__m128i __a, int __count)
  2957. {
  2958. return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
  2959. }
  2960. /// \brief Right-shifts each of 32-bit values in the 128-bit integer vector
  2961. /// operand by the specified number of bits. High-order bits are cleared.
  2962. ///
  2963. /// \headerfile <x86intrin.h>
  2964. ///
  2965. /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
  2966. ///
  2967. /// \param __a
  2968. /// A 128-bit integer vector containing the source operand.
  2969. /// \param __count
  2970. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  2971. /// to right-shift each value in operand \a __a.
  2972. /// \returns A 128-bit integer vector containing the right-shifted values.
  2973. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2974. _mm_srl_epi32(__m128i __a, __m128i __count)
  2975. {
  2976. return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
  2977. }
  2978. /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
  2979. /// operand by the specified number of bits. High-order bits are cleared.
  2980. ///
  2981. /// \headerfile <x86intrin.h>
  2982. ///
  2983. /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
  2984. ///
  2985. /// \param __a
  2986. /// A 128-bit integer vector containing the source operand.
  2987. /// \param __count
  2988. /// An integer value specifying the number of bits to right-shift each value
  2989. /// in operand \a __a.
  2990. /// \returns A 128-bit integer vector containing the right-shifted values.
  2991. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2992. _mm_srli_epi64(__m128i __a, int __count)
  2993. {
  2994. return __builtin_ia32_psrlqi128((__v2di)__a, __count);
  2995. }
  2996. /// \brief Right-shifts each of 64-bit values in the 128-bit integer vector
  2997. /// operand by the specified number of bits. High-order bits are cleared.
  2998. ///
  2999. /// \headerfile <x86intrin.h>
  3000. ///
  3001. /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
  3002. ///
  3003. /// \param __a
  3004. /// A 128-bit integer vector containing the source operand.
  3005. /// \param __count
  3006. /// A 128-bit integer vector in which bits [63:0] specify the number of bits
  3007. /// to right-shift each value in operand \a __a.
  3008. /// \returns A 128-bit integer vector containing the right-shifted values.
  3009. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3010. _mm_srl_epi64(__m128i __a, __m128i __count)
  3011. {
  3012. return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
  3013. }
  3014. /// \brief Compares each of the corresponding 8-bit values of the 128-bit
  3015. /// integer vectors for equality. Each comparison yields 0h for false, FFh
  3016. /// for true.
  3017. ///
  3018. /// \headerfile <x86intrin.h>
  3019. ///
  3020. /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
  3021. ///
  3022. /// \param __a
  3023. /// A 128-bit integer vector.
  3024. /// \param __b
  3025. /// A 128-bit integer vector.
  3026. /// \returns A 128-bit integer vector containing the comparison results.
  3027. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3028. _mm_cmpeq_epi8(__m128i __a, __m128i __b)
  3029. {
  3030. return (__m128i)((__v16qi)__a == (__v16qi)__b);
  3031. }
  3032. /// \brief Compares each of the corresponding 16-bit values of the 128-bit
  3033. /// integer vectors for equality. Each comparison yields 0h for false, FFFFh
  3034. /// for true.
  3035. ///
  3036. /// \headerfile <x86intrin.h>
  3037. ///
  3038. /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
  3039. ///
  3040. /// \param __a
  3041. /// A 128-bit integer vector.
  3042. /// \param __b
  3043. /// A 128-bit integer vector.
  3044. /// \returns A 128-bit integer vector containing the comparison results.
  3045. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3046. _mm_cmpeq_epi16(__m128i __a, __m128i __b)
  3047. {
  3048. return (__m128i)((__v8hi)__a == (__v8hi)__b);
  3049. }
  3050. /// \brief Compares each of the corresponding 32-bit values of the 128-bit
  3051. /// integer vectors for equality. Each comparison yields 0h for false,
  3052. /// FFFFFFFFh for true.
  3053. ///
  3054. /// \headerfile <x86intrin.h>
  3055. ///
  3056. /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
  3057. ///
  3058. /// \param __a
  3059. /// A 128-bit integer vector.
  3060. /// \param __b
  3061. /// A 128-bit integer vector.
  3062. /// \returns A 128-bit integer vector containing the comparison results.
  3063. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3064. _mm_cmpeq_epi32(__m128i __a, __m128i __b)
  3065. {
  3066. return (__m128i)((__v4si)__a == (__v4si)__b);
  3067. }
  3068. /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
  3069. /// integer vectors to determine if the values in the first operand are
  3070. /// greater than those in the second operand. Each comparison yields 0h for
  3071. /// false, FFh for true.
  3072. ///
  3073. /// \headerfile <x86intrin.h>
  3074. ///
  3075. /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
  3076. ///
  3077. /// \param __a
  3078. /// A 128-bit integer vector.
  3079. /// \param __b
  3080. /// A 128-bit integer vector.
  3081. /// \returns A 128-bit integer vector containing the comparison results.
  3082. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3083. _mm_cmpgt_epi8(__m128i __a, __m128i __b)
  3084. {
  3085. /* This function always performs a signed comparison, but __v16qi is a char
  3086. which may be signed or unsigned, so use __v16qs. */
  3087. return (__m128i)((__v16qs)__a > (__v16qs)__b);
  3088. }
  3089. /// \brief Compares each of the corresponding signed 16-bit values of the
  3090. /// 128-bit integer vectors to determine if the values in the first operand
  3091. /// are greater than those in the second operand.
  3092. ///
  3093. /// Each comparison yields 0h for false, FFFFh for true.
  3094. ///
  3095. /// \headerfile <x86intrin.h>
  3096. ///
  3097. /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
  3098. ///
  3099. /// \param __a
  3100. /// A 128-bit integer vector.
  3101. /// \param __b
  3102. /// A 128-bit integer vector.
  3103. /// \returns A 128-bit integer vector containing the comparison results.
  3104. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3105. _mm_cmpgt_epi16(__m128i __a, __m128i __b)
  3106. {
  3107. return (__m128i)((__v8hi)__a > (__v8hi)__b);
  3108. }
  3109. /// \brief Compares each of the corresponding signed 32-bit values of the
  3110. /// 128-bit integer vectors to determine if the values in the first operand
  3111. /// are greater than those in the second operand.
  3112. ///
  3113. /// Each comparison yields 0h for false, FFFFFFFFh for true.
  3114. ///
  3115. /// \headerfile <x86intrin.h>
  3116. ///
  3117. /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
  3118. ///
  3119. /// \param __a
  3120. /// A 128-bit integer vector.
  3121. /// \param __b
  3122. /// A 128-bit integer vector.
  3123. /// \returns A 128-bit integer vector containing the comparison results.
  3124. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3125. _mm_cmpgt_epi32(__m128i __a, __m128i __b)
  3126. {
  3127. return (__m128i)((__v4si)__a > (__v4si)__b);
  3128. }
  3129. /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
  3130. /// integer vectors to determine if the values in the first operand are less
  3131. /// than those in the second operand.
  3132. ///
  3133. /// Each comparison yields 0h for false, FFh for true.
  3134. ///
  3135. /// \headerfile <x86intrin.h>
  3136. ///
  3137. /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
  3138. ///
  3139. /// \param __a
  3140. /// A 128-bit integer vector.
  3141. /// \param __b
  3142. /// A 128-bit integer vector.
  3143. /// \returns A 128-bit integer vector containing the comparison results.
  3144. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3145. _mm_cmplt_epi8(__m128i __a, __m128i __b)
  3146. {
  3147. return _mm_cmpgt_epi8(__b, __a);
  3148. }
  3149. /// \brief Compares each of the corresponding signed 16-bit values of the
  3150. /// 128-bit integer vectors to determine if the values in the first operand
  3151. /// are less than those in the second operand.
  3152. ///
  3153. /// Each comparison yields 0h for false, FFFFh for true.
  3154. ///
  3155. /// \headerfile <x86intrin.h>
  3156. ///
  3157. /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
  3158. ///
  3159. /// \param __a
  3160. /// A 128-bit integer vector.
  3161. /// \param __b
  3162. /// A 128-bit integer vector.
  3163. /// \returns A 128-bit integer vector containing the comparison results.
  3164. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3165. _mm_cmplt_epi16(__m128i __a, __m128i __b)
  3166. {
  3167. return _mm_cmpgt_epi16(__b, __a);
  3168. }
  3169. /// \brief Compares each of the corresponding signed 32-bit values of the
  3170. /// 128-bit integer vectors to determine if the values in the first operand
  3171. /// are less than those in the second operand.
  3172. ///
  3173. /// Each comparison yields 0h for false, FFFFFFFFh for true.
  3174. ///
  3175. /// \headerfile <x86intrin.h>
  3176. ///
  3177. /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
  3178. ///
  3179. /// \param __a
  3180. /// A 128-bit integer vector.
  3181. /// \param __b
  3182. /// A 128-bit integer vector.
  3183. /// \returns A 128-bit integer vector containing the comparison results.
  3184. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3185. _mm_cmplt_epi32(__m128i __a, __m128i __b)
  3186. {
  3187. return _mm_cmpgt_epi32(__b, __a);
  3188. }
  3189. #ifdef __x86_64__
  3190. /// \brief Converts a 64-bit signed integer value from the second operand into a
  3191. /// double-precision value and returns it in the lower element of a [2 x
  3192. /// double] vector; the upper element of the returned vector is copied from
  3193. /// the upper element of the first operand.
  3194. ///
  3195. /// \headerfile <x86intrin.h>
  3196. ///
  3197. /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
  3198. ///
  3199. /// \param __a
  3200. /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
  3201. /// copied to the upper 64 bits of the destination.
  3202. /// \param __b
  3203. /// A 64-bit signed integer operand containing the value to be converted.
  3204. /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  3205. /// converted value of the second operand. The upper 64 bits are copied from
  3206. /// the upper 64 bits of the first operand.
  3207. static __inline__ __m128d __DEFAULT_FN_ATTRS
  3208. _mm_cvtsi64_sd(__m128d __a, long long __b)
  3209. {
  3210. __a[0] = __b;
  3211. return __a;
  3212. }
  3213. /// \brief Converts the first (lower) element of a vector of [2 x double] into a
  3214. /// 64-bit signed integer value, according to the current rounding mode.
  3215. ///
  3216. /// \headerfile <x86intrin.h>
  3217. ///
  3218. /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
  3219. ///
  3220. /// \param __a
  3221. /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
  3222. /// conversion.
  3223. /// \returns A 64-bit signed integer containing the converted value.
  3224. static __inline__ long long __DEFAULT_FN_ATTRS
  3225. _mm_cvtsd_si64(__m128d __a)
  3226. {
  3227. return __builtin_ia32_cvtsd2si64((__v2df)__a);
  3228. }
  3229. /// \brief Converts the first (lower) element of a vector of [2 x double] into a
  3230. /// 64-bit signed integer value, truncating the result when it is inexact.
  3231. ///
  3232. /// \headerfile <x86intrin.h>
  3233. ///
  3234. /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
  3235. /// instruction.
  3236. ///
  3237. /// \param __a
  3238. /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
  3239. /// conversion.
  3240. /// \returns A 64-bit signed integer containing the converted value.
  3241. static __inline__ long long __DEFAULT_FN_ATTRS
  3242. _mm_cvttsd_si64(__m128d __a)
  3243. {
  3244. return __builtin_ia32_cvttsd2si64((__v2df)__a);
  3245. }
  3246. #endif
  3247. /// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
  3248. ///
  3249. /// \headerfile <x86intrin.h>
  3250. ///
  3251. /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
  3252. ///
  3253. /// \param __a
  3254. /// A 128-bit integer vector.
  3255. /// \returns A 128-bit vector of [4 x float] containing the converted values.
  3256. static __inline__ __m128 __DEFAULT_FN_ATTRS
  3257. _mm_cvtepi32_ps(__m128i __a)
  3258. {
  3259. return __builtin_ia32_cvtdq2ps((__v4si)__a);
  3260. }
  3261. /// \brief Converts a vector of [4 x float] into a vector of [4 x i32].
  3262. ///
  3263. /// \headerfile <x86intrin.h>
  3264. ///
  3265. /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
  3266. ///
  3267. /// \param __a
  3268. /// A 128-bit vector of [4 x float].
  3269. /// \returns A 128-bit integer vector of [4 x i32] containing the converted
  3270. /// values.
  3271. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3272. _mm_cvtps_epi32(__m128 __a)
  3273. {
  3274. return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
  3275. }
  3276. /// \brief Converts a vector of [4 x float] into a vector of [4 x i32],
  3277. /// truncating the result when it is inexact.
  3278. ///
  3279. /// \headerfile <x86intrin.h>
  3280. ///
  3281. /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
  3282. /// instruction.
  3283. ///
  3284. /// \param __a
  3285. /// A 128-bit vector of [4 x float].
  3286. /// \returns A 128-bit vector of [4 x i32] containing the converted values.
  3287. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3288. _mm_cvttps_epi32(__m128 __a)
  3289. {
  3290. return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
  3291. }
  3292. /// \brief Returns a vector of [4 x i32] where the lowest element is the input
  3293. /// operand and the remaining elements are zero.
  3294. ///
  3295. /// \headerfile <x86intrin.h>
  3296. ///
  3297. /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
  3298. ///
  3299. /// \param __a
  3300. /// A 32-bit signed integer operand.
  3301. /// \returns A 128-bit vector of [4 x i32].
  3302. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3303. _mm_cvtsi32_si128(int __a)
  3304. {
  3305. return (__m128i)(__v4si){ __a, 0, 0, 0 };
  3306. }
  3307. #ifdef __x86_64__
  3308. /// \brief Returns a vector of [2 x i64] where the lower element is the input
  3309. /// operand and the upper element is zero.
  3310. ///
  3311. /// \headerfile <x86intrin.h>
  3312. ///
  3313. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  3314. ///
  3315. /// \param __a
  3316. /// A 64-bit signed integer operand containing the value to be converted.
  3317. /// \returns A 128-bit vector of [2 x i64] containing the converted value.
  3318. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3319. _mm_cvtsi64_si128(long long __a)
  3320. {
  3321. return (__m128i){ __a, 0 };
  3322. }
  3323. #endif
  3324. /// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
  3325. /// 32-bit signed integer value.
  3326. ///
  3327. /// \headerfile <x86intrin.h>
  3328. ///
  3329. /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
  3330. ///
  3331. /// \param __a
  3332. /// A vector of [4 x i32]. The least significant 32 bits are moved to the
  3333. /// destination.
  3334. /// \returns A 32-bit signed integer containing the moved value.
  3335. static __inline__ int __DEFAULT_FN_ATTRS
  3336. _mm_cvtsi128_si32(__m128i __a)
  3337. {
  3338. __v4si __b = (__v4si)__a;
  3339. return __b[0];
  3340. }
  3341. #ifdef __x86_64__
  3342. /// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
  3343. /// 64-bit signed integer value.
  3344. ///
  3345. /// \headerfile <x86intrin.h>
  3346. ///
  3347. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  3348. ///
  3349. /// \param __a
  3350. /// A vector of [2 x i64]. The least significant 64 bits are moved to the
  3351. /// destination.
  3352. /// \returns A 64-bit signed integer containing the moved value.
  3353. static __inline__ long long __DEFAULT_FN_ATTRS
  3354. _mm_cvtsi128_si64(__m128i __a)
  3355. {
  3356. return __a[0];
  3357. }
  3358. #endif
  3359. /// \brief Moves packed integer values from an aligned 128-bit memory location
  3360. /// to elements in a 128-bit integer vector.
  3361. ///
  3362. /// \headerfile <x86intrin.h>
  3363. ///
  3364. /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
  3365. ///
  3366. /// \param __p
  3367. /// An aligned pointer to a memory location containing integer values.
  3368. /// \returns A 128-bit integer vector containing the moved values.
  3369. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3370. _mm_load_si128(__m128i const *__p)
  3371. {
  3372. return *__p;
  3373. }
  3374. /// \brief Moves packed integer values from an unaligned 128-bit memory location
  3375. /// to elements in a 128-bit integer vector.
  3376. ///
  3377. /// \headerfile <x86intrin.h>
  3378. ///
  3379. /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
  3380. ///
  3381. /// \param __p
  3382. /// A pointer to a memory location containing integer values.
  3383. /// \returns A 128-bit integer vector containing the moved values.
  3384. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3385. _mm_loadu_si128(__m128i const *__p)
  3386. {
  3387. struct __loadu_si128 {
  3388. __m128i __v;
  3389. } __attribute__((__packed__, __may_alias__));
  3390. return ((struct __loadu_si128*)__p)->__v;
  3391. }
  3392. /// \brief Returns a vector of [2 x i64] where the lower element is taken from
  3393. /// the lower element of the operand, and the upper element is zero.
  3394. ///
  3395. /// \headerfile <x86intrin.h>
  3396. ///
  3397. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  3398. ///
  3399. /// \param __p
  3400. /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
  3401. /// the destination.
  3402. /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
  3403. /// moved value. The higher order bits are cleared.
  3404. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3405. _mm_loadl_epi64(__m128i const *__p)
  3406. {
  3407. struct __mm_loadl_epi64_struct {
  3408. long long __u;
  3409. } __attribute__((__packed__, __may_alias__));
  3410. return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
  3411. }
  3412. /// \brief Generates a 128-bit vector of [4 x i32] with unspecified content.
  3413. /// This could be used as an argument to another intrinsic function where the
  3414. /// argument is required but the value is not actually used.
  3415. ///
  3416. /// \headerfile <x86intrin.h>
  3417. ///
  3418. /// This intrinsic has no corresponding instruction.
  3419. ///
  3420. /// \returns A 128-bit vector of [4 x i32] with unspecified content.
  3421. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3422. _mm_undefined_si128(void)
  3423. {
  3424. return (__m128i)__builtin_ia32_undef128();
  3425. }
  3426. /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
  3427. /// the specified 64-bit integer values.
  3428. ///
  3429. /// \headerfile <x86intrin.h>
  3430. ///
  3431. /// This intrinsic is a utility function and does not correspond to a specific
  3432. /// instruction.
  3433. ///
  3434. /// \param __q1
  3435. /// A 64-bit integer value used to initialize the upper 64 bits of the
  3436. /// destination vector of [2 x i64].
  3437. /// \param __q0
  3438. /// A 64-bit integer value used to initialize the lower 64 bits of the
  3439. /// destination vector of [2 x i64].
  3440. /// \returns An initialized 128-bit vector of [2 x i64] containing the values
  3441. /// provided in the operands.
  3442. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3443. _mm_set_epi64x(long long __q1, long long __q0)
  3444. {
  3445. return (__m128i){ __q0, __q1 };
  3446. }
  3447. /// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
  3448. /// the specified 64-bit integer values.
  3449. ///
  3450. /// \headerfile <x86intrin.h>
  3451. ///
  3452. /// This intrinsic is a utility function and does not correspond to a specific
  3453. /// instruction.
  3454. ///
  3455. /// \param __q1
  3456. /// A 64-bit integer value used to initialize the upper 64 bits of the
  3457. /// destination vector of [2 x i64].
  3458. /// \param __q0
  3459. /// A 64-bit integer value used to initialize the lower 64 bits of the
  3460. /// destination vector of [2 x i64].
  3461. /// \returns An initialized 128-bit vector of [2 x i64] containing the values
  3462. /// provided in the operands.
  3463. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3464. _mm_set_epi64(__m64 __q1, __m64 __q0)
  3465. {
  3466. return (__m128i){ (long long)__q0, (long long)__q1 };
  3467. }
  3468. /// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
  3469. /// the specified 32-bit integer values.
  3470. ///
  3471. /// \headerfile <x86intrin.h>
  3472. ///
  3473. /// This intrinsic is a utility function and does not correspond to a specific
  3474. /// instruction.
  3475. ///
  3476. /// \param __i3
  3477. /// A 32-bit integer value used to initialize bits [127:96] of the
  3478. /// destination vector.
  3479. /// \param __i2
  3480. /// A 32-bit integer value used to initialize bits [95:64] of the destination
  3481. /// vector.
  3482. /// \param __i1
  3483. /// A 32-bit integer value used to initialize bits [63:32] of the destination
  3484. /// vector.
  3485. /// \param __i0
  3486. /// A 32-bit integer value used to initialize bits [31:0] of the destination
  3487. /// vector.
  3488. /// \returns An initialized 128-bit vector of [4 x i32] containing the values
  3489. /// provided in the operands.
  3490. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3491. _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
  3492. {
  3493. return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
  3494. }
  3495. /// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
  3496. /// the specified 16-bit integer values.
  3497. ///
  3498. /// \headerfile <x86intrin.h>
  3499. ///
  3500. /// This intrinsic is a utility function and does not correspond to a specific
  3501. /// instruction.
  3502. ///
  3503. /// \param __w7
  3504. /// A 16-bit integer value used to initialize bits [127:112] of the
  3505. /// destination vector.
  3506. /// \param __w6
  3507. /// A 16-bit integer value used to initialize bits [111:96] of the
  3508. /// destination vector.
  3509. /// \param __w5
  3510. /// A 16-bit integer value used to initialize bits [95:80] of the destination
  3511. /// vector.
  3512. /// \param __w4
  3513. /// A 16-bit integer value used to initialize bits [79:64] of the destination
  3514. /// vector.
  3515. /// \param __w3
  3516. /// A 16-bit integer value used to initialize bits [63:48] of the destination
  3517. /// vector.
  3518. /// \param __w2
  3519. /// A 16-bit integer value used to initialize bits [47:32] of the destination
  3520. /// vector.
  3521. /// \param __w1
  3522. /// A 16-bit integer value used to initialize bits [31:16] of the destination
  3523. /// vector.
  3524. /// \param __w0
  3525. /// A 16-bit integer value used to initialize bits [15:0] of the destination
  3526. /// vector.
  3527. /// \returns An initialized 128-bit vector of [8 x i16] containing the values
  3528. /// provided in the operands.
  3529. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3530. _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
  3531. {
  3532. return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
  3533. }
  3534. /// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
  3535. /// the specified 8-bit integer values.
  3536. ///
  3537. /// \headerfile <x86intrin.h>
  3538. ///
  3539. /// This intrinsic is a utility function and does not correspond to a specific
  3540. /// instruction.
  3541. ///
  3542. /// \param __b15
  3543. /// Initializes bits [127:120] of the destination vector.
  3544. /// \param __b14
  3545. /// Initializes bits [119:112] of the destination vector.
  3546. /// \param __b13
  3547. /// Initializes bits [111:104] of the destination vector.
  3548. /// \param __b12
  3549. /// Initializes bits [103:96] of the destination vector.
  3550. /// \param __b11
  3551. /// Initializes bits [95:88] of the destination vector.
  3552. /// \param __b10
  3553. /// Initializes bits [87:80] of the destination vector.
  3554. /// \param __b9
  3555. /// Initializes bits [79:72] of the destination vector.
  3556. /// \param __b8
  3557. /// Initializes bits [71:64] of the destination vector.
  3558. /// \param __b7
  3559. /// Initializes bits [63:56] of the destination vector.
  3560. /// \param __b6
  3561. /// Initializes bits [55:48] of the destination vector.
  3562. /// \param __b5
  3563. /// Initializes bits [47:40] of the destination vector.
  3564. /// \param __b4
  3565. /// Initializes bits [39:32] of the destination vector.
  3566. /// \param __b3
  3567. /// Initializes bits [31:24] of the destination vector.
  3568. /// \param __b2
  3569. /// Initializes bits [23:16] of the destination vector.
  3570. /// \param __b1
  3571. /// Initializes bits [15:8] of the destination vector.
  3572. /// \param __b0
  3573. /// Initializes bits [7:0] of the destination vector.
  3574. /// \returns An initialized 128-bit vector of [16 x i8] containing the values
  3575. /// provided in the operands.
  3576. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3577. _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
  3578. {
  3579. return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
  3580. }
  3581. /// \brief Initializes both values in a 128-bit integer vector with the
  3582. /// specified 64-bit integer value.
  3583. ///
  3584. /// \headerfile <x86intrin.h>
  3585. ///
  3586. /// This intrinsic is a utility function and does not correspond to a specific
  3587. /// instruction.
  3588. ///
  3589. /// \param __q
  3590. /// Integer value used to initialize the elements of the destination integer
  3591. /// vector.
  3592. /// \returns An initialized 128-bit integer vector of [2 x i64] with both
  3593. /// elements containing the value provided in the operand.
  3594. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3595. _mm_set1_epi64x(long long __q)
  3596. {
  3597. return (__m128i){ __q, __q };
  3598. }
  3599. /// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
  3600. /// specified 64-bit value.
  3601. ///
  3602. /// \headerfile <x86intrin.h>
  3603. ///
  3604. /// This intrinsic is a utility function and does not correspond to a specific
  3605. /// instruction.
  3606. ///
  3607. /// \param __q
  3608. /// A 64-bit value used to initialize the elements of the destination integer
  3609. /// vector.
  3610. /// \returns An initialized 128-bit vector of [2 x i64] with all elements
  3611. /// containing the value provided in the operand.
  3612. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3613. _mm_set1_epi64(__m64 __q)
  3614. {
  3615. return (__m128i){ (long long)__q, (long long)__q };
  3616. }
  3617. /// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
  3618. /// specified 32-bit value.
  3619. ///
  3620. /// \headerfile <x86intrin.h>
  3621. ///
  3622. /// This intrinsic is a utility function and does not correspond to a specific
  3623. /// instruction.
  3624. ///
  3625. /// \param __i
  3626. /// A 32-bit value used to initialize the elements of the destination integer
  3627. /// vector.
  3628. /// \returns An initialized 128-bit vector of [4 x i32] with all elements
  3629. /// containing the value provided in the operand.
  3630. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3631. _mm_set1_epi32(int __i)
  3632. {
  3633. return (__m128i)(__v4si){ __i, __i, __i, __i };
  3634. }
  3635. /// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
  3636. /// specified 16-bit value.
  3637. ///
  3638. /// \headerfile <x86intrin.h>
  3639. ///
  3640. /// This intrinsic is a utility function and does not correspond to a specific
  3641. /// instruction.
  3642. ///
  3643. /// \param __w
  3644. /// A 16-bit value used to initialize the elements of the destination integer
  3645. /// vector.
  3646. /// \returns An initialized 128-bit vector of [8 x i16] with all elements
  3647. /// containing the value provided in the operand.
  3648. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3649. _mm_set1_epi16(short __w)
  3650. {
  3651. return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
  3652. }
  3653. /// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
  3654. /// specified 8-bit value.
  3655. ///
  3656. /// \headerfile <x86intrin.h>
  3657. ///
  3658. /// This intrinsic is a utility function and does not correspond to a specific
  3659. /// instruction.
  3660. ///
  3661. /// \param __b
  3662. /// An 8-bit value used to initialize the elements of the destination integer
  3663. /// vector.
  3664. /// \returns An initialized 128-bit vector of [16 x i8] with all elements
  3665. /// containing the value provided in the operand.
  3666. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3667. _mm_set1_epi8(char __b)
  3668. {
  3669. return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
  3670. }
  3671. /// \brief Constructs a 128-bit integer vector, initialized in reverse order
  3672. /// with the specified 64-bit integral values.
  3673. ///
  3674. /// \headerfile <x86intrin.h>
  3675. ///
  3676. /// This intrinsic does not correspond to a specific instruction.
  3677. ///
  3678. /// \param __q0
  3679. /// A 64-bit integral value used to initialize the lower 64 bits of the
  3680. /// result.
  3681. /// \param __q1
  3682. /// A 64-bit integral value used to initialize the upper 64 bits of the
  3683. /// result.
  3684. /// \returns An initialized 128-bit integer vector.
  3685. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3686. _mm_setr_epi64(__m64 __q0, __m64 __q1)
  3687. {
  3688. return (__m128i){ (long long)__q0, (long long)__q1 };
  3689. }
  3690. /// \brief Constructs a 128-bit integer vector, initialized in reverse order
  3691. /// with the specified 32-bit integral values.
  3692. ///
  3693. /// \headerfile <x86intrin.h>
  3694. ///
  3695. /// This intrinsic is a utility function and does not correspond to a specific
  3696. /// instruction.
  3697. ///
  3698. /// \param __i0
  3699. /// A 32-bit integral value used to initialize bits [31:0] of the result.
  3700. /// \param __i1
  3701. /// A 32-bit integral value used to initialize bits [63:32] of the result.
  3702. /// \param __i2
  3703. /// A 32-bit integral value used to initialize bits [95:64] of the result.
  3704. /// \param __i3
  3705. /// A 32-bit integral value used to initialize bits [127:96] of the result.
  3706. /// \returns An initialized 128-bit integer vector.
  3707. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3708. _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
  3709. {
  3710. return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
  3711. }
  3712. /// \brief Constructs a 128-bit integer vector, initialized in reverse order
  3713. /// with the specified 16-bit integral values.
  3714. ///
  3715. /// \headerfile <x86intrin.h>
  3716. ///
  3717. /// This intrinsic is a utility function and does not correspond to a specific
  3718. /// instruction.
  3719. ///
  3720. /// \param __w0
  3721. /// A 16-bit integral value used to initialize bits [15:0] of the result.
  3722. /// \param __w1
  3723. /// A 16-bit integral value used to initialize bits [31:16] of the result.
  3724. /// \param __w2
  3725. /// A 16-bit integral value used to initialize bits [47:32] of the result.
  3726. /// \param __w3
  3727. /// A 16-bit integral value used to initialize bits [63:48] of the result.
  3728. /// \param __w4
  3729. /// A 16-bit integral value used to initialize bits [79:64] of the result.
  3730. /// \param __w5
  3731. /// A 16-bit integral value used to initialize bits [95:80] of the result.
  3732. /// \param __w6
  3733. /// A 16-bit integral value used to initialize bits [111:96] of the result.
  3734. /// \param __w7
  3735. /// A 16-bit integral value used to initialize bits [127:112] of the result.
  3736. /// \returns An initialized 128-bit integer vector.
  3737. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3738. _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
  3739. {
  3740. return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
  3741. }
  3742. /// \brief Constructs a 128-bit integer vector, initialized in reverse order
  3743. /// with the specified 8-bit integral values.
  3744. ///
  3745. /// \headerfile <x86intrin.h>
  3746. ///
  3747. /// This intrinsic is a utility function and does not correspond to a specific
  3748. /// instruction.
  3749. ///
  3750. /// \param __b0
  3751. /// An 8-bit integral value used to initialize bits [7:0] of the result.
  3752. /// \param __b1
  3753. /// An 8-bit integral value used to initialize bits [15:8] of the result.
  3754. /// \param __b2
  3755. /// An 8-bit integral value used to initialize bits [23:16] of the result.
  3756. /// \param __b3
  3757. /// An 8-bit integral value used to initialize bits [31:24] of the result.
  3758. /// \param __b4
  3759. /// An 8-bit integral value used to initialize bits [39:32] of the result.
  3760. /// \param __b5
  3761. /// An 8-bit integral value used to initialize bits [47:40] of the result.
  3762. /// \param __b6
  3763. /// An 8-bit integral value used to initialize bits [55:48] of the result.
  3764. /// \param __b7
  3765. /// An 8-bit integral value used to initialize bits [63:56] of the result.
  3766. /// \param __b8
  3767. /// An 8-bit integral value used to initialize bits [71:64] of the result.
  3768. /// \param __b9
  3769. /// An 8-bit integral value used to initialize bits [79:72] of the result.
  3770. /// \param __b10
  3771. /// An 8-bit integral value used to initialize bits [87:80] of the result.
  3772. /// \param __b11
  3773. /// An 8-bit integral value used to initialize bits [95:88] of the result.
  3774. /// \param __b12
  3775. /// An 8-bit integral value used to initialize bits [103:96] of the result.
  3776. /// \param __b13
  3777. /// An 8-bit integral value used to initialize bits [111:104] of the result.
  3778. /// \param __b14
  3779. /// An 8-bit integral value used to initialize bits [119:112] of the result.
  3780. /// \param __b15
  3781. /// An 8-bit integral value used to initialize bits [127:120] of the result.
  3782. /// \returns An initialized 128-bit integer vector.
  3783. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3784. _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
  3785. {
  3786. return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
  3787. }
  3788. /// \brief Creates a 128-bit integer vector initialized to zero.
  3789. ///
  3790. /// \headerfile <x86intrin.h>
  3791. ///
  3792. /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
  3793. ///
  3794. /// \returns An initialized 128-bit integer vector with all elements set to
  3795. /// zero.
  3796. static __inline__ __m128i __DEFAULT_FN_ATTRS
  3797. _mm_setzero_si128(void)
  3798. {
  3799. return (__m128i){ 0LL, 0LL };
  3800. }
  3801. /// \brief Stores a 128-bit integer vector to a memory location aligned on a
  3802. /// 128-bit boundary.
  3803. ///
  3804. /// \headerfile <x86intrin.h>
  3805. ///
  3806. /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
  3807. ///
  3808. /// \param __p
  3809. /// A pointer to an aligned memory location that will receive the integer
  3810. /// values.
  3811. /// \param __b
  3812. /// A 128-bit integer vector containing the values to be moved.
  3813. static __inline__ void __DEFAULT_FN_ATTRS
  3814. _mm_store_si128(__m128i *__p, __m128i __b)
  3815. {
  3816. *__p = __b;
  3817. }
  3818. /// \brief Stores a 128-bit integer vector to an unaligned memory location.
  3819. ///
  3820. /// \headerfile <x86intrin.h>
  3821. ///
  3822. /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
  3823. ///
  3824. /// \param __p
  3825. /// A pointer to a memory location that will receive the integer values.
  3826. /// \param __b
  3827. /// A 128-bit integer vector containing the values to be moved.
  3828. static __inline__ void __DEFAULT_FN_ATTRS
  3829. _mm_storeu_si128(__m128i *__p, __m128i __b)
  3830. {
  3831. struct __storeu_si128 {
  3832. __m128i __v;
  3833. } __attribute__((__packed__, __may_alias__));
  3834. ((struct __storeu_si128*)__p)->__v = __b;
  3835. }
  3836. /// \brief Moves bytes selected by the mask from the first operand to the
  3837. /// specified unaligned memory location. When a mask bit is 1, the
  3838. /// corresponding byte is written, otherwise it is not written.
  3839. ///
  3840. /// To minimize caching, the data is flagged as non-temporal (unlikely to be
  3841. /// used again soon). Exception and trap behavior for elements not selected
  3842. /// for storage to memory are implementation dependent.
  3843. ///
  3844. /// \headerfile <x86intrin.h>
  3845. ///
  3846. /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
  3847. /// instruction.
  3848. ///
  3849. /// \param __d
  3850. /// A 128-bit integer vector containing the values to be moved.
  3851. /// \param __n
  /// A 128-bit integer vector containing the mask. The most significant bit of
  /// each byte is the mask bit for that byte.
  3854. /// \param __p
  3855. /// A pointer to an unaligned 128-bit memory location where the specified
  3856. /// values are moved.
  3857. static __inline__ void __DEFAULT_FN_ATTRS
  3858. _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
  3859. {
  3860. __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
  3861. }
  3862. /// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
  3863. /// a memory location.
  3864. ///
  3865. /// \headerfile <x86intrin.h>
  3866. ///
  3867. /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
  3868. ///
  3869. /// \param __p
  3870. /// A pointer to a 64-bit memory location that will receive the lower 64 bits
  3871. /// of the integer vector parameter.
  3872. /// \param __a
  3873. /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
  3874. /// value to be stored.
  3875. static __inline__ void __DEFAULT_FN_ATTRS
  3876. _mm_storel_epi64(__m128i *__p, __m128i __a)
  3877. {
  3878. struct __mm_storel_epi64_struct {
  3879. long long __u;
  3880. } __attribute__((__packed__, __may_alias__));
  3881. ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
  3882. }
  3883. /// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
  3884. /// aligned memory location.
  3885. ///
  3886. /// To minimize caching, the data is flagged as non-temporal (unlikely to be
  3887. /// used again soon).
  3888. ///
  3889. /// \headerfile <x86intrin.h>
  3890. ///
  3891. /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
  3892. ///
  3893. /// \param __p
  3894. /// A pointer to the 128-bit aligned memory location used to store the value.
  3895. /// \param __a
  3896. /// A vector of [2 x double] containing the 64-bit values to be stored.
  3897. static __inline__ void __DEFAULT_FN_ATTRS
  3898. _mm_stream_pd(double *__p, __m128d __a)
  3899. {
  3900. __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
  3901. }
  3902. /// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
  3903. ///
  3904. /// To minimize caching, the data is flagged as non-temporal (unlikely to be
  3905. /// used again soon).
  3906. ///
  3907. /// \headerfile <x86intrin.h>
  3908. ///
  3909. /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
  3910. ///
  3911. /// \param __p
  3912. /// A pointer to the 128-bit aligned memory location used to store the value.
  3913. /// \param __a
  3914. /// A 128-bit integer vector containing the values to be stored.
  3915. static __inline__ void __DEFAULT_FN_ATTRS
  3916. _mm_stream_si128(__m128i *__p, __m128i __a)
  3917. {
  3918. __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
  3919. }
  3920. /// \brief Stores a 32-bit integer value in the specified memory location.
  3921. ///
  3922. /// To minimize caching, the data is flagged as non-temporal (unlikely to be
  3923. /// used again soon).
  3924. ///
  3925. /// \headerfile <x86intrin.h>
  3926. ///
  3927. /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
  3928. ///
  3929. /// \param __p
  3930. /// A pointer to the 32-bit memory location used to store the value.
  3931. /// \param __a
  3932. /// A 32-bit integer containing the value to be stored.
  3933. static __inline__ void __DEFAULT_FN_ATTRS
  3934. _mm_stream_si32(int *__p, int __a)
  3935. {
  3936. __builtin_ia32_movnti(__p, __a);
  3937. }
  3938. #ifdef __x86_64__
  3939. /// \brief Stores a 64-bit integer value in the specified memory location.
  3940. ///
  3941. /// To minimize caching, the data is flagged as non-temporal (unlikely to be
  3942. /// used again soon).
  3943. ///
  3944. /// \headerfile <x86intrin.h>
  3945. ///
  3946. /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
  3947. ///
  3948. /// \param __p
  3949. /// A pointer to the 64-bit memory location used to store the value.
  3950. /// \param __a
  3951. /// A 64-bit integer containing the value to be stored.
  3952. static __inline__ void __DEFAULT_FN_ATTRS
  3953. _mm_stream_si64(long long *__p, long long __a)
  3954. {
  3955. __builtin_ia32_movnti64(__p, __a);
  3956. }
  3957. #endif
  #if defined(__cplusplus)
  extern "C" {
  #endif

  /// \brief Flushes and invalidates the cache line that contains \a __p from
  ///    every cache in the coherency domain.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
  ///
  /// \param __p
  ///    A pointer to the memory location that identifies the cache line to
  ///    flush.
  void _mm_clflush(const void *__p);

  /// \brief Forces strong memory ordering (serialization) between the load
  ///    instructions that precede this instruction and the load instructions
  ///    that follow it, so that all earlier loads complete before any later
  ///    load executes.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
  ///
  void _mm_lfence(void);

  /// \brief Forces strong memory ordering (serialization) between the load and
  ///    store instructions that precede this instruction and the load and
  ///    store instructions that follow it, so that all earlier memory accesses
  ///    complete before any later memory access executes.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
  ///
  void _mm_mfence(void);

  #if defined(__cplusplus)
  } /* extern "C" */
  #endif
  3995. /// \brief Converts 16-bit signed integers from both 128-bit integer vector
  3996. /// operands into 8-bit signed integers, and packs the results into the
  3997. /// destination. Positive values greater than 0x7F are saturated to 0x7F.
  3998. /// Negative values less than 0x80 are saturated to 0x80.
  3999. ///
  4000. /// \headerfile <x86intrin.h>
  4001. ///
  4002. /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
  4003. ///
  4004. /// \param __a
  4005. /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4006. /// a signed integer and is converted to a 8-bit signed integer with
  4007. /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
  4008. /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
  4009. /// written to the lower 64 bits of the result.
  4010. /// \param __b
  4011. /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4012. /// a signed integer and is converted to a 8-bit signed integer with
  4013. /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
  4014. /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
  4015. /// written to the higher 64 bits of the result.
  4016. /// \returns A 128-bit vector of [16 x i8] containing the converted values.
  4017. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4018. _mm_packs_epi16(__m128i __a, __m128i __b)
  4019. {
  4020. return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
  4021. }
  4022. /// \brief Converts 32-bit signed integers from both 128-bit integer vector
  4023. /// operands into 16-bit signed integers, and packs the results into the
  4024. /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
  4025. /// Negative values less than 0x8000 are saturated to 0x8000.
  4026. ///
  4027. /// \headerfile <x86intrin.h>
  4028. ///
  4029. /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
  4030. ///
  4031. /// \param __a
  4032. /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
  4033. /// a signed integer and is converted to a 16-bit signed integer with
  4034. /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
  4035. /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
  4036. /// are written to the lower 64 bits of the result.
  4037. /// \param __b
  4038. /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
  4039. /// a signed integer and is converted to a 16-bit signed integer with
  4040. /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
  4041. /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
  4042. /// are written to the higher 64 bits of the result.
  4043. /// \returns A 128-bit vector of [8 x i16] containing the converted values.
  4044. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4045. _mm_packs_epi32(__m128i __a, __m128i __b)
  4046. {
  4047. return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
  4048. }
  4049. /// \brief Converts 16-bit signed integers from both 128-bit integer vector
  4050. /// operands into 8-bit unsigned integers, and packs the results into the
  4051. /// destination. Values greater than 0xFF are saturated to 0xFF. Values less
  4052. /// than 0x00 are saturated to 0x00.
  4053. ///
  4054. /// \headerfile <x86intrin.h>
  4055. ///
  4056. /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
  4057. ///
  4058. /// \param __a
  4059. /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4060. /// a signed integer and is converted to an 8-bit unsigned integer with
  4061. /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
  4062. /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
  4063. /// written to the lower 64 bits of the result.
  4064. /// \param __b
  4065. /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
  4066. /// a signed integer and is converted to an 8-bit unsigned integer with
  4067. /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
  4068. /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
  4069. /// written to the higher 64 bits of the result.
  4070. /// \returns A 128-bit vector of [16 x i8] containing the converted values.
  4071. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4072. _mm_packus_epi16(__m128i __a, __m128i __b)
  4073. {
  4074. return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
  4075. }
  4076. /// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
  4077. /// the immediate-value parameter as a selector.
  4078. ///
  4079. /// \headerfile <x86intrin.h>
  4080. ///
  4081. /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
  4082. ///
  4083. /// \param __a
  4084. /// A 128-bit integer vector.
  4085. /// \param __imm
  /// An immediate value. Bits [2:0] select the value from \a __a to be assigned
  /// to bits [15:0] of the result. \n
  4088. /// 000: assign values from bits [15:0] of \a __a. \n
  4089. /// 001: assign values from bits [31:16] of \a __a. \n
  4090. /// 010: assign values from bits [47:32] of \a __a. \n
  4091. /// 011: assign values from bits [63:48] of \a __a. \n
  4092. /// 100: assign values from bits [79:64] of \a __a. \n
  4093. /// 101: assign values from bits [95:80] of \a __a. \n
  4094. /// 110: assign values from bits [111:96] of \a __a. \n
  4095. /// 111: assign values from bits [127:112] of \a __a.
  4096. /// \returns An integer, whose lower 16 bits are selected from the 128-bit
  4097. /// integer vector parameter and the remaining bits are assigned zeros.
  4098. static __inline__ int __DEFAULT_FN_ATTRS
  4099. _mm_extract_epi16(__m128i __a, int __imm)
  4100. {
  4101. __v8hi __b = (__v8hi)__a;
  4102. return (unsigned short)__b[__imm & 7];
  4103. }
  4104. /// \brief Constructs a 128-bit integer vector by first making a copy of the
  4105. /// 128-bit integer vector parameter, and then inserting the lower 16 bits
  4106. /// of an integer parameter into an offset specified by the immediate-value
  4107. /// parameter.
  4108. ///
  4109. /// \headerfile <x86intrin.h>
  4110. ///
  4111. /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
  4112. ///
  4113. /// \param __a
  4114. /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
  4115. /// result and then one of the eight elements in the result is replaced by
  4116. /// the lower 16 bits of \a __b.
  4117. /// \param __b
  4118. /// An integer. The lower 16 bits of this parameter are written to the
  4119. /// result beginning at an offset specified by \a __imm.
  4120. /// \param __imm
  /// An immediate value. Bits [2:0] select which of the eight 16-bit elements
  /// of the result is replaced by the lower 16 bits of \a __b.
  4123. /// \returns A 128-bit integer vector containing the constructed values.
  4124. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4125. _mm_insert_epi16(__m128i __a, int __b, int __imm)
  4126. {
  4127. __v8hi __c = (__v8hi)__a;
  4128. __c[__imm & 7] = __b;
  4129. return (__m128i)__c;
  4130. }
  4131. /// \brief Copies the values of the most significant bits from each 8-bit
  4132. /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
  4133. /// value, zero-extends the value, and writes it to the destination.
  4134. ///
  4135. /// \headerfile <x86intrin.h>
  4136. ///
  4137. /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
  4138. ///
  4139. /// \param __a
  4140. /// A 128-bit integer vector containing the values with bits to be extracted.
  4141. /// \returns The most significant bits from each 8-bit element in \a __a,
  4142. /// written to bits [15:0]. The other bits are assigned zeros.
  4143. static __inline__ int __DEFAULT_FN_ATTRS
  4144. _mm_movemask_epi8(__m128i __a)
  4145. {
  4146. return __builtin_ia32_pmovmskb128((__v16qi)__a);
  4147. }
  4148. /// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
  4149. /// elements of a 128-bit integer vector parameter, using the immediate-value
  4150. /// parameter as a specifier.
  4151. ///
  4152. /// \headerfile <x86intrin.h>
  4153. ///
  4154. /// \code
  4155. /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
  4156. /// \endcode
  4157. ///
  4158. /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
  4159. ///
  4160. /// \param a
  4161. /// A 128-bit integer vector containing the values to be copied.
  4162. /// \param imm
  /// An immediate value containing an 8-bit value specifying which elements to
  /// copy from \a a. The destinations within the 128-bit destination are assigned
  4165. /// values as follows: \n
  4166. /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
  4167. /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
  4168. /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
  4169. /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
  4170. /// Bit value assignments: \n
  4171. /// 00: assign values from bits [31:0] of \a a. \n
  4172. /// 01: assign values from bits [63:32] of \a a. \n
  4173. /// 10: assign values from bits [95:64] of \a a. \n
  4174. /// 11: assign values from bits [127:96] of \a a.
  4175. /// \returns A 128-bit integer vector containing the shuffled values.
  #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
    (__m128i)__builtin_shufflevector( \
        (__v4si)(__m128i)(a), \
        (__v4si)_mm_undefined_si128(), \
        /* one 2-bit source index per result element, element 0 first */ \
        ((imm) >> 0) & 0x3, \
        ((imm) >> 2) & 0x3, \
        ((imm) >> 4) & 0x3, \
        ((imm) >> 6) & 0x3); })
  /// \brief Builds a 128-bit integer vector by shuffling the four lower 16-bit
  ///    elements of a 128-bit integer vector of [8 x i16], as directed by the
  ///    immediate-value parameter. The upper four elements pass through
  ///    unchanged.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
  ///
  /// \param a
  ///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
  ///    [127:64] of the result.
  /// \param imm
  ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
  ///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
  ///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
  ///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
  ///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
  ///    Bit value assignments: \n
  ///    00: assign values from bits [15:0] of \a a. \n
  ///    01: assign values from bits [31:16] of \a a. \n
  ///    10: assign values from bits [47:32] of \a a. \n
  ///    11: assign values from bits [63:48] of \a a.
  /// \returns A 128-bit integer vector containing the shuffled values.
  ///
  /// \note This macro expands to a plain parenthesized expression rather than a
  ///    GNU statement expression, so it can be used anywhere an expression is
  ///    permitted (statement expressions are rejected outside function bodies).
  ///    All indices stay within [0, 7], so the second (undefined) shuffle
  ///    operand is never read.
  #define _mm_shufflelo_epi16(a, imm) \
    ((__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                      (__v8hi)_mm_undefined_si128(), \
                                      ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                      ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                      4, 5, 6, 7))
  /// \brief Builds a 128-bit integer vector by shuffling the four upper 16-bit
  ///    elements of a 128-bit integer vector of [8 x i16], as directed by the
  ///    immediate-value parameter. The lower four elements pass through
  ///    unchanged.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
  ///
  /// \param a
  ///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
  ///    [63:0] of the result.
  /// \param imm
  ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
  ///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
  ///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
  ///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
  ///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
  ///    Bit value assignments: \n
  ///    00: assign values from bits [79:64] of \a a. \n
  ///    01: assign values from bits [95:80] of \a a. \n
  ///    10: assign values from bits [111:96] of \a a. \n
  ///    11: assign values from bits [127:112] of \a a.
  /// \returns A 128-bit integer vector containing the shuffled values.
  ///
  /// \note This macro expands to a plain parenthesized expression rather than a
  ///    GNU statement expression, so it can be used anywhere an expression is
  ///    permitted (statement expressions are rejected outside function bodies).
  ///    All indices stay within [0, 7], so the second (undefined) shuffle
  ///    operand is never read.
  #define _mm_shufflehi_epi16(a, imm) \
    ((__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                      (__v8hi)_mm_undefined_si128(), \
                                      0, 1, 2, 3, \
                                      4 + (((imm) >> 0) & 0x3), \
                                      4 + (((imm) >> 2) & 0x3), \
                                      4 + (((imm) >> 4) & 0x3), \
                                      4 + (((imm) >> 6) & 0x3)))
  4249. /// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
  4250. /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
  4251. ///
  4252. /// \headerfile <x86intrin.h>
  4253. ///
  4254. /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
  4255. /// instruction.
  4256. ///
  4257. /// \param __a
  4258. /// A 128-bit vector of [16 x i8].
  4259. /// Bits [71:64] are written to bits [7:0] of the result. \n
  4260. /// Bits [79:72] are written to bits [23:16] of the result. \n
  4261. /// Bits [87:80] are written to bits [39:32] of the result. \n
  4262. /// Bits [95:88] are written to bits [55:48] of the result. \n
  4263. /// Bits [103:96] are written to bits [71:64] of the result. \n
  4264. /// Bits [111:104] are written to bits [87:80] of the result. \n
  4265. /// Bits [119:112] are written to bits [103:96] of the result. \n
  4266. /// Bits [127:120] are written to bits [119:112] of the result.
  4267. /// \param __b
  4268. /// A 128-bit vector of [16 x i8]. \n
  4269. /// Bits [71:64] are written to bits [15:8] of the result. \n
  4270. /// Bits [79:72] are written to bits [31:24] of the result. \n
  4271. /// Bits [87:80] are written to bits [47:40] of the result. \n
  4272. /// Bits [95:88] are written to bits [63:56] of the result. \n
  4273. /// Bits [103:96] are written to bits [79:72] of the result. \n
  4274. /// Bits [111:104] are written to bits [95:88] of the result. \n
  4275. /// Bits [119:112] are written to bits [111:104] of the result. \n
  4276. /// Bits [127:120] are written to bits [127:120] of the result.
  4277. /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
  4278. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4279. _mm_unpackhi_epi8(__m128i __a, __m128i __b)
  4280. {
  4281. return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
  4282. }
  4283. /// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
  4284. /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
  4285. ///
  4286. /// \headerfile <x86intrin.h>
  4287. ///
  4288. /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
  4289. /// instruction.
  4290. ///
  4291. /// \param __a
  4292. /// A 128-bit vector of [8 x i16].
  4293. /// Bits [79:64] are written to bits [15:0] of the result. \n
  4294. /// Bits [95:80] are written to bits [47:32] of the result. \n
  4295. /// Bits [111:96] are written to bits [79:64] of the result. \n
  4296. /// Bits [127:112] are written to bits [111:96] of the result.
  4297. /// \param __b
  4298. /// A 128-bit vector of [8 x i16].
  4299. /// Bits [79:64] are written to bits [31:16] of the result. \n
  4300. /// Bits [95:80] are written to bits [63:48] of the result. \n
  4301. /// Bits [111:96] are written to bits [95:80] of the result. \n
  4302. /// Bits [127:112] are written to bits [127:112] of the result.
  4303. /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
  4304. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4305. _mm_unpackhi_epi16(__m128i __a, __m128i __b)
  4306. {
  4307. return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
  4308. }
  4309. /// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
  4310. /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
  4311. ///
  4312. /// \headerfile <x86intrin.h>
  4313. ///
  4314. /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
  4315. /// instruction.
  4316. ///
  4317. /// \param __a
  4318. /// A 128-bit vector of [4 x i32]. \n
  4319. /// Bits [95:64] are written to bits [31:0] of the destination. \n
  4320. /// Bits [127:96] are written to bits [95:64] of the destination.
  4321. /// \param __b
  4322. /// A 128-bit vector of [4 x i32]. \n
  4323. /// Bits [95:64] are written to bits [64:32] of the destination. \n
  4324. /// Bits [127:96] are written to bits [127:96] of the destination.
  4325. /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
  4326. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4327. _mm_unpackhi_epi32(__m128i __a, __m128i __b)
  4328. {
  4329. return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
  4330. }
  4331. /// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
  4332. /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
  4333. ///
  4334. /// \headerfile <x86intrin.h>
  4335. ///
  4336. /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
  4337. /// instruction.
  4338. ///
  4339. /// \param __a
  4340. /// A 128-bit vector of [2 x i64]. \n
  4341. /// Bits [127:64] are written to bits [63:0] of the destination.
  4342. /// \param __b
  4343. /// A 128-bit vector of [2 x i64]. \n
  4344. /// Bits [127:64] are written to bits [127:64] of the destination.
  4345. /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
  4346. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4347. _mm_unpackhi_epi64(__m128i __a, __m128i __b)
  4348. {
  4349. return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
  4350. }
  4351. /// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
  4352. /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
  4353. ///
  4354. /// \headerfile <x86intrin.h>
  4355. ///
  4356. /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
  4357. /// instruction.
  4358. ///
  4359. /// \param __a
  4360. /// A 128-bit vector of [16 x i8]. \n
  4361. /// Bits [7:0] are written to bits [7:0] of the result. \n
  4362. /// Bits [15:8] are written to bits [23:16] of the result. \n
  4363. /// Bits [23:16] are written to bits [39:32] of the result. \n
  4364. /// Bits [31:24] are written to bits [55:48] of the result. \n
  4365. /// Bits [39:32] are written to bits [71:64] of the result. \n
  4366. /// Bits [47:40] are written to bits [87:80] of the result. \n
  4367. /// Bits [55:48] are written to bits [103:96] of the result. \n
  4368. /// Bits [63:56] are written to bits [119:112] of the result.
  4369. /// \param __b
  4370. /// A 128-bit vector of [16 x i8].
  4371. /// Bits [7:0] are written to bits [15:8] of the result. \n
  4372. /// Bits [15:8] are written to bits [31:24] of the result. \n
  4373. /// Bits [23:16] are written to bits [47:40] of the result. \n
  4374. /// Bits [31:24] are written to bits [63:56] of the result. \n
  4375. /// Bits [39:32] are written to bits [79:72] of the result. \n
  4376. /// Bits [47:40] are written to bits [95:88] of the result. \n
  4377. /// Bits [55:48] are written to bits [111:104] of the result. \n
  4378. /// Bits [63:56] are written to bits [127:120] of the result.
  4379. /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
  4380. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4381. _mm_unpacklo_epi8(__m128i __a, __m128i __b)
  4382. {
  4383. return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
  4384. }
  4385. /// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
  4386. /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
  4387. /// [8 x i16].
  4388. ///
  4389. /// \headerfile <x86intrin.h>
  4390. ///
  4391. /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
  4392. /// instruction.
  4393. ///
  4394. /// \param __a
  4395. /// A 128-bit vector of [8 x i16].
  4396. /// Bits [15:0] are written to bits [15:0] of the result. \n
  4397. /// Bits [31:16] are written to bits [47:32] of the result. \n
  4398. /// Bits [47:32] are written to bits [79:64] of the result. \n
  4399. /// Bits [63:48] are written to bits [111:96] of the result.
  4400. /// \param __b
  4401. /// A 128-bit vector of [8 x i16].
  4402. /// Bits [15:0] are written to bits [31:16] of the result. \n
  4403. /// Bits [31:16] are written to bits [63:48] of the result. \n
  4404. /// Bits [47:32] are written to bits [95:80] of the result. \n
  4405. /// Bits [63:48] are written to bits [127:112] of the result.
  4406. /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
  4407. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4408. _mm_unpacklo_epi16(__m128i __a, __m128i __b)
  4409. {
  4410. return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
  4411. }
  4412. /// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
  4413. /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
  4414. ///
  4415. /// \headerfile <x86intrin.h>
  4416. ///
  4417. /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
  4418. /// instruction.
  4419. ///
  4420. /// \param __a
  4421. /// A 128-bit vector of [4 x i32]. \n
  4422. /// Bits [31:0] are written to bits [31:0] of the destination. \n
  4423. /// Bits [63:32] are written to bits [95:64] of the destination.
  4424. /// \param __b
  4425. /// A 128-bit vector of [4 x i32]. \n
  4426. /// Bits [31:0] are written to bits [64:32] of the destination. \n
  4427. /// Bits [63:32] are written to bits [127:96] of the destination.
  4428. /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
  4429. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4430. _mm_unpacklo_epi32(__m128i __a, __m128i __b)
  4431. {
  4432. return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
  4433. }
  4434. /// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
  4435. /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
  4436. ///
  4437. /// \headerfile <x86intrin.h>
  4438. ///
  4439. /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
  4440. /// instruction.
  4441. ///
  4442. /// \param __a
  4443. /// A 128-bit vector of [2 x i64]. \n
  4444. /// Bits [63:0] are written to bits [63:0] of the destination. \n
  4445. /// \param __b
  4446. /// A 128-bit vector of [2 x i64]. \n
  4447. /// Bits [63:0] are written to bits [127:64] of the destination. \n
  4448. /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
  4449. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4450. _mm_unpacklo_epi64(__m128i __a, __m128i __b)
  4451. {
  4452. return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
  4453. }
  4454. /// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
  4455. /// integer.
  4456. ///
  4457. /// \headerfile <x86intrin.h>
  4458. ///
  4459. /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
  4460. ///
  4461. /// \param __a
  4462. /// A 128-bit integer vector operand. The lower 64 bits are moved to the
  4463. /// destination.
  4464. /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
  4465. static __inline__ __m64 __DEFAULT_FN_ATTRS
  4466. _mm_movepi64_pi64(__m128i __a)
  4467. {
  4468. return (__m64)__a[0];
  4469. }
  4470. /// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
  4471. /// upper bits.
  4472. ///
  4473. /// \headerfile <x86intrin.h>
  4474. ///
  4475. /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
  4476. ///
  4477. /// \param __a
  4478. /// A 64-bit value.
  4479. /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
  4480. /// the operand. The upper 64 bits are assigned zeros.
  4481. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4482. _mm_movpi64_epi64(__m64 __a)
  4483. {
  4484. return (__m128i){ (long long)__a, 0 };
  4485. }
  4486. /// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
  4487. /// integer vector, zeroing the upper bits.
  4488. ///
  4489. /// \headerfile <x86intrin.h>
  4490. ///
  4491. /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
  4492. ///
  4493. /// \param __a
  4494. /// A 128-bit integer vector operand. The lower 64 bits are moved to the
  4495. /// destination.
  4496. /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
  4497. /// the operand. The upper 64 bits are assigned zeros.
  4498. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4499. _mm_move_epi64(__m128i __a)
  4500. {
  4501. return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
  4502. }
  4503. /// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of
  4504. /// [2 x double] and interleaves them into a 128-bit vector of [2 x
  4505. /// double].
  4506. ///
  4507. /// \headerfile <x86intrin.h>
  4508. ///
  4509. /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
  4510. ///
  4511. /// \param __a
  4512. /// A 128-bit vector of [2 x double]. \n
  4513. /// Bits [127:64] are written to bits [63:0] of the destination.
  4514. /// \param __b
  4515. /// A 128-bit vector of [2 x double]. \n
  4516. /// Bits [127:64] are written to bits [127:64] of the destination.
  4517. /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
  4518. static __inline__ __m128d __DEFAULT_FN_ATTRS
  4519. _mm_unpackhi_pd(__m128d __a, __m128d __b)
  4520. {
  4521. return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
  4522. }
  4523. /// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors
  4524. /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
  4525. /// double].
  4526. ///
  4527. /// \headerfile <x86intrin.h>
  4528. ///
  4529. /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
  4530. ///
  4531. /// \param __a
  4532. /// A 128-bit vector of [2 x double]. \n
  4533. /// Bits [63:0] are written to bits [63:0] of the destination.
  4534. /// \param __b
  4535. /// A 128-bit vector of [2 x double]. \n
  4536. /// Bits [63:0] are written to bits [127:64] of the destination.
  4537. /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
  4538. static __inline__ __m128d __DEFAULT_FN_ATTRS
  4539. _mm_unpacklo_pd(__m128d __a, __m128d __b)
  4540. {
  4541. return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
  4542. }
  4543. /// \brief Extracts the sign bits of the double-precision values in the 128-bit
  4544. /// vector of [2 x double], zero-extends the value, and writes it to the
  4545. /// low-order bits of the destination.
  4546. ///
  4547. /// \headerfile <x86intrin.h>
  4548. ///
  4549. /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
  4550. ///
  4551. /// \param __a
  4552. /// A 128-bit vector of [2 x double] containing the values with sign bits to
  4553. /// be extracted.
  4554. /// \returns The sign bits from each of the double-precision elements in \a __a,
  4555. /// written to bits [1:0]. The remaining bits are assigned values of zero.
  4556. static __inline__ int __DEFAULT_FN_ATTRS
  4557. _mm_movemask_pd(__m128d __a)
  4558. {
  4559. return __builtin_ia32_movmskpd((__v2df)__a);
  4560. }
  /// \brief Builds a 128-bit floating-point vector of [2 x double] by taking
  ///    one element from each of two 128-bit vector parameters of [2 x double],
  ///    as directed by the immediate-value parameter.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
  ///
  /// \param a
  ///    A 128-bit vector of [2 x double].
  /// \param b
  ///    A 128-bit vector of [2 x double].
  /// \param i
  ///    An 8-bit immediate value. The least significant two bits specify which
  ///    elements to copy from \a a and \a b: \n
  ///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
  ///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
  ///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
  ///    Bit[1] = 1: upper element of \a b copied to upper element of result.
  /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
  ///
  /// \note This macro expands to a plain parenthesized expression rather than a
  ///    GNU statement expression, so it can be used anywhere an expression is
  ///    permitted (statement expressions are rejected outside function bodies).
  #define _mm_shuffle_pd(a, b, i) \
    ((__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), \
                                      (__v2df)(__m128d)(b), \
                                      0 + (((i) >> 0) & 0x1), \
                                      2 + (((i) >> 1) & 0x1)))
  4589. /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
  4590. /// floating-point vector of [4 x float].
  4591. ///
  4592. /// \headerfile <x86intrin.h>
  4593. ///
  4594. /// This intrinsic has no corresponding instruction.
  4595. ///
  4596. /// \param __a
  4597. /// A 128-bit floating-point vector of [2 x double].
  4598. /// \returns A 128-bit floating-point vector of [4 x float] containing the same
  4599. /// bitwise pattern as the parameter.
  4600. static __inline__ __m128 __DEFAULT_FN_ATTRS
  4601. _mm_castpd_ps(__m128d __a)
  4602. {
  4603. return (__m128)__a;
  4604. }
  4605. /// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
  4606. /// integer vector.
  4607. ///
  4608. /// \headerfile <x86intrin.h>
  4609. ///
  4610. /// This intrinsic has no corresponding instruction.
  4611. ///
  4612. /// \param __a
  4613. /// A 128-bit floating-point vector of [2 x double].
  4614. /// \returns A 128-bit integer vector containing the same bitwise pattern as the
  4615. /// parameter.
  4616. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4617. _mm_castpd_si128(__m128d __a)
  4618. {
  4619. return (__m128i)__a;
  4620. }
  4621. /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
  4622. /// floating-point vector of [2 x double].
  4623. ///
  4624. /// \headerfile <x86intrin.h>
  4625. ///
  4626. /// This intrinsic has no corresponding instruction.
  4627. ///
  4628. /// \param __a
  4629. /// A 128-bit floating-point vector of [4 x float].
  4630. /// \returns A 128-bit floating-point vector of [2 x double] containing the same
  4631. /// bitwise pattern as the parameter.
  4632. static __inline__ __m128d __DEFAULT_FN_ATTRS
  4633. _mm_castps_pd(__m128 __a)
  4634. {
  4635. return (__m128d)__a;
  4636. }
  4637. /// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
  4638. /// integer vector.
  4639. ///
  4640. /// \headerfile <x86intrin.h>
  4641. ///
  4642. /// This intrinsic has no corresponding instruction.
  4643. ///
  4644. /// \param __a
  4645. /// A 128-bit floating-point vector of [4 x float].
  4646. /// \returns A 128-bit integer vector containing the same bitwise pattern as the
  4647. /// parameter.
  4648. static __inline__ __m128i __DEFAULT_FN_ATTRS
  4649. _mm_castps_si128(__m128 __a)
  4650. {
  4651. return (__m128i)__a;
  4652. }
  4653. /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
  4654. /// of [4 x float].
  4655. ///
  4656. /// \headerfile <x86intrin.h>
  4657. ///
  4658. /// This intrinsic has no corresponding instruction.
  4659. ///
  4660. /// \param __a
  4661. /// A 128-bit integer vector.
  4662. /// \returns A 128-bit floating-point vector of [4 x float] containing the same
  4663. /// bitwise pattern as the parameter.
  4664. static __inline__ __m128 __DEFAULT_FN_ATTRS
  4665. _mm_castsi128_ps(__m128i __a)
  4666. {
  4667. return (__m128)__a;
  4668. }
  4669. /// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
  4670. /// of [2 x double].
  4671. ///
  4672. /// \headerfile <x86intrin.h>
  4673. ///
  4674. /// This intrinsic has no corresponding instruction.
  4675. ///
  4676. /// \param __a
  4677. /// A 128-bit integer vector.
  4678. /// \returns A 128-bit floating-point vector of [2 x double] containing the same
  4679. /// bitwise pattern as the parameter.
  4680. static __inline__ __m128d __DEFAULT_FN_ATTRS
  4681. _mm_castsi128_pd(__m128i __a)
  4682. {
  4683. return (__m128d)__a;
  4684. }
  #ifdef __cplusplus
  extern "C" {
  #endif

  /// \brief Tells the processor that the caller is executing a spin loop, so
  ///    that power consumption can be optimized for the duration of the loop.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
  ///
  /// Only declared here, with C linkage when compiled as C++.
  void _mm_pause(void);

  #ifdef __cplusplus
  } // extern "C"
  #endif
  4699. #undef __DEFAULT_FN_ATTRS
  /* Packs two single-bit selectors into a 2-bit immediate: x lands in bit 1 and
     y in bit 0. This matches the bit layout that _mm_shuffle_pd reads from its
     immediate argument. */
  #define _MM_SHUFFLE2(x, y) \
    (((x) << 1) | (y))
  /* Denormals-are-zero control. The mode occupies the single bit 0x0040 of the
     control/status word read by _mm_getcsr() and written by _mm_setcsr(). */
  #define _MM_DENORMALS_ZERO_ON   (0x0040)
  #define _MM_DENORMALS_ZERO_OFF  (0x0000)
  #define _MM_DENORMALS_ZERO_MASK (0x0040)

  /* Yields the current mode: the control/status word masked down to the
     denormals-zero bit (compare against _MM_DENORMALS_ZERO_ON / _OFF). */
  #define _MM_GET_DENORMALS_ZERO_MODE() \
    (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

  /* Read-modify-write: clears the denormals-zero bit in the current
     control/status word, ORs in x, and stores the result back. */
  #define _MM_SET_DENORMALS_ZERO_MODE(x) \
    (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
  4706. #endif /* __EMMINTRIN_H */