tmmintrin.h 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781
  1. /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
  2. *
  3. * Permission is hereby granted, free of charge, to any person obtaining a copy
  4. * of this software and associated documentation files (the "Software"), to deal
  5. * in the Software without restriction, including without limitation the rights
  6. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. * copies of the Software, and to permit persons to whom the Software is
  8. * furnished to do so, subject to the following conditions:
  9. *
  10. * The above copyright notice and this permission notice shall be included in
  11. * all copies or substantial portions of the Software.
  12. *
  13. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. * THE SOFTWARE.
  20. *
  21. *===-----------------------------------------------------------------------===
  22. */
  23. #ifndef __TMMINTRIN_H
  24. #define __TMMINTRIN_H
  25. #include <pmmintrin.h>
  /* Attribute set applied to every intrinsic defined in this header: always
   * inline the wrapper, emit no debug info for it, and compile it for the
   * "ssse3" target feature. */
  #define __DEFAULT_FN_ATTRS \
    __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
  28. /// \brief Computes the absolute value of each of the packed 8-bit signed
  29. /// integers in the source operand and stores the 8-bit unsigned integer
  30. /// results in the destination.
  31. ///
  32. /// \headerfile <x86intrin.h>
  33. ///
  34. /// This intrinsic corresponds to the \c PABSB instruction.
  35. ///
  36. /// \param __a
  37. /// A 64-bit vector of [8 x i8].
  38. /// \returns A 64-bit integer vector containing the absolute values of the
  39. /// elements in the operand.
  40. static __inline__ __m64 __DEFAULT_FN_ATTRS
  41. _mm_abs_pi8(__m64 __a)
  42. {
  43. return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
  44. }
  45. /// \brief Computes the absolute value of each of the packed 8-bit signed
  46. /// integers in the source operand and stores the 8-bit unsigned integer
  47. /// results in the destination.
  48. ///
  49. /// \headerfile <x86intrin.h>
  50. ///
  51. /// This intrinsic corresponds to the \c VPABSB instruction.
  52. ///
  53. /// \param __a
  54. /// A 128-bit vector of [16 x i8].
  55. /// \returns A 128-bit integer vector containing the absolute values of the
  56. /// elements in the operand.
  57. static __inline__ __m128i __DEFAULT_FN_ATTRS
  58. _mm_abs_epi8(__m128i __a)
  59. {
  60. return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
  61. }
  62. /// \brief Computes the absolute value of each of the packed 16-bit signed
  63. /// integers in the source operand and stores the 16-bit unsigned integer
  64. /// results in the destination.
  65. ///
  66. /// \headerfile <x86intrin.h>
  67. ///
  68. /// This intrinsic corresponds to the \c PABSW instruction.
  69. ///
  70. /// \param __a
  71. /// A 64-bit vector of [4 x i16].
  72. /// \returns A 64-bit integer vector containing the absolute values of the
  73. /// elements in the operand.
  74. static __inline__ __m64 __DEFAULT_FN_ATTRS
  75. _mm_abs_pi16(__m64 __a)
  76. {
  77. return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
  78. }
  79. /// \brief Computes the absolute value of each of the packed 16-bit signed
  80. /// integers in the source operand and stores the 16-bit unsigned integer
  81. /// results in the destination.
  82. ///
  83. /// \headerfile <x86intrin.h>
  84. ///
  85. /// This intrinsic corresponds to the \c VPABSW instruction.
  86. ///
  87. /// \param __a
  88. /// A 128-bit vector of [8 x i16].
  89. /// \returns A 128-bit integer vector containing the absolute values of the
  90. /// elements in the operand.
  91. static __inline__ __m128i __DEFAULT_FN_ATTRS
  92. _mm_abs_epi16(__m128i __a)
  93. {
  94. return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
  95. }
  96. /// \brief Computes the absolute value of each of the packed 32-bit signed
  97. /// integers in the source operand and stores the 32-bit unsigned integer
  98. /// results in the destination.
  99. ///
  100. /// \headerfile <x86intrin.h>
  101. ///
  102. /// This intrinsic corresponds to the \c PABSD instruction.
  103. ///
  104. /// \param __a
  105. /// A 64-bit vector of [2 x i32].
  106. /// \returns A 64-bit integer vector containing the absolute values of the
  107. /// elements in the operand.
  108. static __inline__ __m64 __DEFAULT_FN_ATTRS
  109. _mm_abs_pi32(__m64 __a)
  110. {
  111. return (__m64)__builtin_ia32_pabsd((__v2si)__a);
  112. }
  113. /// \brief Computes the absolute value of each of the packed 32-bit signed
  114. /// integers in the source operand and stores the 32-bit unsigned integer
  115. /// results in the destination.
  116. ///
  117. /// \headerfile <x86intrin.h>
  118. ///
  119. /// This intrinsic corresponds to the \c VPABSD instruction.
  120. ///
  121. /// \param __a
  122. /// A 128-bit vector of [4 x i32].
  123. /// \returns A 128-bit integer vector containing the absolute values of the
  124. /// elements in the operand.
  125. static __inline__ __m128i __DEFAULT_FN_ATTRS
  126. _mm_abs_epi32(__m128i __a)
  127. {
  128. return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
  129. }
  /// \brief Joins the two 128-bit integer vector operands into one value and
  ///    shifts that value right by the byte count given in the immediate
  ///    operand.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
  /// \endcode
  ///
  /// This intrinsic corresponds to the \c PALIGNR instruction.
  ///
  /// \param a
  ///    A 128-bit vector of [16 x i8] containing one of the source operands.
  /// \param b
  ///    A 128-bit vector of [16 x i8] containing one of the source operands.
  /// \param n
  ///    An immediate operand giving the number of bytes to right-shift the
  ///    concatenated value by.
  /// \returns A 128-bit integer vector containing the concatenated,
  ///    right-shifted value.
  #define _mm_alignr_epi8(a, b, n) \
    __extension__ ({ \
      (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                         (__v16qi)(__m128i)(b), \
                                         (n)); \
    })
  /// \brief Joins the two 64-bit integer vector operands into one value and
  ///    shifts that value right by the byte count given in the immediate
  ///    operand.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
  /// \endcode
  ///
  /// This intrinsic corresponds to the \c PALIGNR instruction.
  ///
  /// \param a
  ///    A 64-bit vector of [8 x i8] containing one of the source operands.
  /// \param b
  ///    A 64-bit vector of [8 x i8] containing one of the source operands.
  /// \param n
  ///    An immediate operand giving the number of bytes to right-shift the
  ///    concatenated value by.
  /// \returns A 64-bit integer vector containing the concatenated,
  ///    right-shifted value.
  #define _mm_alignr_pi8(a, b, n) \
    __extension__ ({ \
      (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), \
                                    (__v8qi)(__m64)(b), \
                                    (n)); \
    })
  174. /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
  175. /// 128-bit vectors of [8 x i16].
  176. ///
  177. /// \headerfile <x86intrin.h>
  178. ///
  179. /// This intrinsic corresponds to the \c VPHADDW instruction.
  180. ///
  181. /// \param __a
  182. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  183. /// horizontal sums of the values are stored in the lower bits of the
  184. /// destination.
  185. /// \param __b
  186. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  187. /// horizontal sums of the values are stored in the upper bits of the
  188. /// destination.
  189. /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
  190. /// both operands.
  191. static __inline__ __m128i __DEFAULT_FN_ATTRS
  192. _mm_hadd_epi16(__m128i __a, __m128i __b)
  193. {
  194. return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
  195. }
  196. /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
  197. /// 128-bit vectors of [4 x i32].
  198. ///
  199. /// \headerfile <x86intrin.h>
  200. ///
  201. /// This intrinsic corresponds to the \c VPHADDD instruction.
  202. ///
  203. /// \param __a
  204. /// A 128-bit vector of [4 x i32] containing one of the source operands. The
  205. /// horizontal sums of the values are stored in the lower bits of the
  206. /// destination.
  207. /// \param __b
  208. /// A 128-bit vector of [4 x i32] containing one of the source operands. The
  209. /// horizontal sums of the values are stored in the upper bits of the
  210. /// destination.
  211. /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
  212. /// both operands.
  213. static __inline__ __m128i __DEFAULT_FN_ATTRS
  214. _mm_hadd_epi32(__m128i __a, __m128i __b)
  215. {
  216. return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
  217. }
  218. /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
  219. /// 64-bit vectors of [4 x i16].
  220. ///
  221. /// \headerfile <x86intrin.h>
  222. ///
  223. /// This intrinsic corresponds to the \c PHADDW instruction.
  224. ///
  225. /// \param __a
  226. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  227. /// horizontal sums of the values are stored in the lower bits of the
  228. /// destination.
  229. /// \param __b
  230. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  231. /// horizontal sums of the values are stored in the upper bits of the
  232. /// destination.
  233. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
  234. /// operands.
  235. static __inline__ __m64 __DEFAULT_FN_ATTRS
  236. _mm_hadd_pi16(__m64 __a, __m64 __b)
  237. {
  238. return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
  239. }
  240. /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
  241. /// 64-bit vectors of [2 x i32].
  242. ///
  243. /// \headerfile <x86intrin.h>
  244. ///
  245. /// This intrinsic corresponds to the \c PHADDD instruction.
  246. ///
  247. /// \param __a
  248. /// A 64-bit vector of [2 x i32] containing one of the source operands. The
  249. /// horizontal sums of the values are stored in the lower bits of the
  250. /// destination.
  251. /// \param __b
  252. /// A 64-bit vector of [2 x i32] containing one of the source operands. The
  253. /// horizontal sums of the values are stored in the upper bits of the
  254. /// destination.
  255. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
  256. /// operands.
  257. static __inline__ __m64 __DEFAULT_FN_ATTRS
  258. _mm_hadd_pi32(__m64 __a, __m64 __b)
  259. {
  260. return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
  261. }
  262. /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
  263. /// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
  264. /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
  265. ///
  266. /// \headerfile <x86intrin.h>
  267. ///
  268. /// This intrinsic corresponds to the \c VPHADDSW instruction.
  269. ///
  270. /// \param __a
  271. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  272. /// horizontal sums of the values are stored in the lower bits of the
  273. /// destination.
  274. /// \param __b
  275. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  276. /// horizontal sums of the values are stored in the upper bits of the
  277. /// destination.
  278. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
  279. /// sums of both operands.
  280. static __inline__ __m128i __DEFAULT_FN_ATTRS
  281. _mm_hadds_epi16(__m128i __a, __m128i __b)
  282. {
  283. return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
  284. }
  285. /// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
  286. /// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
  287. /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
  288. ///
  289. /// \headerfile <x86intrin.h>
  290. ///
  291. /// This intrinsic corresponds to the \c PHADDSW instruction.
  292. ///
  293. /// \param __a
  294. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  295. /// horizontal sums of the values are stored in the lower bits of the
  296. /// destination.
  297. /// \param __b
  298. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  299. /// horizontal sums of the values are stored in the upper bits of the
  300. /// destination.
  301. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
  302. /// sums of both operands.
  303. static __inline__ __m64 __DEFAULT_FN_ATTRS
  304. _mm_hadds_pi16(__m64 __a, __m64 __b)
  305. {
  306. return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
  307. }
  308. /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
  309. /// packed 128-bit vectors of [8 x i16].
  310. ///
  311. /// \headerfile <x86intrin.h>
  312. ///
  313. /// This intrinsic corresponds to the \c VPHSUBW instruction.
  314. ///
  315. /// \param __a
  316. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  317. /// horizontal differences between the values are stored in the lower bits of
  318. /// the destination.
  319. /// \param __b
  320. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  321. /// horizontal differences between the values are stored in the upper bits of
  322. /// the destination.
  323. /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
  324. /// of both operands.
  325. static __inline__ __m128i __DEFAULT_FN_ATTRS
  326. _mm_hsub_epi16(__m128i __a, __m128i __b)
  327. {
  328. return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
  329. }
  330. /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
  331. /// packed 128-bit vectors of [4 x i32].
  332. ///
  333. /// \headerfile <x86intrin.h>
  334. ///
  335. /// This intrinsic corresponds to the \c VPHSUBD instruction.
  336. ///
  337. /// \param __a
  338. /// A 128-bit vector of [4 x i32] containing one of the source operands. The
  339. /// horizontal differences between the values are stored in the lower bits of
  340. /// the destination.
  341. /// \param __b
  342. /// A 128-bit vector of [4 x i32] containing one of the source operands. The
  343. /// horizontal differences between the values are stored in the upper bits of
  344. /// the destination.
  345. /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
  346. /// of both operands.
  347. static __inline__ __m128i __DEFAULT_FN_ATTRS
  348. _mm_hsub_epi32(__m128i __a, __m128i __b)
  349. {
  350. return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
  351. }
  352. /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
  353. /// packed 64-bit vectors of [4 x i16].
  354. ///
  355. /// \headerfile <x86intrin.h>
  356. ///
  357. /// This intrinsic corresponds to the \c PHSUBW instruction.
  358. ///
  359. /// \param __a
  360. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  361. /// horizontal differences between the values are stored in the lower bits of
  362. /// the destination.
  363. /// \param __b
  364. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  365. /// horizontal differences between the values are stored in the upper bits of
  366. /// the destination.
  367. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
  368. /// of both operands.
  369. static __inline__ __m64 __DEFAULT_FN_ATTRS
  370. _mm_hsub_pi16(__m64 __a, __m64 __b)
  371. {
  372. return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
  373. }
  374. /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
  375. /// packed 64-bit vectors of [2 x i32].
  376. ///
  377. /// \headerfile <x86intrin.h>
  378. ///
  379. /// This intrinsic corresponds to the \c PHSUBD instruction.
  380. ///
  381. /// \param __a
  382. /// A 64-bit vector of [2 x i32] containing one of the source operands. The
  383. /// horizontal differences between the values are stored in the lower bits of
  384. /// the destination.
  385. /// \param __b
  386. /// A 64-bit vector of [2 x i32] containing one of the source operands. The
  387. /// horizontal differences between the values are stored in the upper bits of
  388. /// the destination.
  389. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
  390. /// of both operands.
  391. static __inline__ __m64 __DEFAULT_FN_ATTRS
  392. _mm_hsub_pi32(__m64 __a, __m64 __b)
  393. {
  394. return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
  395. }
  396. /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
  397. /// packed 128-bit vectors of [8 x i16]. Positive differences greater than
  398. /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
  399. /// saturated to 8000h.
  400. ///
  401. /// \headerfile <x86intrin.h>
  402. ///
  403. /// This intrinsic corresponds to the \c VPHSUBSW instruction.
  404. ///
  405. /// \param __a
  406. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  407. /// horizontal differences between the values are stored in the lower bits of
  408. /// the destination.
  409. /// \param __b
  410. /// A 128-bit vector of [8 x i16] containing one of the source operands. The
  411. /// horizontal differences between the values are stored in the upper bits of
  412. /// the destination.
  413. /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
  414. /// differences of both operands.
  415. static __inline__ __m128i __DEFAULT_FN_ATTRS
  416. _mm_hsubs_epi16(__m128i __a, __m128i __b)
  417. {
  418. return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
  419. }
  420. /// \brief Horizontally subtracts the adjacent pairs of values contained in 2
  421. /// packed 64-bit vectors of [4 x i16]. Positive differences greater than
  422. /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
  423. /// saturated to 8000h.
  424. ///
  425. /// \headerfile <x86intrin.h>
  426. ///
  427. /// This intrinsic corresponds to the \c PHSUBSW instruction.
  428. ///
  429. /// \param __a
  430. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  431. /// horizontal differences between the values are stored in the lower bits of
  432. /// the destination.
  433. /// \param __b
  434. /// A 64-bit vector of [4 x i16] containing one of the source operands. The
  435. /// horizontal differences between the values are stored in the upper bits of
  436. /// the destination.
  437. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
  438. /// differences of both operands.
  439. static __inline__ __m64 __DEFAULT_FN_ATTRS
  440. _mm_hsubs_pi16(__m64 __a, __m64 __b)
  441. {
  442. return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
  443. }
  444. /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
  445. /// values contained in the first source operand and packed 8-bit signed
  446. /// integer values contained in the second source operand, adds pairs of
  447. /// contiguous products with signed saturation, and writes the 16-bit sums to
  448. /// the corresponding bits in the destination.
  449. ///
  450. /// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
  451. /// both operands are multiplied, and the sum of both results is written to
  452. /// bits [15:0] of the destination.
  453. ///
  454. /// \headerfile <x86intrin.h>
  455. ///
  456. /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
  457. ///
  458. /// \param __a
  459. /// A 128-bit integer vector containing the first source operand.
  460. /// \param __b
  461. /// A 128-bit integer vector containing the second source operand.
  462. /// \returns A 128-bit integer vector containing the sums of products of both
  463. /// operands: \n
  464. /// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
  465. /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
  466. /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
  467. /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
  468. /// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
  469. /// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
  470. /// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
  471. /// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
  472. static __inline__ __m128i __DEFAULT_FN_ATTRS
  473. _mm_maddubs_epi16(__m128i __a, __m128i __b)
  474. {
  475. return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
  476. }
  477. /// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
  478. /// values contained in the first source operand and packed 8-bit signed
  479. /// integer values contained in the second source operand, adds pairs of
  480. /// contiguous products with signed saturation, and writes the 16-bit sums to
  481. /// the corresponding bits in the destination.
  482. ///
  483. /// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
  484. /// both operands are multiplied, and the sum of both results is written to
  485. /// bits [15:0] of the destination.
  486. ///
  487. /// \headerfile <x86intrin.h>
  488. ///
  489. /// This intrinsic corresponds to the \c PMADDUBSW instruction.
  490. ///
  491. /// \param __a
  492. /// A 64-bit integer vector containing the first source operand.
  493. /// \param __b
  494. /// A 64-bit integer vector containing the second source operand.
  495. /// \returns A 64-bit integer vector containing the sums of products of both
  496. /// operands: \n
  497. /// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
  498. /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
  499. /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
  500. /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
  501. static __inline__ __m64 __DEFAULT_FN_ATTRS
  502. _mm_maddubs_pi16(__m64 __a, __m64 __b)
  503. {
  504. return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
  505. }
  506. /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
  507. /// products to the 18 most significant bits by right-shifting, rounds the
  508. /// truncated value by adding 1, and writes bits [16:1] to the destination.
  509. ///
  510. /// \headerfile <x86intrin.h>
  511. ///
  512. /// This intrinsic corresponds to the \c VPMULHRSW instruction.
  513. ///
  514. /// \param __a
  515. /// A 128-bit vector of [8 x i16] containing one of the source operands.
  516. /// \param __b
  517. /// A 128-bit vector of [8 x i16] containing one of the source operands.
  518. /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
  519. /// products of both operands.
  520. static __inline__ __m128i __DEFAULT_FN_ATTRS
  521. _mm_mulhrs_epi16(__m128i __a, __m128i __b)
  522. {
  523. return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
  524. }
  525. /// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
  526. /// products to the 18 most significant bits by right-shifting, rounds the
  527. /// truncated value by adding 1, and writes bits [16:1] to the destination.
  528. ///
  529. /// \headerfile <x86intrin.h>
  530. ///
  531. /// This intrinsic corresponds to the \c PMULHRSW instruction.
  532. ///
  533. /// \param __a
  534. /// A 64-bit vector of [4 x i16] containing one of the source operands.
  535. /// \param __b
  536. /// A 64-bit vector of [4 x i16] containing one of the source operands.
  537. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
  538. /// products of both operands.
  539. static __inline__ __m64 __DEFAULT_FN_ATTRS
  540. _mm_mulhrs_pi16(__m64 __a, __m64 __b)
  541. {
  542. return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
  543. }
  544. /// \brief Copies the 8-bit integers from a 128-bit integer vector to the
  545. /// destination or clears 8-bit values in the destination, as specified by
  546. /// the second source operand.
  547. ///
  548. /// \headerfile <x86intrin.h>
  549. ///
  550. /// This intrinsic corresponds to the \c VPSHUFB instruction.
  551. ///
  552. /// \param __a
  553. /// A 128-bit integer vector containing the values to be copied.
  554. /// \param __b
  555. /// A 128-bit integer vector containing control bytes corresponding to
  556. /// positions in the destination:
  557. /// Bit 7: \n
  558. /// 1: Clear the corresponding byte in the destination. \n
  559. /// 0: Copy the selected source byte to the corresponding byte in the
  560. /// destination. \n
  561. /// Bits [6:4] Reserved. \n
  562. /// Bits [3:0] select the source byte to be copied.
  563. /// \returns A 128-bit integer vector containing the copied or cleared values.
  564. static __inline__ __m128i __DEFAULT_FN_ATTRS
  565. _mm_shuffle_epi8(__m128i __a, __m128i __b)
  566. {
  567. return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
  568. }
  569. /// \brief Copies the 8-bit integers from a 64-bit integer vector to the
  570. /// destination or clears 8-bit values in the destination, as specified by
  571. /// the second source operand.
  572. ///
  573. /// \headerfile <x86intrin.h>
  574. ///
  575. /// This intrinsic corresponds to the \c PSHUFB instruction.
  576. ///
  577. /// \param __a
  578. /// A 64-bit integer vector containing the values to be copied.
  579. /// \param __b
  580. /// A 64-bit integer vector containing control bytes corresponding to
  581. /// positions in the destination:
  582. /// Bit 7: \n
  583. /// 1: Clear the corresponding byte in the destination. \n
  584. /// 0: Copy the selected source byte to the corresponding byte in the
  585. /// destination. \n
  586. /// Bits [3:0] select the source byte to be copied.
  587. /// \returns A 64-bit integer vector containing the copied or cleared values.
  588. static __inline__ __m64 __DEFAULT_FN_ATTRS
  589. _mm_shuffle_pi8(__m64 __a, __m64 __b)
  590. {
  591. return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
  592. }
  593. /// \brief For each 8-bit integer in the first source operand, perform one of
  594. /// the following actions as specified by the second source operand.
  595. ///
  596. /// If the byte in the second source is negative, calculate the two's
  597. /// complement of the corresponding byte in the first source, and write that
  598. /// value to the destination. If the byte in the second source is positive,
  599. /// copy the corresponding byte from the first source to the destination. If
  600. /// the byte in the second source is zero, clear the corresponding byte in
  601. /// the destination.
  602. ///
  603. /// \headerfile <x86intrin.h>
  604. ///
  605. /// This intrinsic corresponds to the \c VPSIGNB instruction.
  606. ///
  607. /// \param __a
  608. /// A 128-bit integer vector containing the values to be copied.
  609. /// \param __b
  610. /// A 128-bit integer vector containing control bytes corresponding to
  611. /// positions in the destination.
  612. /// \returns A 128-bit integer vector containing the resultant values.
  613. static __inline__ __m128i __DEFAULT_FN_ATTRS
  614. _mm_sign_epi8(__m128i __a, __m128i __b)
  615. {
  616. return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
  617. }
  618. /// \brief For each 16-bit integer in the first source operand, perform one of
  619. /// the following actions as specified by the second source operand.
  620. ///
  621. /// If the word in the second source is negative, calculate the two's
  622. /// complement of the corresponding word in the first source, and write that
  623. /// value to the destination. If the word in the second source is positive,
  624. /// copy the corresponding word from the first source to the destination. If
  625. /// the word in the second source is zero, clear the corresponding word in
  626. /// the destination.
  627. ///
  628. /// \headerfile <x86intrin.h>
  629. ///
  630. /// This intrinsic corresponds to the \c VPSIGNW instruction.
  631. ///
  632. /// \param __a
  633. /// A 128-bit integer vector containing the values to be copied.
  634. /// \param __b
  635. /// A 128-bit integer vector containing control words corresponding to
  636. /// positions in the destination.
  637. /// \returns A 128-bit integer vector containing the resultant values.
  638. static __inline__ __m128i __DEFAULT_FN_ATTRS
  639. _mm_sign_epi16(__m128i __a, __m128i __b)
  640. {
  641. return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
  642. }
  643. /// \brief For each 32-bit integer in the first source operand, perform one of
  644. /// the following actions as specified by the second source operand.
  645. ///
  646. /// If the doubleword in the second source is negative, calculate the two's
  647. /// complement of the corresponding word in the first source, and write that
  648. /// value to the destination. If the doubleword in the second source is
  649. /// positive, copy the corresponding word from the first source to the
  650. /// destination. If the doubleword in the second source is zero, clear the
  651. /// corresponding word in the destination.
  652. ///
  653. /// \headerfile <x86intrin.h>
  654. ///
  655. /// This intrinsic corresponds to the \c VPSIGND instruction.
  656. ///
  657. /// \param __a
  658. /// A 128-bit integer vector containing the values to be copied.
  659. /// \param __b
  660. /// A 128-bit integer vector containing control doublewords corresponding to
  661. /// positions in the destination.
  662. /// \returns A 128-bit integer vector containing the resultant values.
  663. static __inline__ __m128i __DEFAULT_FN_ATTRS
  664. _mm_sign_epi32(__m128i __a, __m128i __b)
  665. {
  666. return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
  667. }
  668. /// \brief For each 8-bit integer in the first source operand, perform one of
  669. /// the following actions as specified by the second source operand.
  670. ///
  671. /// If the byte in the second source is negative, calculate the two's
  672. /// complement of the corresponding byte in the first source, and write that
  673. /// value to the destination. If the byte in the second source is positive,
  674. /// copy the corresponding byte from the first source to the destination. If
  675. /// the byte in the second source is zero, clear the corresponding byte in
  676. /// the destination.
  677. ///
  678. /// \headerfile <x86intrin.h>
  679. ///
  680. /// This intrinsic corresponds to the \c PSIGNB instruction.
  681. ///
  682. /// \param __a
  683. /// A 64-bit integer vector containing the values to be copied.
  684. /// \param __b
  685. /// A 64-bit integer vector containing control bytes corresponding to
  686. /// positions in the destination.
  687. /// \returns A 64-bit integer vector containing the resultant values.
  688. static __inline__ __m64 __DEFAULT_FN_ATTRS
  689. _mm_sign_pi8(__m64 __a, __m64 __b)
  690. {
  691. return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
  692. }
  693. /// \brief For each 16-bit integer in the first source operand, perform one of
  694. /// the following actions as specified by the second source operand.
  695. ///
  696. /// If the word in the second source is negative, calculate the two's
  697. /// complement of the corresponding word in the first source, and write that
  698. /// value to the destination. If the word in the second source is positive,
  699. /// copy the corresponding word from the first source to the destination. If
  700. /// the word in the second source is zero, clear the corresponding word in
  701. /// the destination.
  702. ///
  703. /// \headerfile <x86intrin.h>
  704. ///
  705. /// This intrinsic corresponds to the \c PSIGNW instruction.
  706. ///
  707. /// \param __a
  708. /// A 64-bit integer vector containing the values to be copied.
  709. /// \param __b
  710. /// A 64-bit integer vector containing control words corresponding to
  711. /// positions in the destination.
  712. /// \returns A 64-bit integer vector containing the resultant values.
  713. static __inline__ __m64 __DEFAULT_FN_ATTRS
  714. _mm_sign_pi16(__m64 __a, __m64 __b)
  715. {
  716. return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
  717. }
  718. /// \brief For each 32-bit integer in the first source operand, perform one of
  719. /// the following actions as specified by the second source operand.
  720. ///
  721. /// If the doubleword in the second source is negative, calculate the two's
  722. /// complement of the corresponding doubleword in the first source, and
  723. /// write that value to the destination. If the doubleword in the second
  724. /// source is positive, copy the corresponding doubleword from the first
  725. /// source to the destination. If the doubleword in the second source is
  726. /// zero, clear the corresponding doubleword in the destination.
  727. ///
  728. /// \headerfile <x86intrin.h>
  729. ///
  730. /// This intrinsic corresponds to the \c PSIGND instruction.
  731. ///
  732. /// \param __a
  733. /// A 64-bit integer vector containing the values to be copied.
  734. /// \param __b
  735. /// A 64-bit integer vector containing two control doublewords corresponding
  736. /// to positions in the destination.
  737. /// \returns A 64-bit integer vector containing the resultant values.
  738. static __inline__ __m64 __DEFAULT_FN_ATTRS
  739. _mm_sign_pi32(__m64 __a, __m64 __b)
  740. {
  741. return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
  742. }
  743. #undef __DEFAULT_FN_ATTRS
  744. #endif /* __TMMINTRIN_H */