pmmintrin.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
  2. *
  3. * Permission is hereby granted, free of charge, to any person obtaining a copy
  4. * of this software and associated documentation files (the "Software"), to deal
  5. * in the Software without restriction, including without limitation the rights
  6. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. * copies of the Software, and to permit persons to whom the Software is
  8. * furnished to do so, subject to the following conditions:
  9. *
  10. * The above copyright notice and this permission notice shall be included in
  11. * all copies or substantial portions of the Software.
  12. *
  13. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. * THE SOFTWARE.
  20. *
  21. *===-----------------------------------------------------------------------===
  22. */
  23. #ifndef __PMMINTRIN_H
  24. #define __PMMINTRIN_H
  25. #include <emmintrin.h>
  /* Attribute set applied to every intrinsic defined in this header:
   * always-inline, no-debug, and the "sse3" target feature. The macro is
   * removed again with #undef at the end of the file. */
  #define __DEFAULT_FN_ATTRS                                                   \
    __attribute__((__always_inline__))                                         \
    __attribute__((__nodebug__))                                               \
    __attribute__((__target__("sse3")))
  29. /// \brief Loads data from an unaligned memory location to elements in a 128-bit
  30. /// vector.
  31. ///
  32. /// If the address of the data is not 16-byte aligned, the instruction may
  33. /// read two adjacent aligned blocks of memory to retrieve the requested
  34. /// data.
  35. ///
  36. /// \headerfile <x86intrin.h>
  37. ///
  38. /// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
  39. ///
  40. /// \param __p
  41. /// A pointer to a 128-bit integer vector containing integer values.
  42. /// \returns A 128-bit vector containing the moved values.
  43. static __inline__ __m128i __DEFAULT_FN_ATTRS
  44. _mm_lddqu_si128(__m128i const *__p)
  45. {
  46. return (__m128i)__builtin_ia32_lddqu((char const *)__p);
  47. }
  48. /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
  49. /// two 128-bit vectors of [4 x float].
  50. ///
  51. /// \headerfile <x86intrin.h>
  52. ///
  53. /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
  54. ///
  55. /// \param __a
  56. /// A 128-bit vector of [4 x float] containing the left source operand.
  57. /// \param __b
  58. /// A 128-bit vector of [4 x float] containing the right source operand.
  59. /// \returns A 128-bit vector of [4 x float] containing the alternating sums and
  60. /// differences of both operands.
  61. static __inline__ __m128 __DEFAULT_FN_ATTRS
  62. _mm_addsub_ps(__m128 __a, __m128 __b)
  63. {
  64. return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
  65. }
  66. /// \brief Horizontally adds the adjacent pairs of values contained in two
  67. /// 128-bit vectors of [4 x float].
  68. ///
  69. /// \headerfile <x86intrin.h>
  70. ///
  71. /// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
  72. ///
  73. /// \param __a
  74. /// A 128-bit vector of [4 x float] containing one of the source operands.
  75. /// The horizontal sums of the values are stored in the lower bits of the
  76. /// destination.
  77. /// \param __b
  78. /// A 128-bit vector of [4 x float] containing one of the source operands.
  79. /// The horizontal sums of the values are stored in the upper bits of the
  80. /// destination.
  81. /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
  82. /// both operands.
  83. static __inline__ __m128 __DEFAULT_FN_ATTRS
  84. _mm_hadd_ps(__m128 __a, __m128 __b)
  85. {
  86. return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
  87. }
  88. /// \brief Horizontally subtracts the adjacent pairs of values contained in two
  89. /// 128-bit vectors of [4 x float].
  90. ///
  91. /// \headerfile <x86intrin.h>
  92. ///
  93. /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
  94. ///
  95. /// \param __a
  96. /// A 128-bit vector of [4 x float] containing one of the source operands.
  97. /// The horizontal differences between the values are stored in the lower
  98. /// bits of the destination.
  99. /// \param __b
  100. /// A 128-bit vector of [4 x float] containing one of the source operands.
  101. /// The horizontal differences between the values are stored in the upper
  102. /// bits of the destination.
  103. /// \returns A 128-bit vector of [4 x float] containing the horizontal
  104. /// differences of both operands.
  105. static __inline__ __m128 __DEFAULT_FN_ATTRS
  106. _mm_hsub_ps(__m128 __a, __m128 __b)
  107. {
  108. return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
  109. }
  110. /// \brief Moves and duplicates odd-indexed values from a 128-bit vector
  111. /// of [4 x float] to float values stored in a 128-bit vector of
  112. /// [4 x float].
  113. ///
  114. /// \headerfile <x86intrin.h>
  115. ///
  116. /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
  117. ///
  118. /// \param __a
  119. /// A 128-bit vector of [4 x float]. \n
  120. /// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
  121. /// the destination. \n
  122. /// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
  123. /// destination.
  124. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
  125. /// values.
  126. static __inline__ __m128 __DEFAULT_FN_ATTRS
  127. _mm_movehdup_ps(__m128 __a)
  128. {
  129. return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
  130. }
  131. /// \brief Duplicates even-indexed values from a 128-bit vector of
  132. /// [4 x float] to float values stored in a 128-bit vector of [4 x float].
  133. ///
  134. /// \headerfile <x86intrin.h>
  135. ///
  136. /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
  137. ///
  138. /// \param __a
  139. /// A 128-bit vector of [4 x float] \n
  140. /// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
  141. /// the destination. \n
  142. /// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
  143. /// destination.
  144. /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
  145. /// values.
  146. static __inline__ __m128 __DEFAULT_FN_ATTRS
  147. _mm_moveldup_ps(__m128 __a)
  148. {
  149. return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
  150. }
  151. /// \brief Adds the even-indexed values and subtracts the odd-indexed values of
  152. /// two 128-bit vectors of [2 x double].
  153. ///
  154. /// \headerfile <x86intrin.h>
  155. ///
  156. /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
  157. ///
  158. /// \param __a
  159. /// A 128-bit vector of [2 x double] containing the left source operand.
  160. /// \param __b
  161. /// A 128-bit vector of [2 x double] containing the right source operand.
  162. /// \returns A 128-bit vector of [2 x double] containing the alternating sums
  163. /// and differences of both operands.
  164. static __inline__ __m128d __DEFAULT_FN_ATTRS
  165. _mm_addsub_pd(__m128d __a, __m128d __b)
  166. {
  167. return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
  168. }
  169. /// \brief Horizontally adds the pairs of values contained in two 128-bit
  170. /// vectors of [2 x double].
  171. ///
  172. /// \headerfile <x86intrin.h>
  173. ///
  174. /// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
  175. ///
  176. /// \param __a
  177. /// A 128-bit vector of [2 x double] containing one of the source operands.
  178. /// The horizontal sum of the values is stored in the lower bits of the
  179. /// destination.
  180. /// \param __b
  181. /// A 128-bit vector of [2 x double] containing one of the source operands.
  182. /// The horizontal sum of the values is stored in the upper bits of the
  183. /// destination.
  184. /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
  185. /// both operands.
  186. static __inline__ __m128d __DEFAULT_FN_ATTRS
  187. _mm_hadd_pd(__m128d __a, __m128d __b)
  188. {
  189. return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
  190. }
  191. /// \brief Horizontally subtracts the pairs of values contained in two 128-bit
  192. /// vectors of [2 x double].
  193. ///
  194. /// \headerfile <x86intrin.h>
  195. ///
  196. /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
  197. ///
  198. /// \param __a
  199. /// A 128-bit vector of [2 x double] containing one of the source operands.
  200. /// The horizontal difference of the values is stored in the lower bits of
  201. /// the destination.
  202. /// \param __b
  203. /// A 128-bit vector of [2 x double] containing one of the source operands.
  204. /// The horizontal difference of the values is stored in the upper bits of
  205. /// the destination.
  206. /// \returns A 128-bit vector of [2 x double] containing the horizontal
  207. /// differences of both operands.
  208. static __inline__ __m128d __DEFAULT_FN_ATTRS
  209. _mm_hsub_pd(__m128d __a, __m128d __b)
  210. {
  211. return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
  212. }
  /// \brief Takes a pointer to one double-precision value and produces a
  ///    128-bit vector of [2 x double] with that value duplicated.
  ///
  ///    Defined as an alias for _mm_load1_pd().
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128d _mm_loaddup_pd(double const *__dp);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
  ///
  /// \param __dp
  ///    A pointer to the double-precision value to be moved and duplicated.
  /// \returns A 128-bit vector of [2 x double] containing the moved and
  ///    duplicated values.
  #define _mm_loaddup_pd(__dp) _mm_load1_pd(__dp)
  229. /// \brief Moves and duplicates the double-precision value in the lower bits of
  230. /// a 128-bit vector of [2 x double] to double-precision values stored in a
  231. /// 128-bit vector of [2 x double].
  232. ///
  233. /// \headerfile <x86intrin.h>
  234. ///
  235. /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
  236. ///
  237. /// \param __a
  238. /// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
  239. /// [127:64] and [63:0] of the destination.
  240. /// \returns A 128-bit vector of [2 x double] containing the moved and
  241. /// duplicated values.
  242. static __inline__ __m128d __DEFAULT_FN_ATTRS
  243. _mm_movedup_pd(__m128d __a)
  244. {
  245. return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
  246. }
  247. /// \brief Establishes a linear address memory range to be monitored and puts
  248. /// the processor in the monitor event pending state. Data stored in the
  249. /// monitored address range causes the processor to exit the pending state.
  250. ///
  251. /// \headerfile <x86intrin.h>
  252. ///
  253. /// This intrinsic corresponds to the <c> MONITOR </c> instruction.
  254. ///
  255. /// \param __p
  256. /// The memory range to be monitored. The size of the range is determined by
  257. /// CPUID function 0000_0005h.
  258. /// \param __extensions
  259. /// Optional extensions for the monitoring state.
  260. /// \param __hints
  261. /// Optional hints for the monitoring state.
  262. static __inline__ void __DEFAULT_FN_ATTRS
  263. _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
  264. {
  265. __builtin_ia32_monitor((void *)__p, __extensions, __hints);
  266. }
  267. /// \brief Used with the MONITOR instruction to wait while the processor is in
  268. /// the monitor event pending state. Data stored in the monitored address
  269. /// range causes the processor to exit the pending state.
  270. ///
  271. /// \headerfile <x86intrin.h>
  272. ///
  273. /// This intrinsic corresponds to the <c> MWAIT </c> instruction.
  274. ///
  275. /// \param __extensions
  276. /// Optional extensions for the monitoring state, which may vary by
  277. /// processor.
  278. /// \param __hints
  279. /// Optional hints for the monitoring state, which may vary by processor.
  280. static __inline__ void __DEFAULT_FN_ATTRS
  281. _mm_mwait(unsigned __extensions, unsigned __hints)
  282. {
  283. __builtin_ia32_mwait(__extensions, __hints);
  284. }
  285. #undef __DEFAULT_FN_ATTRS
  286. #endif /* __PMMINTRIN_H */