smmintrin.h 102 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465
  1. /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
  2. *
  3. * Permission is hereby granted, free of charge, to any person obtaining a copy
  4. * of this software and associated documentation files (the "Software"), to deal
  5. * in the Software without restriction, including without limitation the rights
  6. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. * copies of the Software, and to permit persons to whom the Software is
  8. * furnished to do so, subject to the following conditions:
  9. *
  10. * The above copyright notice and this permission notice shall be included in
  11. * all copies or substantial portions of the Software.
  12. *
  13. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. * THE SOFTWARE.
  20. *
  21. *===-----------------------------------------------------------------------===
  22. */
  23. #ifndef _SMMINTRIN_H
  24. #define _SMMINTRIN_H
  25. #include <tmmintrin.h>
  26. /* Define the default attributes for the functions in this file. */
  27. #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
  28. /* SSE4 Rounding macros. */
  29. #define _MM_FROUND_TO_NEAREST_INT 0x00
  30. #define _MM_FROUND_TO_NEG_INF 0x01
  31. #define _MM_FROUND_TO_POS_INF 0x02
  32. #define _MM_FROUND_TO_ZERO 0x03
  33. #define _MM_FROUND_CUR_DIRECTION 0x04
  34. #define _MM_FROUND_RAISE_EXC 0x00
  35. #define _MM_FROUND_NO_EXC 0x08
  36. #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
  37. #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
  38. #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
  39. #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
  40. #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
  41. #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
  42. /// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
  43. /// integer and returns the rounded values in a 128-bit vector of
  44. /// [4 x float].
  45. ///
  46. /// \headerfile <x86intrin.h>
  47. ///
  48. /// \code
  49. /// __m128 _mm_ceil_ps(__m128 X);
  50. /// \endcode
  51. ///
  52. /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  53. ///
  54. /// \param X
  55. /// A 128-bit vector of [4 x float] values to be rounded up.
  56. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
  57. #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
  58. /// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
  59. /// integer and returns the rounded values in a 128-bit vector of
  60. /// [2 x double].
  61. ///
  62. /// \headerfile <x86intrin.h>
  63. ///
  64. /// \code
  65. /// __m128d _mm_ceil_pd(__m128d X);
  66. /// \endcode
  67. ///
  68. /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  69. ///
  70. /// \param X
  71. /// A 128-bit vector of [2 x double] values to be rounded up.
  72. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
  73. #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
  74. /// \brief Copies three upper elements of the first 128-bit vector operand to
  75. /// the corresponding three upper elements of the 128-bit result vector of
  76. /// [4 x float]. Rounds up the lowest element of the second 128-bit vector
  77. /// operand to an integer and copies it to the lowest element of the 128-bit
  78. /// result vector of [4 x float].
  79. ///
  80. /// \headerfile <x86intrin.h>
  81. ///
  82. /// \code
  83. /// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
  84. /// \endcode
  85. ///
  86. /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  87. ///
  88. /// \param X
  89. /// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  90. /// copied to the corresponding bits of the result.
  91. /// \param Y
  92. /// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
  93. /// rounded up to the nearest integer and copied to the corresponding bits
  94. /// of the result.
  95. /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
  96. /// values.
  97. #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
  98. /// \brief Copies the upper element of the first 128-bit vector operand to the
  99. /// corresponding upper element of the 128-bit result vector of [2 x double].
  100. /// Rounds up the lower element of the second 128-bit vector operand to an
  101. /// integer and copies it to the lower element of the 128-bit result vector
  102. /// of [2 x double].
  103. ///
  104. /// \headerfile <x86intrin.h>
  105. ///
  106. /// \code
  107. /// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
  108. /// \endcode
  109. ///
  110. /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
  111. ///
  112. /// \param X
  113. /// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
  114. /// copied to the corresponding bits of the result.
  115. /// \param Y
  116. /// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
  117. /// rounded up to the nearest integer and copied to the corresponding bits
  118. /// of the result.
  119. /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
  120. /// values.
  121. #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
  122. /// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
  123. /// an integer and returns the rounded values in a 128-bit vector of
  124. /// [4 x float].
  125. ///
  126. /// \headerfile <x86intrin.h>
  127. ///
  128. /// \code
  129. /// __m128 _mm_floor_ps(__m128 X);
  130. /// \endcode
  131. ///
  132. /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  133. ///
  134. /// \param X
  135. /// A 128-bit vector of [4 x float] values to be rounded down.
  136. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
  137. #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
  138. /// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
  139. /// integer and returns the rounded values in a 128-bit vector of
  140. /// [2 x double].
  141. ///
  142. /// \headerfile <x86intrin.h>
  143. ///
  144. /// \code
  145. /// __m128d _mm_floor_pd(__m128d X);
  146. /// \endcode
  147. ///
  148. /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  149. ///
  150. /// \param X
  151. /// A 128-bit vector of [2 x double].
  152. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
  153. #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
  154. /// \brief Copies three upper elements of the first 128-bit vector operand to
  155. /// the corresponding three upper elements of the 128-bit result vector of
  156. /// [4 x float]. Rounds down the lowest element of the second 128-bit vector
  157. /// operand to an integer and copies it to the lowest element of the 128-bit
  158. /// result vector of [4 x float].
  159. ///
  160. /// \headerfile <x86intrin.h>
  161. ///
  162. /// \code
  163. /// __m128 _mm_floor_ss(__m128 X, __m128 Y);
  164. /// \endcode
  165. ///
  166. /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  167. ///
  168. /// \param X
  169. /// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  170. /// copied to the corresponding bits of the result.
  171. /// \param Y
  172. /// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
  173. /// rounded down to the nearest integer and copied to the corresponding bits
  174. /// of the result.
  175. /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
  176. /// values.
  177. #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
  178. /// \brief Copies the upper element of the first 128-bit vector operand to the
  179. /// corresponding upper element of the 128-bit result vector of [2 x double].
  180. /// Rounds down the lower element of the second 128-bit vector operand to an
  181. /// integer and copies it to the lower element of the 128-bit result vector
  182. /// of [2 x double].
  183. ///
  184. /// \headerfile <x86intrin.h>
  185. ///
  186. /// \code
  187. /// __m128d _mm_floor_sd(__m128d X, __m128d Y);
  188. /// \endcode
  189. ///
  190. /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
  191. ///
  192. /// \param X
  193. /// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
  194. /// copied to the corresponding bits of the result.
  195. /// \param Y
  196. /// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
  197. /// rounded down to the nearest integer and copied to the corresponding bits
  198. /// of the result.
  199. /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
  200. /// values.
  201. #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
  202. /// \brief Rounds each element of the 128-bit vector of [4 x float] to an
  203. /// integer value according to the rounding control specified by the second
  204. /// argument and returns the rounded values in a 128-bit vector of
  205. /// [4 x float].
  206. ///
  207. /// \headerfile <x86intrin.h>
  208. ///
  209. /// \code
  210. /// __m128 _mm_round_ps(__m128 X, const int M);
  211. /// \endcode
  212. ///
  213. /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  214. ///
  215. /// \param X
  216. /// A 128-bit vector of [4 x float].
  217. /// \param M
  218. /// An integer value that specifies the rounding operation. \n
  219. /// Bits [7:4] are reserved. \n
  220. /// Bit [3] is a precision exception value: \n
  221. /// 0: A normal PE exception is used \n
  222. /// 1: The PE field is not updated \n
  223. /// Bit [2] is the rounding control source: \n
  224. /// 0: Use bits [1:0] of \a M \n
  225. /// 1: Use the current MXCSR setting \n
  226. /// Bits [1:0] contain the rounding control definition: \n
  227. /// 00: Nearest \n
  228. /// 01: Downward (toward negative infinity) \n
  229. /// 10: Upward (toward positive infinity) \n
  230. /// 11: Truncated
  231. /// \returns A 128-bit vector of [4 x float] containing the rounded values.
  232. #define _mm_round_ps(X, M) __extension__ ({ \
  233. (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
  234. /// \brief Copies three upper elements of the first 128-bit vector operand to
  235. /// the corresponding three upper elements of the 128-bit result vector of
  236. /// [4 x float]. Rounds the lowest element of the second 128-bit vector
  237. /// operand to an integer value according to the rounding control specified
  238. /// by the third argument and copies it to the lowest element of the 128-bit
  239. /// result vector of [4 x float].
  240. ///
  241. /// \headerfile <x86intrin.h>
  242. ///
  243. /// \code
  244. /// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
  245. /// \endcode
  246. ///
  247. /// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  248. ///
  249. /// \param X
  250. /// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  251. /// copied to the corresponding bits of the result.
  252. /// \param Y
  253. /// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
  254. /// rounded to the nearest integer using the specified rounding control and
  255. /// copied to the corresponding bits of the result.
  256. /// \param M
  257. /// An integer value that specifies the rounding operation. \n
  258. /// Bits [7:4] are reserved. \n
  259. /// Bit [3] is a precision exception value: \n
  260. /// 0: A normal PE exception is used \n
  261. /// 1: The PE field is not updated \n
  262. /// Bit [2] is the rounding control source: \n
  263. /// 0: Use bits [1:0] of \a M \n
  264. /// 1: Use the current MXCSR setting \n
  265. /// Bits [1:0] contain the rounding control definition: \n
  266. /// 00: Nearest \n
  267. /// 01: Downward (toward negative infinity) \n
  268. /// 10: Upward (toward positive infinity) \n
  269. /// 11: Truncated
  270. /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
  271. /// values.
  272. #define _mm_round_ss(X, Y, M) __extension__ ({ \
  273. (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
  274. (__v4sf)(__m128)(Y), (M)); })
  275. /// \brief Rounds each element of the 128-bit vector of [2 x double] to an
  276. /// integer value according to the rounding control specified by the second
  277. /// argument and returns the rounded values in a 128-bit vector of
  278. /// [2 x double].
  279. ///
  280. /// \headerfile <x86intrin.h>
  281. ///
  282. /// \code
  283. /// __m128d _mm_round_pd(__m128d X, const int M);
  284. /// \endcode
  285. ///
  286. /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  287. ///
  288. /// \param X
  289. /// A 128-bit vector of [2 x double].
  290. /// \param M
  291. /// An integer value that specifies the rounding operation. \n
  292. /// Bits [7:4] are reserved. \n
  293. /// Bit [3] is a precision exception value: \n
  294. /// 0: A normal PE exception is used \n
  295. /// 1: The PE field is not updated \n
  296. /// Bit [2] is the rounding control source: \n
  297. /// 0: Use bits [1:0] of \a M \n
  298. /// 1: Use the current MXCSR setting \n
  299. /// Bits [1:0] contain the rounding control definition: \n
  300. /// 00: Nearest \n
  301. /// 01: Downward (toward negative infinity) \n
  302. /// 10: Upward (toward positive infinity) \n
  303. /// 11: Truncated
  304. /// \returns A 128-bit vector of [2 x double] containing the rounded values.
  305. #define _mm_round_pd(X, M) __extension__ ({ \
  306. (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
  307. /// \brief Copies the upper element of the first 128-bit vector operand to the
  308. /// corresponding upper element of the 128-bit result vector of [2 x double].
  309. /// Rounds the lower element of the second 128-bit vector operand to an
  310. /// integer value according to the rounding control specified by the third
  311. /// argument and copies it to the lower element of the 128-bit result vector
  312. /// of [2 x double].
  313. ///
  314. /// \headerfile <x86intrin.h>
  315. ///
  316. /// \code
  317. /// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
  318. /// \endcode
  319. ///
  320. /// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
  321. ///
  322. /// \param X
  323. /// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
  324. /// copied to the corresponding bits of the result.
  325. /// \param Y
  326. /// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
  327. /// rounded to the nearest integer using the specified rounding control and
  328. /// copied to the corresponding bits of the result.
  329. /// \param M
  330. /// An integer value that specifies the rounding operation. \n
  331. /// Bits [7:4] are reserved. \n
  332. /// Bit [3] is a precision exception value: \n
  333. /// 0: A normal PE exception is used \n
  334. /// 1: The PE field is not updated \n
  335. /// Bit [2] is the rounding control source: \n
  336. /// 0: Use bits [1:0] of \a M \n
  337. /// 1: Use the current MXCSR setting \n
  338. /// Bits [1:0] contain the rounding control definition: \n
  339. /// 00: Nearest \n
  340. /// 01: Downward (toward negative infinity) \n
  341. /// 10: Upward (toward positive infinity) \n
  342. /// 11: Truncated
  343. /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
  344. /// values.
  345. #define _mm_round_sd(X, Y, M) __extension__ ({ \
  346. (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
  347. (__v2df)(__m128d)(Y), (M)); })
  348. /* SSE4 Packed Blending Intrinsics. */
  349. /// \brief Returns a 128-bit vector of [2 x double] where the values are
  350. /// selected from either the first or second operand as specified by the
  351. /// third operand, the control mask.
  352. ///
  353. /// \headerfile <x86intrin.h>
  354. ///
  355. /// \code
  356. /// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
  357. /// \endcode
  358. ///
  359. /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
  360. ///
  361. /// \param V1
  362. /// A 128-bit vector of [2 x double].
  363. /// \param V2
  364. /// A 128-bit vector of [2 x double].
  365. /// \param M
  366. /// An immediate integer operand, with mask bits [1:0] specifying how the
  367. /// values are to be copied. The position of the mask bit corresponds to the
  368. /// index of a copied value. When a mask bit is 0, the corresponding 64-bit
  369. /// element in operand \a V1 is copied to the same position in the result.
  370. /// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
  371. /// is copied to the same position in the result.
  372. /// \returns A 128-bit vector of [2 x double] containing the copied values.
  373. #define _mm_blend_pd(V1, V2, M) __extension__ ({ \
  374. (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
  375. (__v2df)(__m128d)(V2), \
  376. (((M) & 0x01) ? 2 : 0), \
  377. (((M) & 0x02) ? 3 : 1)); })
  378. /// \brief Returns a 128-bit vector of [4 x float] where the values are selected
  379. /// from either the first or second operand as specified by the third
  380. /// operand, the control mask.
  381. ///
  382. /// \headerfile <x86intrin.h>
  383. ///
  384. /// \code
  385. /// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
  386. /// \endcode
  387. ///
  388. /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
  389. ///
  390. /// \param V1
  391. /// A 128-bit vector of [4 x float].
  392. /// \param V2
  393. /// A 128-bit vector of [4 x float].
  394. /// \param M
  395. /// An immediate integer operand, with mask bits [3:0] specifying how the
  396. /// values are to be copied. The position of the mask bit corresponds to the
  397. /// index of a copied value. When a mask bit is 0, the corresponding 32-bit
  398. /// element in operand \a V1 is copied to the same position in the result.
  399. /// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
  400. /// is copied to the same position in the result.
  401. /// \returns A 128-bit vector of [4 x float] containing the copied values.
  402. #define _mm_blend_ps(V1, V2, M) __extension__ ({ \
  403. (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
  404. (((M) & 0x01) ? 4 : 0), \
  405. (((M) & 0x02) ? 5 : 1), \
  406. (((M) & 0x04) ? 6 : 2), \
  407. (((M) & 0x08) ? 7 : 3)); })
  408. /// \brief Returns a 128-bit vector of [2 x double] where the values are
  409. /// selected from either the first or second operand as specified by the
  410. /// third operand, the control mask.
  411. ///
  412. /// \headerfile <x86intrin.h>
  413. ///
  414. /// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
  415. ///
  416. /// \param __V1
  417. /// A 128-bit vector of [2 x double].
  418. /// \param __V2
  419. /// A 128-bit vector of [2 x double].
  420. /// \param __M
  421. /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
  422. /// values are to be copied. The position of the mask bit corresponds to the
  423. /// most significant bit of a copied value. When a mask bit is 0, the
  424. /// corresponding 64-bit element in operand \a __V1 is copied to the same
  425. /// position in the result. When a mask bit is 1, the corresponding 64-bit
  426. /// element in operand \a __V2 is copied to the same position in the result.
  427. /// \returns A 128-bit vector of [2 x double] containing the copied values.
  428. static __inline__ __m128d __DEFAULT_FN_ATTRS
  429. _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
  430. {
  431. return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
  432. (__v2df)__M);
  433. }
  434. /// \brief Returns a 128-bit vector of [4 x float] where the values are
  435. /// selected from either the first or second operand as specified by the
  436. /// third operand, the control mask.
  437. ///
  438. /// \headerfile <x86intrin.h>
  439. ///
  440. /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
  441. ///
  442. /// \param __V1
  443. /// A 128-bit vector of [4 x float].
  444. /// \param __V2
  445. /// A 128-bit vector of [4 x float].
  446. /// \param __M
  447. /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
  448. /// how the values are to be copied. The position of the mask bit corresponds
  449. /// to the most significant bit of a copied value. When a mask bit is 0, the
  450. /// corresponding 32-bit element in operand \a __V1 is copied to the same
  451. /// position in the result. When a mask bit is 1, the corresponding 32-bit
  452. /// element in operand \a __V2 is copied to the same position in the result.
  453. /// \returns A 128-bit vector of [4 x float] containing the copied values.
  454. static __inline__ __m128 __DEFAULT_FN_ATTRS
  455. _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
  456. {
  457. return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
  458. (__v4sf)__M);
  459. }
  460. /// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
  461. /// from either of the first or second operand as specified by the third
  462. /// operand, the control mask.
  463. ///
  464. /// \headerfile <x86intrin.h>
  465. ///
  466. /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
  467. ///
  468. /// \param __V1
  469. /// A 128-bit vector of [16 x i8].
  470. /// \param __V2
  471. /// A 128-bit vector of [16 x i8].
  472. /// \param __M
  473. /// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
  474. /// how the values are to be copied. The position of the mask bit corresponds
  475. /// to the most significant bit of a copied value. When a mask bit is 0, the
  476. /// corresponding 8-bit element in operand \a __V1 is copied to the same
  477. /// position in the result. When a mask bit is 1, the corresponding 8-bit
  478. /// element in operand \a __V2 is copied to the same position in the result.
  479. /// \returns A 128-bit vector of [16 x i8] containing the copied values.
  480. static __inline__ __m128i __DEFAULT_FN_ATTRS
  481. _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
  482. {
  483. return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
  484. (__v16qi)__M);
  485. }
  /// \brief Builds a 128-bit vector of [8 x i16] in which every element is
  ///    taken from either the first or the second operand, as dictated by the
  ///    third operand, the control mask.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
  ///
  /// \param V1
  ///    A 128-bit vector of [8 x i16].
  /// \param V2
  ///    A 128-bit vector of [8 x i16].
  /// \param M
  ///    An immediate integer operand; mask bits [7:0] control the copy, and the
  ///    position of a mask bit is the index of the element it governs. A mask
  ///    bit of 0 copies the corresponding 16-bit element of \a V1 to the same
  ///    position in the result; a mask bit of 1 copies the corresponding
  ///    16-bit element of \a V2 instead.
  /// \returns A 128-bit vector of [8 x i16] containing the copied values.
  #define _mm_blend_epi16(V1, V2, M) (__extension__ ({ \
    (__m128i)__builtin_shufflevector( \
        (__v8hi)(__m128i)(V1), (__v8hi)(__m128i)(V2), \
        (((M) & (1 << 0)) == 0 ? 0 :  8), \
        (((M) & (1 << 1)) == 0 ? 1 :  9), \
        (((M) & (1 << 2)) == 0 ? 2 : 10), \
        (((M) & (1 << 3)) == 0 ? 3 : 11), \
        (((M) & (1 << 4)) == 0 ? 4 : 12), \
        (((M) & (1 << 5)) == 0 ? 5 : 13), \
        (((M) & (1 << 6)) == 0 ? 6 : 14), \
        (((M) & (1 << 7)) == 0 ? 7 : 15)); }))
  521. /* SSE4 Dword Multiply Instructions. */
  522. /// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32]
  523. /// and returns the lower 32 bits of the each product in a 128-bit vector of
  524. /// [4 x i32].
  525. ///
  526. /// \headerfile <x86intrin.h>
  527. ///
  528. /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
  529. ///
  530. /// \param __V1
  531. /// A 128-bit integer vector.
  532. /// \param __V2
  533. /// A 128-bit integer vector.
  534. /// \returns A 128-bit integer vector containing the products of both operands.
  535. static __inline__ __m128i __DEFAULT_FN_ATTRS
  536. _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
  537. {
  538. return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
  539. }
  540. /// \brief Multiplies corresponding even-indexed elements of two 128-bit
  541. /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
  542. /// containing the products.
  543. ///
  544. /// \headerfile <x86intrin.h>
  545. ///
  546. /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
  547. ///
  548. /// \param __V1
  549. /// A 128-bit vector of [4 x i32].
  550. /// \param __V2
  551. /// A 128-bit vector of [4 x i32].
  552. /// \returns A 128-bit vector of [2 x i64] containing the products of both
  553. /// operands.
  554. static __inline__ __m128i __DEFAULT_FN_ATTRS
  555. _mm_mul_epi32 (__m128i __V1, __m128i __V2)
  556. {
  557. return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
  558. }
  /* SSE4 Floating Point Dot Product Instructions.  */
  /// \brief Computes the dot product of two 128-bit vectors of [4 x float] and
  ///    returns it in the elements of a 128-bit result vector of [4 x float].
  ///
  ///    The immediate integer operand selects which input elements take part
  ///    in the dot product and which result elements receive it.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
  ///
  /// \param X
  ///    A 128-bit vector of [4 x float].
  /// \param Y
  ///    A 128-bit vector of [4 x float].
  /// \param M
  ///    An immediate integer operand. Mask bits [7:4] choose the input
  ///    elements: bit [4] corresponds to the lowest element and bit [7] to the
  ///    highest element of each [4 x float] vector. When a bit is set, the
  ///    corresponding elements of the two input vectors feed the dot product;
  ///    otherwise that input is treated as zero. Bits [3:0] choose the result
  ///    elements that receive a copy of the final dot product: bit [0]
  ///    corresponds to the lowest element and bit [3] to the highest element.
  ///    When a bit is set, the dot product is returned in the corresponding
  ///    element; otherwise that element is set to zero.
  /// \returns A 128-bit vector of [4 x float] containing the dot product.
  #define _mm_dp_ps(X, Y, M) (__extension__ ({ \
    __v4sf __lhs = (__v4sf)(__m128)(X); \
    __v4sf __rhs = (__v4sf)(__m128)(Y); \
    (__m128)__builtin_ia32_dpps(__lhs, __rhs, (M)); }))
  /// \brief Computes the dot product of two 128-bit vectors of [2 x double]
  ///    and returns it in the elements of a 128-bit result vector of
  ///    [2 x double].
  ///
  ///    The immediate integer operand selects which input elements take part
  ///    in the dot product and which result elements receive it.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
  ///
  /// \param X
  ///    A 128-bit vector of [2 x double].
  /// \param Y
  ///    A 128-bit vector of [2 x double].
  /// \param M
  ///    An immediate integer operand. Mask bits [5:4] choose the input
  ///    elements: bit [4] corresponds to the lowest element and bit [5] to the
  ///    highest element of each [2 x double] vector. When a bit is set, the
  ///    corresponding elements of the two input vectors feed the dot product;
  ///    otherwise that input is treated as zero. Bits [1:0] choose the result
  ///    elements that receive a copy of the final dot product: bit [0]
  ///    corresponds to the lowest element and bit [1] to the highest element.
  ///    When a bit is set, the dot product is returned in the corresponding
  ///    element; otherwise that element is set to zero.
  /// \returns A 128-bit vector of [2 x double] containing the dot product.
  #define _mm_dp_pd(X, Y, M) (__extension__ ({ \
    __v2df __lhs = (__v2df)(__m128d)(X); \
    __v2df __rhs = (__v2df)(__m128d)(Y); \
    (__m128d)__builtin_ia32_dppd(__lhs, __rhs, (M)); }))
  629. /* SSE4 Streaming Load Hint Instruction. */
  630. /// \brief Loads integer values from a 128-bit aligned memory location to a
  631. /// 128-bit integer vector.
  632. ///
  633. /// \headerfile <x86intrin.h>
  634. ///
  635. /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
  636. ///
  637. /// \param __V
  638. /// A pointer to a 128-bit aligned memory location that contains the integer
  639. /// values.
  640. /// \returns A 128-bit integer vector containing the data stored at the
  641. /// specified memory location.
  642. static __inline__ __m128i __DEFAULT_FN_ATTRS
  643. _mm_stream_load_si128 (__m128i const *__V)
  644. {
  645. return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
  646. }
  647. /* SSE4 Packed Integer Min/Max Instructions. */
  648. /// \brief Compares the corresponding elements of two 128-bit vectors of
  649. /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
  650. /// of the two values.
  651. ///
  652. /// \headerfile <x86intrin.h>
  653. ///
  654. /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
  655. ///
  656. /// \param __V1
  657. /// A 128-bit vector of [16 x i8].
  658. /// \param __V2
  659. /// A 128-bit vector of [16 x i8]
  660. /// \returns A 128-bit vector of [16 x i8] containing the lesser values.
  661. static __inline__ __m128i __DEFAULT_FN_ATTRS
  662. _mm_min_epi8 (__m128i __V1, __m128i __V2)
  663. {
  664. return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
  665. }
  666. /// \brief Compares the corresponding elements of two 128-bit vectors of
  667. /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
  668. /// greater value of the two.
  669. ///
  670. /// \headerfile <x86intrin.h>
  671. ///
  672. /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
  673. ///
  674. /// \param __V1
  675. /// A 128-bit vector of [16 x i8].
  676. /// \param __V2
  677. /// A 128-bit vector of [16 x i8].
  678. /// \returns A 128-bit vector of [16 x i8] containing the greater values.
  679. static __inline__ __m128i __DEFAULT_FN_ATTRS
  680. _mm_max_epi8 (__m128i __V1, __m128i __V2)
  681. {
  682. return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
  683. }
  684. /// \brief Compares the corresponding elements of two 128-bit vectors of
  685. /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
  686. /// value of the two.
  687. ///
  688. /// \headerfile <x86intrin.h>
  689. ///
  690. /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
  691. ///
  692. /// \param __V1
  693. /// A 128-bit vector of [8 x u16].
  694. /// \param __V2
  695. /// A 128-bit vector of [8 x u16].
  696. /// \returns A 128-bit vector of [8 x u16] containing the lesser values.
  697. static __inline__ __m128i __DEFAULT_FN_ATTRS
  698. _mm_min_epu16 (__m128i __V1, __m128i __V2)
  699. {
  700. return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
  701. }
  702. /// \brief Compares the corresponding elements of two 128-bit vectors of
  703. /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
  704. /// greater value of the two.
  705. ///
  706. /// \headerfile <x86intrin.h>
  707. ///
  708. /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
  709. ///
  710. /// \param __V1
  711. /// A 128-bit vector of [8 x u16].
  712. /// \param __V2
  713. /// A 128-bit vector of [8 x u16].
  714. /// \returns A 128-bit vector of [8 x u16] containing the greater values.
  715. static __inline__ __m128i __DEFAULT_FN_ATTRS
  716. _mm_max_epu16 (__m128i __V1, __m128i __V2)
  717. {
  718. return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
  719. }
  720. /// \brief Compares the corresponding elements of two 128-bit vectors of
  721. /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
  722. /// value of the two.
  723. ///
  724. /// \headerfile <x86intrin.h>
  725. ///
  726. /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
  727. ///
  728. /// \param __V1
  729. /// A 128-bit vector of [4 x i32].
  730. /// \param __V2
  731. /// A 128-bit vector of [4 x i32].
  732. /// \returns A 128-bit vector of [4 x i32] containing the lesser values.
  733. static __inline__ __m128i __DEFAULT_FN_ATTRS
  734. _mm_min_epi32 (__m128i __V1, __m128i __V2)
  735. {
  736. return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
  737. }
  738. /// \brief Compares the corresponding elements of two 128-bit vectors of
  739. /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
  740. /// greater value of the two.
  741. ///
  742. /// \headerfile <x86intrin.h>
  743. ///
  744. /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
  745. ///
  746. /// \param __V1
  747. /// A 128-bit vector of [4 x i32].
  748. /// \param __V2
  749. /// A 128-bit vector of [4 x i32].
  750. /// \returns A 128-bit vector of [4 x i32] containing the greater values.
  751. static __inline__ __m128i __DEFAULT_FN_ATTRS
  752. _mm_max_epi32 (__m128i __V1, __m128i __V2)
  753. {
  754. return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
  755. }
  756. /// \brief Compares the corresponding elements of two 128-bit vectors of
  757. /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
  758. /// value of the two.
  759. ///
  760. /// \headerfile <x86intrin.h>
  761. ///
  762. /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
  763. ///
  764. /// \param __V1
  765. /// A 128-bit vector of [4 x u32].
  766. /// \param __V2
  767. /// A 128-bit vector of [4 x u32].
  768. /// \returns A 128-bit vector of [4 x u32] containing the lesser values.
  769. static __inline__ __m128i __DEFAULT_FN_ATTRS
  770. _mm_min_epu32 (__m128i __V1, __m128i __V2)
  771. {
  772. return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
  773. }
  774. /// \brief Compares the corresponding elements of two 128-bit vectors of
  775. /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
  776. /// greater value of the two.
  777. ///
  778. /// \headerfile <x86intrin.h>
  779. ///
  780. /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
  781. ///
  782. /// \param __V1
  783. /// A 128-bit vector of [4 x u32].
  784. /// \param __V2
  785. /// A 128-bit vector of [4 x u32].
  786. /// \returns A 128-bit vector of [4 x u32] containing the greater values.
  787. static __inline__ __m128i __DEFAULT_FN_ATTRS
  788. _mm_max_epu32 (__m128i __V1, __m128i __V2)
  789. {
  790. return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
  791. }
  /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
  /// \brief Starts from the first argument \a X, inserts one element of the
  ///    second argument \a Y chosen by the third argument \a N, and then
  ///    zeroes the result elements that \a N also designates. The resulting
  ///    128-bit vector of [4 x float] is returned.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
  ///
  /// \param X
  ///    A 128-bit vector source operand of [4 x float]. Apart from the bits
  ///    of the result that are copied from parameter \a Y or zeroed by bits
  ///    [3:0] of \a N, all bits of this parameter are copied to the result.
  /// \param Y
  ///    A 128-bit vector source operand of [4 x float]. One single-precision
  ///    floating-point element of this source, chosen by the immediate
  ///    parameter, is copied to the result.
  /// \param N
  ///    Specifies which bits from operand \a Y will be copied, which bits in
  ///    the result they will be copied to, and which bits in the result will
  ///    be cleared. The following assignments are made: \n
  ///    Bits [7:6] specify the bits to copy from operand \a Y: \n
  ///      00: Selects bits [31:0] from operand \a Y. \n
  ///      01: Selects bits [63:32] from operand \a Y. \n
  ///      10: Selects bits [95:64] from operand \a Y. \n
  ///      11: Selects bits [127:96] from operand \a Y. \n
  ///    Bits [5:4] specify the bits in the result to which the selected bits
  ///    from operand \a Y are copied: \n
  ///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
  ///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
  ///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
  ///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
  ///    Bits [3:0]: If any of these bits are set, the corresponding result
  ///    element is cleared.
  /// \returns A 128-bit vector of [4 x float] containing the copied
  ///    single-precision floating point elements from the operands.
  #define _mm_insert_ps(X, Y, N) \
    (__builtin_ia32_insertps128((X), (Y), (N)))
  /// \brief Extracts one 32-bit element from a 128-bit vector of [4 x float]
  ///    and returns its bits as an integer; the immediate parameter \a N
  ///    selects the element.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_extract_ps(__m128 X, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
  /// instruction.
  ///
  /// \param X
  ///    A 128-bit vector of [4 x float].
  /// \param N
  ///    An immediate value. Bits [1:0] determine which bits of the argument
  ///    \a X are extracted and returned: \n
  ///    00: Bits [31:0] of parameter \a X are returned. \n
  ///    01: Bits [63:32] of parameter \a X are returned. \n
  ///    10: Bits [95:64] of parameter \a X are returned. \n
  ///    11: Bits [127:96] of parameter \a X are returned.
  /// \returns A 32-bit integer containing the extracted 32 bits of float data.
  #define _mm_extract_ps(X, N) (__extension__ ({ \
    union { float __val; int __bits; } __pun; \
    __v4sf __vec = (__v4sf)(__m128)(X); \
    __pun.__val = __vec[(N) & 3]; \
    __pun.__bits; }))
  /* Miscellaneous insert and extract macros.  */
  /* Extract a single-precision float from X at index N into D.
     Only bits [1:0] of N select the element, the same masking that
     _mm_extract_ps applies, so an index outside 0..3 wraps around instead of
     subscripting past the end of the vector. X is evaluated once; the value
     of the whole expression is the value assigned to D.  */
  #define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ \
    __v4sf __a = (__v4sf)(X); \
    (D) = __a[(N) & 3]; }))
  /* Combine two sets of indexes (X and Y) with the zeroing bits (Z) into a
     control value suitable for _mm_insert_ps: X is shifted into bits [7:6],
     Y into bits [5:4], and Z is OR'ed in unshifted.  */
  #define _MM_MK_INSERTPS_NDX(X, Y, Z) \
    ((Z) | ((Y) << 4) | ((X) << 6))
  /* Extract a float from X at index N into the first index of the return.
     The control value passed to _mm_insert_ps uses N as the source index, 0
     as the destination index, and 0x0e as the zeroing bits; the insertion is
     made into an all-zero vector.  */
  #define _MM_PICK_OUT_PS(X, N) \
    (_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e)))
  /* Insert int into packed integer array at index.  */
  /// \brief Constructs a 128-bit vector of [16 x i8] by copying the 128-bit
  ///    integer vector parameter and then writing the lower 8 bits of the
  ///    integer parameter \a I at the position chosen by the immediate value
  ///    parameter \a N.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
  ///
  /// \param X
  ///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
  ///    result and then one of the sixteen elements in the result vector is
  ///    replaced by the lower 8 bits of \a I.
  /// \param I
  ///    An integer. The lower 8 bits of this operand are written to the result
  ///    at the position specified by \a N.
  /// \param N
  ///    An immediate value. Bits [3:0] select which 8-bit element of the
  ///    result receives the lower 8 bits of \a I: a value k, from 0000
  ///    through 1111, selects bits [8k+7:8k] of the result. \n
  ///    0000: Bits [7:0] of the result are used for insertion. \n
  ///    0001: Bits [15:8] of the result are used for insertion. \n
  ///    ... \n
  ///    1110: Bits [119:112] of the result are used for insertion. \n
  ///    1111: Bits [127:120] of the result are used for insertion.
  /// \returns A 128-bit integer vector containing the constructed values.
  #define _mm_insert_epi8(X, I, N) (__extension__ ({ \
    __v16qi __bytes = (__v16qi)(__m128i)(X); \
    __bytes[(N) & 0xf] = (I); \
    (__m128i)__bytes; }))
  /// \brief Constructs a 128-bit vector of [4 x i32] by copying the 128-bit
  ///    integer vector parameter and then writing the 32-bit integer
  ///    parameter \a I at the position chosen by the immediate value
  ///    parameter \a N.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
  ///
  /// \param X
  ///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
  ///    result and then one of the four elements in the result vector is
  ///    replaced by \a I.
  /// \param I
  ///    A 32-bit integer that is written to the result at the position
  ///    specified by \a N.
  /// \param N
  ///    An immediate value. Bits [1:0] select which 32-bit element of the
  ///    result receives the integer \a I: \n
  ///    00: Bits [31:0] of the result are used for insertion. \n
  ///    01: Bits [63:32] of the result are used for insertion. \n
  ///    10: Bits [95:64] of the result are used for insertion. \n
  ///    11: Bits [127:96] of the result are used for insertion.
  /// \returns A 128-bit integer vector containing the constructed values.
  #define _mm_insert_epi32(X, I, N) (__extension__ ({ \
    __v4si __dwords = (__v4si)(__m128i)(X); \
    __dwords[(N) & 0x3] = (I); \
    (__m128i)__dwords; }))
  #ifdef __x86_64__
  /// \brief Constructs a 128-bit vector of [2 x i64] by copying the 128-bit
  ///    integer vector parameter and then writing the 64-bit integer
  ///    parameter \a I at the position chosen by the immediate value
  ///    parameter \a N.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
  ///
  /// \param X
  ///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
  ///    result and then one of the two elements in the result vector is
  ///    replaced by \a I.
  /// \param I
  ///    A 64-bit integer that is written to the result at the position
  ///    specified by \a N.
  /// \param N
  ///    An immediate value. Bit [0] selects which 64-bit element of the result
  ///    receives the integer \a I: \n
  ///    0: Bits [63:0] of the result are used for insertion. \n
  ///    1: Bits [127:64] of the result are used for insertion.
  /// \returns A 128-bit integer vector containing the constructed values.
  #define _mm_insert_epi64(X, I, N) (__extension__ ({ \
    __v2di __qwords = (__v2di)(__m128i)(X); \
    __qwords[(N) & 0x1] = (I); \
    (__m128i)__qwords; }))
  #endif /* __x86_64__ */
  /* Extract int from packed integer array at index.  This returns the element
   * as a zero extended value, so it is unsigned.
   */
  /// \brief Extracts one 8-bit element from a 128-bit integer vector of
  ///    [16 x i8]; the immediate value parameter \a N selects the element.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_extract_epi8(__m128i X, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
  ///
  /// \param X
  ///    A 128-bit integer vector.
  /// \param N
  ///    An immediate value. Bits [3:0] select which 8-bit vector element of
  ///    the argument \a X is extracted and copied to the result: a value k,
  ///    from 0000 through 1111, selects bits [8k+7:8k] of \a X. \n
  ///    0000: Bits [7:0] of parameter \a X are extracted. \n
  ///    0001: Bits [15:8] of the parameter \a X are extracted. \n
  ///    ... \n
  ///    1110: Bits [119:112] of the parameter \a X are extracted. \n
  ///    1111: Bits [127:120] of the parameter \a X are extracted.
  /// \returns An unsigned integer, whose lower 8 bits are selected from the
  ///    128-bit integer vector parameter and the remaining bits are assigned
  ///    zeros.
  #define _mm_extract_epi8(X, N) (__extension__ ({ \
    __v16qi __bytes = (__v16qi)(__m128i)(X); \
    (int)(unsigned char)__bytes[(N) & 0xf]; }))
  /// \brief Extracts one 32-bit element from a 128-bit integer vector of
  ///    [4 x i32]; the immediate value parameter \a N selects the element.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_extract_epi32(__m128i X, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
  ///
  /// \param X
  ///    A 128-bit integer vector.
  /// \param N
  ///    An immediate value. Bits [1:0] select which 32-bit vector element of
  ///    the argument \a X is extracted and copied to the result: \n
  ///    00: Bits [31:0] of the parameter \a X are extracted. \n
  ///    01: Bits [63:32] of the parameter \a X are extracted. \n
  ///    10: Bits [95:64] of the parameter \a X are extracted. \n
  ///    11: Bits [127:96] of the parameter \a X are extracted.
  /// \returns An integer holding the 32-bit element selected from the 128-bit
  ///    integer vector parameter.
  #define _mm_extract_epi32(X, N) (__extension__ ({ \
    __v4si __dwords = (__v4si)(__m128i)(X); \
    (int)__dwords[(N) & 0x3]; }))
  #ifdef __x86_64__
  /// \brief Extracts one 64-bit element from a 128-bit integer vector of
  ///    [2 x i64]; the immediate value parameter \a N selects the element.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// long long _mm_extract_epi64(__m128i X, const int N);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
  ///
  /// \param X
  ///    A 128-bit integer vector.
  /// \param N
  ///    An immediate value. Bit [0] selects which 64-bit vector element of
  ///    the argument \a X is returned: \n
  ///    0: Bits [63:0] are returned. \n
  ///    1: Bits [127:64] are returned.
  /// \returns A 64-bit integer.
  #define _mm_extract_epi64(X, N) (__extension__ ({ \
    __v2di __qwords = (__v2di)(__m128i)(X); \
    (long long)__qwords[(N) & 0x1]; }))
  #endif /* __x86_64__ */
  1070. /* SSE4 128-bit Packed Integer Comparisons. */
  1071. /// \brief Tests whether the specified bits in a 128-bit integer vector are all
  1072. /// zeros.
  1073. ///
  1074. /// \headerfile <x86intrin.h>
  1075. ///
  1076. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1077. ///
  1078. /// \param __M
  1079. /// A 128-bit integer vector containing the bits to be tested.
  1080. /// \param __V
  1081. /// A 128-bit integer vector selecting which bits to test in operand \a __M.
  1082. /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
  1083. static __inline__ int __DEFAULT_FN_ATTRS
  1084. _mm_testz_si128(__m128i __M, __m128i __V)
  1085. {
  1086. return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
  1087. }
  1088. /// \brief Tests whether the specified bits in a 128-bit integer vector are all
  1089. /// ones.
  1090. ///
  1091. /// \headerfile <x86intrin.h>
  1092. ///
  1093. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1094. ///
  1095. /// \param __M
  1096. /// A 128-bit integer vector containing the bits to be tested.
  1097. /// \param __V
  1098. /// A 128-bit integer vector selecting which bits to test in operand \a __M.
  1099. /// \returns TRUE if the specified bits are all ones; FALSE otherwise.
  1100. static __inline__ int __DEFAULT_FN_ATTRS
  1101. _mm_testc_si128(__m128i __M, __m128i __V)
  1102. {
  1103. return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
  1104. }
  1105. /// \brief Tests whether the specified bits in a 128-bit integer vector are
  1106. /// neither all zeros nor all ones.
  1107. ///
  1108. /// \headerfile <x86intrin.h>
  1109. ///
  1110. /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  1111. ///
  1112. /// \param __M
  1113. /// A 128-bit integer vector containing the bits to be tested.
  1114. /// \param __V
  1115. /// A 128-bit integer vector selecting which bits to test in operand \a __M.
  1116. /// \returns TRUE if the specified bits are neither all zeros nor all ones;
  1117. /// FALSE otherwise.
  1118. static __inline__ int __DEFAULT_FN_ATTRS
  1119. _mm_testnzc_si128(__m128i __M, __m128i __V)
  1120. {
  1121. return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
  1122. }
  /// \brief Tests whether the specified bits in a 128-bit integer vector are
  ///    all ones.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_test_all_ones(__m128i V);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  ///
  /// \param V
  ///    A 128-bit integer vector containing the bits to be tested. The
  ///    argument expression is evaluated exactly once.
  /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
  ///    otherwise.
  #define _mm_test_all_ones(V) (__extension__ ({ \
    /* Capture V once; it is needed both as the tested operand and to build */ \
    /* the selector by comparing it with itself. */ \
    __m128i __val = (V); \
    _mm_testc_si128(__val, _mm_cmpeq_epi32(__val, __val)); }))
  /// \brief Tests whether the specified bits in a 128-bit integer vector are
  ///    neither all zeros nor all ones.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  ///
  /// \param M
  ///    A 128-bit integer vector containing the bits to be tested.
  /// \param V
  ///    A 128-bit integer vector selecting which bits to test in operand \a M.
  /// \returns TRUE if the specified bits are neither all zeros nor all ones;
  ///    FALSE otherwise.
  #define _mm_test_mix_ones_zeros(M, V) \
    (_mm_testnzc_si128((M), (V)))
  /// \brief Tests whether the specified bits in a 128-bit integer vector are
  ///    all zeros.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_test_all_zeros(__m128i M, __m128i V);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
  ///
  /// \param M
  ///    A 128-bit integer vector containing the bits to be tested.
  /// \param V
  ///    A 128-bit integer vector selecting which bits to test in operand \a M.
  /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
  #define _mm_test_all_zeros(M, V) \
    (_mm_testz_si128((M), (V)))
  1174. /* SSE4 64-bit Packed Integer Comparisons. */
  1175. /// \brief Compares each of the corresponding 64-bit values of the 128-bit
  1176. /// integer vectors for equality.
  1177. ///
  1178. /// \headerfile <x86intrin.h>
  1179. ///
  1180. /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
  1181. ///
  1182. /// \param __V1
  1183. /// A 128-bit integer vector.
  1184. /// \param __V2
  1185. /// A 128-bit integer vector.
  1186. /// \returns A 128-bit integer vector containing the comparison results.
  1187. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1188. _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
  1189. {
  1190. return (__m128i)((__v2di)__V1 == (__v2di)__V2);
  1191. }
  1192. /* SSE4 Packed Integer Sign-Extension. */
  1193. /// \brief Sign-extends each of the lower eight 8-bit integer elements of a
  1194. /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
  1195. /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
  1196. /// are unused.
  1197. ///
  1198. /// \headerfile <x86intrin.h>
  1199. ///
  1200. /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
  1201. ///
  1202. /// \param __V
  1203. /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
  1204. /// extended to 16-bit values.
  1205. /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
  1206. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1207. _mm_cvtepi8_epi16(__m128i __V)
  1208. {
  1209. /* This function always performs a signed extension, but __v16qi is a char
  1210. which may be signed or unsigned, so use __v16qs. */
  1211. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
  1212. }
  1213. /// \brief Sign-extends each of the lower four 8-bit integer elements of a
  1214. /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
  1215. /// 128-bit vector of [4 x i32]. The upper twelve elements of the input
  1216. /// vector are unused.
  1217. ///
  1218. /// \headerfile <x86intrin.h>
  1219. ///
  1220. /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
  1221. ///
  1222. /// \param __V
  1223. /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
  1224. /// extended to 32-bit values.
  1225. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
  1226. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1227. _mm_cvtepi8_epi32(__m128i __V)
  1228. {
  1229. /* This function always performs a signed extension, but __v16qi is a char
  1230. which may be signed or unsigned, so use __v16qs. */
  1231. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
  1232. }
  1233. /// \brief Sign-extends each of the lower two 8-bit integer elements of a
  1234. /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
  1235. /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
  1236. /// vector are unused.
  1237. ///
  1238. /// \headerfile <x86intrin.h>
  1239. ///
  1240. /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
  1241. ///
  1242. /// \param __V
  1243. /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
  1244. /// extended to 64-bit values.
  1245. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
  1246. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1247. _mm_cvtepi8_epi64(__m128i __V)
  1248. {
  1249. /* This function always performs a signed extension, but __v16qi is a char
  1250. which may be signed or unsigned, so use __v16qs. */
  1251. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
  1252. }
  1253. /// \brief Sign-extends each of the lower four 16-bit integer elements of a
  1254. /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
  1255. /// a 128-bit vector of [4 x i32]. The upper four elements of the input
  1256. /// vector are unused.
  1257. ///
  1258. /// \headerfile <x86intrin.h>
  1259. ///
  1260. /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
  1261. ///
  1262. /// \param __V
  1263. /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
  1264. /// extended to 32-bit values.
  1265. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
  1266. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1267. _mm_cvtepi16_epi32(__m128i __V)
  1268. {
  1269. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
  1270. }
  1271. /// \brief Sign-extends each of the lower two 16-bit integer elements of a
  1272. /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
  1273. /// a 128-bit vector of [2 x i64]. The upper six elements of the input
  1274. /// vector are unused.
  1275. ///
  1276. /// \headerfile <x86intrin.h>
  1277. ///
  1278. /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
  1279. ///
  1280. /// \param __V
  1281. /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
  1282. /// extended to 64-bit values.
  1283. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
  1284. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1285. _mm_cvtepi16_epi64(__m128i __V)
  1286. {
  1287. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
  1288. }
  1289. /// \brief Sign-extends each of the lower two 32-bit integer elements of a
  1290. /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
  1291. /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
  1292. /// are unused.
  1293. ///
  1294. /// \headerfile <x86intrin.h>
  1295. ///
  1296. /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
  1297. ///
  1298. /// \param __V
  1299. /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
  1300. /// extended to 64-bit values.
  1301. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
  1302. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1303. _mm_cvtepi32_epi64(__m128i __V)
  1304. {
  1305. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
  1306. }
  1307. /* SSE4 Packed Integer Zero-Extension. */
  1308. /// \brief Zero-extends each of the lower eight 8-bit integer elements of a
  1309. /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
  1310. /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
  1311. /// are unused.
  1312. ///
  1313. /// \headerfile <x86intrin.h>
  1314. ///
  1315. /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
  1316. ///
  1317. /// \param __V
  1318. /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
  1319. /// extended to 16-bit values.
  1320. /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
  1321. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1322. _mm_cvtepu8_epi16(__m128i __V)
  1323. {
  1324. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
  1325. }
  1326. /// \brief Zero-extends each of the lower four 8-bit integer elements of a
  1327. /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
  1328. /// 128-bit vector of [4 x i32]. The upper twelve elements of the input
  1329. /// vector are unused.
  1330. ///
  1331. /// \headerfile <x86intrin.h>
  1332. ///
  1333. /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
  1334. ///
  1335. /// \param __V
  1336. /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
  1337. /// extended to 32-bit values.
  1338. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
  1339. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1340. _mm_cvtepu8_epi32(__m128i __V)
  1341. {
  1342. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
  1343. }
  1344. /// \brief Zero-extends each of the lower two 8-bit integer elements of a
  1345. /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
  1346. /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
  1347. /// vector are unused.
  1348. ///
  1349. /// \headerfile <x86intrin.h>
  1350. ///
  1351. /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
  1352. ///
  1353. /// \param __V
  1354. /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
  1355. /// extended to 64-bit values.
  1356. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
  1357. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1358. _mm_cvtepu8_epi64(__m128i __V)
  1359. {
  1360. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
  1361. }
  1362. /// \brief Zero-extends each of the lower four 16-bit integer elements of a
  1363. /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
  1364. /// a 128-bit vector of [4 x i32]. The upper four elements of the input
  1365. /// vector are unused.
  1366. ///
  1367. /// \headerfile <x86intrin.h>
  1368. ///
  1369. /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
  1370. ///
  1371. /// \param __V
  1372. /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
  1373. /// extended to 32-bit values.
  1374. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
  1375. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1376. _mm_cvtepu16_epi32(__m128i __V)
  1377. {
  1378. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
  1379. }
  1380. /// \brief Zero-extends each of the lower two 16-bit integer elements of a
  1381. /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
  1382. /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
  1383. /// are unused.
  1384. ///
  1385. /// \headerfile <x86intrin.h>
  1386. ///
  1387. /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
  1388. ///
  1389. /// \param __V
  1390. /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
  1391. /// extended to 64-bit values.
  1392. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
  1393. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1394. _mm_cvtepu16_epi64(__m128i __V)
  1395. {
  1396. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
  1397. }
  1398. /// \brief Zero-extends each of the lower two 32-bit integer elements of a
  1399. /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
  1400. /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
  1401. /// are unused.
  1402. ///
  1403. /// \headerfile <x86intrin.h>
  1404. ///
  1405. /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
  1406. ///
  1407. /// \param __V
  1408. /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
  1409. /// extended to 64-bit values.
  1410. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
  1411. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1412. _mm_cvtepu32_epi64(__m128i __V)
  1413. {
  1414. return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
  1415. }
  1416. /* SSE4 Pack with Unsigned Saturation. */
  1417. /// \brief Converts 32-bit signed integers from both 128-bit integer vector
  1418. /// operands into 16-bit unsigned integers, and returns the packed result.
  1419. /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
  1420. /// 0x0000 are saturated to 0x0000.
  1421. ///
  1422. /// \headerfile <x86intrin.h>
  1423. ///
  1424. /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
  1425. ///
  1426. /// \param __V1
  1427. /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
  1428. /// signed integer and is converted to a 16-bit unsigned integer with
  1429. /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
  1430. /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
  1431. /// are written to the lower 64 bits of the result.
  1432. /// \param __V2
  1433. /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
  1434. /// signed integer and is converted to a 16-bit unsigned integer with
  1435. /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
  1436. /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
  1437. /// are written to the higher 64 bits of the result.
  1438. /// \returns A 128-bit vector of [8 x i16] containing the converted values.
  1439. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1440. _mm_packus_epi32(__m128i __V1, __m128i __V2)
  1441. {
  1442. return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
  1443. }
  1444. /* SSE4 Multiple Packed Sums of Absolute Difference. */
  1445. /// \brief Subtracts 8-bit unsigned integer values and computes the absolute
  1446. /// values of the differences to the corresponding bits in the destination.
  1447. /// Then sums of the absolute differences are returned according to the bit
  1448. /// fields in the immediate operand.
  1449. ///
  1450. /// \headerfile <x86intrin.h>
  1451. ///
  1452. /// \code
  1453. /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
  1454. /// \endcode
  1455. ///
  1456. /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
  1457. ///
  1458. /// \param X
  1459. /// A 128-bit vector of [16 x i8].
  1460. /// \param Y
  1461. /// A 128-bit vector of [16 x i8].
  1462. /// \param M
  1463. /// An 8-bit immediate operand specifying how the absolute differences are to
  1464. /// be calculated, according to the following algorithm:
  1465. /// \code
  1466. /// // M2 represents bit 2 of the immediate operand
  1467. /// // M10 represents bits [1:0] of the immediate operand
  1468. /// i = M2 * 4
  1469. /// j = M10 * 4
  1470. /// for (k = 0; k < 8; k = k + 1) {
  1471. /// d0 = abs(X[i + k + 0] - Y[j + 0])
  1472. /// d1 = abs(X[i + k + 1] - Y[j + 1])
  1473. /// d2 = abs(X[i + k + 2] - Y[j + 2])
  1474. /// d3 = abs(X[i + k + 3] - Y[j + 3])
  1475. /// r[k] = d0 + d1 + d2 + d3
  1476. /// }
  1477. /// \endcode
  1478. /// \returns A 128-bit integer vector containing the sums of the sets of
  1479. /// absolute differences between both operands.
  1480. #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
  1481. (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
  1482. (__v16qi)(__m128i)(Y), (M)); })
  1483. /// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
  1484. /// vector of [8 x u16] and returns it and along with its index.
  1485. ///
  1486. /// \headerfile <x86intrin.h>
  1487. ///
  1488. /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
  1489. /// instruction.
  1490. ///
  1491. /// \param __V
  1492. /// A 128-bit vector of [8 x u16].
  1493. /// \returns A 128-bit value where bits [15:0] contain the minimum value found
  1494. /// in parameter \a __V, bits [18:16] contain the index of the minimum value
  1495. /// and the remaining bits are set to 0.
  1496. static __inline__ __m128i __DEFAULT_FN_ATTRS
  1497. _mm_minpos_epu16(__m128i __V)
  1498. {
  1499. return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
  1500. }
  1501. /* Handle the sse4.2 definitions here. */
  1502. /* These definitions are normally in nmmintrin.h, but gcc puts them in here
  1503. so we'll do the same. */
  1504. #undef __DEFAULT_FN_ATTRS
  1505. #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
  1506. /* These specify the type of data that we're comparing. */
  1507. #define _SIDD_UBYTE_OPS 0x00
  1508. #define _SIDD_UWORD_OPS 0x01
  1509. #define _SIDD_SBYTE_OPS 0x02
  1510. #define _SIDD_SWORD_OPS 0x03
  1511. /* These specify the type of comparison operation. */
  1512. #define _SIDD_CMP_EQUAL_ANY 0x00
  1513. #define _SIDD_CMP_RANGES 0x04
  1514. #define _SIDD_CMP_EQUAL_EACH 0x08
  1515. #define _SIDD_CMP_EQUAL_ORDERED 0x0c
  1516. /* These macros specify the polarity of the operation. */
  1517. #define _SIDD_POSITIVE_POLARITY 0x00
  1518. #define _SIDD_NEGATIVE_POLARITY 0x10
  1519. #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
  1520. #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
  1521. /* These macros are used in _mm_cmpXstri() to specify the return. */
  1522. #define _SIDD_LEAST_SIGNIFICANT 0x00
  1523. #define _SIDD_MOST_SIGNIFICANT 0x40
  1524. /* These macros are used in _mm_cmpXstri() to specify the return. */
  1525. #define _SIDD_BIT_MASK 0x00
  1526. #define _SIDD_UNIT_MASK 0x40
  1527. /* SSE4.2 Packed Comparison Intrinsics. */
  1528. /// \brief Uses the immediate operand \a M to perform a comparison of string
  1529. /// data with implicitly defined lengths that is contained in source operands
  1530. /// \a A and \a B. Returns a 128-bit integer vector representing the result
  1531. /// mask of the comparison.
  1532. ///
  1533. /// \headerfile <x86intrin.h>
  1534. ///
  1535. /// \code
  1536. /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
  1537. /// \endcode
  1538. ///
  1539. /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
  1540. /// instruction.
  1541. ///
  1542. /// \param A
  1543. /// A 128-bit integer vector containing one of the source operands to be
  1544. /// compared.
  1545. /// \param B
  1546. /// A 128-bit integer vector containing one of the source operands to be
  1547. /// compared.
  1548. /// \param M
  1549. /// An 8-bit immediate operand specifying whether the characters are bytes or
  1550. /// words, the type of comparison to perform, and the format of the return
  1551. /// value. \n
  1552. /// Bits [1:0]: Determine source data format. \n
  1553. /// 00: 16 unsigned bytes \n
  1554. /// 01: 8 unsigned words \n
  1555. /// 10: 16 signed bytes \n
  1556. /// 11: 8 signed words \n
  1557. /// Bits [3:2]: Determine comparison type and aggregation method. \n
  1558. /// 00: Subset: Each character in \a B is compared for equality with all
  1559. /// the characters in \a A. \n
  1560. /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
  1561. /// basis is greater than or equal for even-indexed elements in \a A,
  1562. /// and less than or equal for odd-indexed elements in \a A. \n
  1563. /// 10: Match: Compare each pair of corresponding characters in \a A and
  1564. /// \a B for equality. \n
  1565. /// 11: Substring: Search \a B for substring matches of \a A. \n
  1566. /// Bits [5:4]: Determine whether to perform a one's complement on the bit
  1567. /// mask of the comparison results. \n
  1568. /// 00: No effect. \n
  1569. /// 01: Negate the bit mask. \n
  1570. /// 10: No effect. \n
  1571. /// 11: Negate the bit mask only for bits with an index less than or equal
  1572. /// to the size of \a A or \a B. \n
  1573. /// Bit [6]: Determines whether the result is zero-extended or expanded to 16
  1574. /// bytes. \n
  1575. /// 0: The result is zero-extended to 16 bytes. \n
  1576. /// 1: The result is expanded to 16 bytes (this expansion is performed by
  1577. /// repeating each bit 8 or 16 times).
  1578. /// \returns Returns a 128-bit integer vector representing the result mask of
  1579. /// the comparison.
  1580. #define _mm_cmpistrm(A, B, M) \
  1581. (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
  1582. (__v16qi)(__m128i)(B), (int)(M))
  1583. /// \brief Uses the immediate operand \a M to perform a comparison of string
  1584. /// data with implicitly defined lengths that is contained in source operands
  1585. /// \a A and \a B. Returns an integer representing the result index of the
  1586. /// comparison.
  1587. ///
  1588. /// \headerfile <x86intrin.h>
  1589. ///
  1590. /// \code
  1591. /// int _mm_cmpistri(__m128i A, __m128i B, const int M);
  1592. /// \endcode
  1593. ///
  1594. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1595. /// instruction.
  1596. ///
  1597. /// \param A
  1598. /// A 128-bit integer vector containing one of the source operands to be
  1599. /// compared.
  1600. /// \param B
  1601. /// A 128-bit integer vector containing one of the source operands to be
  1602. /// compared.
  1603. /// \param M
  1604. /// An 8-bit immediate operand specifying whether the characters are bytes or
  1605. /// words, the type of comparison to perform, and the format of the return
  1606. /// value. \n
  1607. /// Bits [1:0]: Determine source data format. \n
  1608. /// 00: 16 unsigned bytes \n
  1609. /// 01: 8 unsigned words \n
  1610. /// 10: 16 signed bytes \n
  1611. /// 11: 8 signed words \n
  1612. /// Bits [3:2]: Determine comparison type and aggregation method. \n
  1613. /// 00: Subset: Each character in \a B is compared for equality with all
  1614. /// the characters in \a A. \n
  1615. /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
  1616. /// basis is greater than or equal for even-indexed elements in \a A,
  1617. /// and less than or equal for odd-indexed elements in \a A. \n
  1618. /// 10: Match: Compare each pair of corresponding characters in \a A and
  1619. /// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
  1621. /// Bits [5:4]: Determine whether to perform a one's complement on the bit
  1622. /// mask of the comparison results. \n
  1623. /// 00: No effect. \n
  1624. /// 01: Negate the bit mask. \n
  1625. /// 10: No effect. \n
  1626. /// 11: Negate the bit mask only for bits with an index less than or equal
  1627. /// to the size of \a A or \a B. \n
  1628. /// Bit [6]: Determines whether the index of the lowest set bit or the
  1629. /// highest set bit is returned. \n
  1630. /// 0: The index of the least significant set bit. \n
  1631. /// 1: The index of the most significant set bit. \n
  1632. /// \returns Returns an integer representing the result index of the comparison.
  1633. #define _mm_cmpistri(A, B, M) \
  1634. (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
  1635. (__v16qi)(__m128i)(B), (int)(M))
  1636. /// \brief Uses the immediate operand \a M to perform a comparison of string
  1637. /// data with explicitly defined lengths that is contained in source operands
  1638. /// \a A and \a B. Returns a 128-bit integer vector representing the result
  1639. /// mask of the comparison.
  1640. ///
  1641. /// \headerfile <x86intrin.h>
  1642. ///
  1643. /// \code
  1644. /// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
  1645. /// \endcode
  1646. ///
  1647. /// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
  1648. /// instruction.
  1649. ///
  1650. /// \param A
  1651. /// A 128-bit integer vector containing one of the source operands to be
  1652. /// compared.
  1653. /// \param LA
  1654. /// An integer that specifies the length of the string in \a A.
  1655. /// \param B
  1656. /// A 128-bit integer vector containing one of the source operands to be
  1657. /// compared.
  1658. /// \param LB
  1659. /// An integer that specifies the length of the string in \a B.
  1660. /// \param M
  1661. /// An 8-bit immediate operand specifying whether the characters are bytes or
  1662. /// words, the type of comparison to perform, and the format of the return
  1663. /// value. \n
  1664. /// Bits [1:0]: Determine source data format. \n
  1665. /// 00: 16 unsigned bytes \n
  1666. /// 01: 8 unsigned words \n
  1667. /// 10: 16 signed bytes \n
  1668. /// 11: 8 signed words \n
  1669. /// Bits [3:2]: Determine comparison type and aggregation method. \n
  1670. /// 00: Subset: Each character in \a B is compared for equality with all
  1671. /// the characters in \a A. \n
  1672. /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
  1673. /// basis is greater than or equal for even-indexed elements in \a A,
  1674. /// and less than or equal for odd-indexed elements in \a A. \n
  1675. /// 10: Match: Compare each pair of corresponding characters in \a A and
  1676. /// \a B for equality. \n
  1677. /// 11: Substring: Search \a B for substring matches of \a A. \n
  1678. /// Bits [5:4]: Determine whether to perform a one's complement on the bit
  1679. /// mask of the comparison results. \n
  1680. /// 00: No effect. \n
  1681. /// 01: Negate the bit mask. \n
  1682. /// 10: No effect. \n
  1683. /// 11: Negate the bit mask only for bits with an index less than or equal
  1684. /// to the size of \a A or \a B. \n
  1685. /// Bit [6]: Determines whether the result is zero-extended or expanded to 16
  1686. /// bytes. \n
  1687. /// 0: The result is zero-extended to 16 bytes. \n
  1688. /// 1: The result is expanded to 16 bytes (this expansion is performed by
  1689. /// repeating each bit 8 or 16 times). \n
  1690. /// \returns Returns a 128-bit integer vector representing the result mask of
  1691. /// the comparison.
  1692. #define _mm_cmpestrm(A, LA, B, LB, M) \
  1693. (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
  1694. (__v16qi)(__m128i)(B), (int)(LB), \
  1695. (int)(M))
  1696. /// \brief Uses the immediate operand \a M to perform a comparison of string
  1697. /// data with explicitly defined lengths that is contained in source operands
  1698. /// \a A and \a B. Returns an integer representing the result index of the
  1699. /// comparison.
  1700. ///
  1701. /// \headerfile <x86intrin.h>
  1702. ///
  1703. /// \code
  1704. /// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
  1705. /// \endcode
  1706. ///
  1707. /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  1708. /// instruction.
  1709. ///
  1710. /// \param A
  1711. /// A 128-bit integer vector containing one of the source operands to be
  1712. /// compared.
  1713. /// \param LA
  1714. /// An integer that specifies the length of the string in \a A.
  1715. /// \param B
  1716. /// A 128-bit integer vector containing one of the source operands to be
  1717. /// compared.
  1718. /// \param LB
  1719. /// An integer that specifies the length of the string in \a B.
  1720. /// \param M
  1721. /// An 8-bit immediate operand specifying whether the characters are bytes or
  1722. /// words, the type of comparison to perform, and the format of the return
  1723. /// value. \n
  1724. /// Bits [1:0]: Determine source data format. \n
  1725. /// 00: 16 unsigned bytes \n
  1726. /// 01: 8 unsigned words \n
  1727. /// 10: 16 signed bytes \n
  1728. /// 11: 8 signed words \n
  1729. /// Bits [3:2]: Determine comparison type and aggregation method. \n
  1730. /// 00: Subset: Each character in \a B is compared for equality with all
  1731. /// the characters in \a A. \n
  1732. /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
  1733. /// basis is greater than or equal for even-indexed elements in \a A,
  1734. /// and less than or equal for odd-indexed elements in \a A. \n
  1735. /// 10: Match: Compare each pair of corresponding characters in \a A and
  1736. /// \a B for equality. \n
/// 11: Substring: Search \a B for substring matches of \a A. \n
  1738. /// Bits [5:4]: Determine whether to perform a one's complement on the bit
  1739. /// mask of the comparison results. \n
  1740. /// 00: No effect. \n
  1741. /// 01: Negate the bit mask. \n
  1742. /// 10: No effect. \n
  1743. /// 11: Negate the bit mask only for bits with an index less than or equal
  1744. /// to the size of \a A or \a B. \n
  1745. /// Bit [6]: Determines whether the index of the lowest set bit or the
  1746. /// highest set bit is returned. \n
  1747. /// 0: The index of the least significant set bit. \n
  1748. /// 1: The index of the most significant set bit. \n
  1749. /// \returns Returns an integer representing the result index of the comparison.
  1750. #define _mm_cmpestri(A, LA, B, LB, M) \
  1751. (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
  1752. (__v16qi)(__m128i)(B), (int)(LB), \
  1753. (int)(M))
  1754. /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
  1755. /// \brief Uses the immediate operand \a M to perform a comparison of string
  1756. /// data with implicitly defined lengths that is contained in source operands
  1757. /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
  1758. /// string in \a B is the maximum, otherwise, returns 0.
  1759. ///
  1760. /// \headerfile <x86intrin.h>
  1761. ///
  1762. /// \code
  1763. /// int _mm_cmpistra(__m128i A, __m128i B, const int M);
  1764. /// \endcode
  1765. ///
  1766. /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  1767. /// instruction.
  1768. ///
  1769. /// \param A
  1770. /// A 128-bit integer vector containing one of the source operands to be
  1771. /// compared.
  1772. /// \param B
  1773. /// A 128-bit integer vector containing one of the source operands to be
  1774. /// compared.
  1775. /// \param M
  1776. /// An 8-bit immediate operand specifying whether the characters are bytes or
  1777. /// words and the type of comparison to perform. \n
  1778. /// Bits [1:0]: Determine source data format. \n
  1779. /// 00: 16 unsigned bytes \n
  1780. /// 01: 8 unsigned words \n
  1781. /// 10: 16 signed bytes \n
  1782. /// 11: 8 signed words \n
  1783. /// Bits [3:2]: Determine comparison type and aggregation method. \n
  1784. /// 00: Subset: Each character in \a B is compared for equality with all
  1785. /// the characters in \a A. \n
  1786. /// 01: Ranges: Each character in \a B is compared to \a A. The comparison
  1787. /// basis is greater than or equal for even-indexed elements in \a A,
  1788. /// and less than or equal for odd-indexed elements in \a A. \n
  1789. /// 10: Match: Compare each pair of corresponding characters in \a A and
  1790. /// \a B for equality. \n
  1791. /// 11: Substring: Search \a B for substring matches of \a A. \n
  1792. /// Bits [5:4]: Determine whether to perform a one's complement on the bit
  1793. /// mask of the comparison results. \n
  1794. /// 00: No effect. \n
  1795. /// 01: Negate the bit mask. \n
  1796. /// 10: No effect. \n
  1797. /// 11: Negate the bit mask only for bits with an index less than or equal
  1798. /// to the size of \a A or \a B. \n
  1799. /// \returns Returns 1 if the bit mask is zero and the length of the string in
  1800. /// \a B is the maximum; otherwise, returns 0.
  /* The whole expansion is parenthesized so that the macro behaves as a single
     primary expression wherever it is used (for example as the operand of
     sizeof or next to a higher-precedence operator). */
  #define _mm_cmpistra(A, B, M) \
    ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                       (__v16qi)(__m128i)(B), (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with implicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the bit mask is non-zero; otherwise, returns
  ///    0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the bit mask is non-zero; otherwise, 0.
  #define _mm_cmpistrc(A, B, M) \
    ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                       (__v16qi)(__m128i)(B), (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with implicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpistro(__m128i A, __m128i B, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns Bit 0 of the resulting bit mask.
  #define _mm_cmpistro(A, B, M) \
    ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                       (__v16qi)(__m128i)(B), (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with implicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
  ///    the maximum; otherwise, returns 0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the length of the string in \a A is less than the maximum;
  ///    otherwise, 0.
  #define _mm_cmpistrs(A, B, M) \
    ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                       (__v16qi)(__m128i)(B), (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with implicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
  ///    the maximum; otherwise, returns 0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the length of the string in \a B is less than the maximum;
  ///    otherwise, 0.
  #define _mm_cmpistrz(A, B, M) \
    ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                       (__v16qi)(__m128i)(B), (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with explicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
  ///    string in \a B is the maximum; otherwise, returns 0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LA
  ///    An integer that specifies the length of the string in \a A.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LB
  ///    An integer that specifies the length of the string in \a B.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the bit mask is zero and the length of the string in \a B is
  ///    the maximum; otherwise, 0.
  #define _mm_cmpestra(A, LA, B, LB, M) \
    ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                       (__v16qi)(__m128i)(B), (int)(LB), \
                                       (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with explicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the resulting mask is non-zero; otherwise,
  ///    returns 0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LA
  ///    An integer that specifies the length of the string in \a A.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LB
  ///    An integer that specifies the length of the string in \a B.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the resulting mask is non-zero; otherwise, 0.
  #define _mm_cmpestrc(A, LA, B, LB, M) \
    ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                       (__v16qi)(__m128i)(B), (int)(LB), \
                                       (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with explicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LA
  ///    An integer that specifies the length of the string in \a A.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LB
  ///    An integer that specifies the length of the string in \a B.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns Bit 0 of the resulting bit mask.
  #define _mm_cmpestro(A, LA, B, LB, M) \
    ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                       (__v16qi)(__m128i)(B), (int)(LB), \
                                       (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with explicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
  ///    the maximum; otherwise, returns 0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LA
  ///    An integer that specifies the length of the string in \a A.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LB
  ///    An integer that specifies the length of the string in \a B.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the length of the string in \a A is less than the maximum;
  ///    otherwise, 0.
  #define _mm_cmpestrs(A, LA, B, LB, M) \
    ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                       (__v16qi)(__m128i)(B), (int)(LB), \
                                       (int)(M)))
  /// \brief Uses the immediate operand \a M to perform a comparison of string
  ///    data with explicitly defined lengths that is contained in source operands
  ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
  ///    the maximum; otherwise, returns 0.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// \code
  /// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
  /// \endcode
  ///
  /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
  /// instruction.
  ///
  /// \param A
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LA
  ///    An integer that specifies the length of the string in \a A.
  /// \param B
  ///    A 128-bit integer vector containing one of the source operands to be
  ///    compared.
  /// \param LB
  ///    An integer that specifies the length of the string in \a B.
  /// \param M
  ///    An 8-bit immediate operand specifying whether the characters are bytes or
  ///    words and the type of comparison to perform. \n
  ///    Bits [1:0]: Determine source data format. \n
  ///      00: 16 unsigned bytes \n
  ///      01: 8 unsigned words \n
  ///      10: 16 signed bytes \n
  ///      11: 8 signed words \n
  ///    Bits [3:2]: Determine comparison type and aggregation method. \n
  ///      00: Subset: Each character in \a B is compared for equality with all
  ///          the characters in \a A. \n
  ///      01: Ranges: Each character in \a B is compared to \a A. The comparison
  ///          basis is greater than or equal for even-indexed elements in \a A,
  ///          and less than or equal for odd-indexed elements in \a A. \n
  ///      10: Match: Compare each pair of corresponding characters in \a A and
  ///          \a B for equality. \n
  ///      11: Substring: Search \a B for substring matches of \a A. \n
  ///    Bits [5:4]: Determine whether to perform a one's complement on the bit
  ///                mask of the comparison results. \n
  ///      00: No effect. \n
  ///      01: Negate the bit mask. \n
  ///      10: No effect. \n
  ///      11: Negate the bit mask only for bits with an index less than or equal
  ///          to the size of \a A or \a B.
  /// \returns 1 if the length of the string in \a B is less than the maximum;
  ///    otherwise, 0.
  #define _mm_cmpestrz(A, LA, B, LB, M) \
    ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                       (__v16qi)(__m128i)(B), (int)(LB), \
                                       (int)(M)))
  2263. /* SSE4.2 Compare Packed Data -- Greater Than. */
  2264. /// \brief Compares each of the corresponding 64-bit values of the 128-bit
  2265. /// integer vectors to determine if the values in the first operand are
  2266. /// greater than those in the second operand.
  2267. ///
  2268. /// \headerfile <x86intrin.h>
  2269. ///
  2270. /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
  2271. ///
  2272. /// \param __V1
  2273. /// A 128-bit integer vector.
  2274. /// \param __V2
  2275. /// A 128-bit integer vector.
  2276. /// \returns A 128-bit integer vector containing the comparison results.
  2277. static __inline__ __m128i __DEFAULT_FN_ATTRS
  2278. _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
  2279. {
  2280. return (__m128i)((__v2di)__V1 > (__v2di)__V2);
  2281. }
  2282. /* SSE4.2 Accumulate CRC32. */
  2283. /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
  2284. /// unsigned char operand.
  2285. ///
  2286. /// \headerfile <x86intrin.h>
  2287. ///
  2288. /// This intrinsic corresponds to the <c> CRC32B </c> instruction.
  2289. ///
  2290. /// \param __C
  2291. /// An unsigned integer operand to add to the CRC-32C checksum of operand
  2292. /// \a __D.
  2293. /// \param __D
  2294. /// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
  2295. /// \returns The result of adding operand \a __C to the CRC-32C checksum of
  2296. /// operand \a __D.
  2297. static __inline__ unsigned int __DEFAULT_FN_ATTRS
  2298. _mm_crc32_u8(unsigned int __C, unsigned char __D)
  2299. {
  2300. return __builtin_ia32_crc32qi(__C, __D);
  2301. }
  2302. /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
  2303. /// unsigned short operand.
  2304. ///
  2305. /// \headerfile <x86intrin.h>
  2306. ///
  2307. /// This intrinsic corresponds to the <c> CRC32W </c> instruction.
  2308. ///
  2309. /// \param __C
  2310. /// An unsigned integer operand to add to the CRC-32C checksum of operand
  2311. /// \a __D.
  2312. /// \param __D
  2313. /// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
  2314. /// \returns The result of adding operand \a __C to the CRC-32C checksum of
  2315. /// operand \a __D.
  2316. static __inline__ unsigned int __DEFAULT_FN_ATTRS
  2317. _mm_crc32_u16(unsigned int __C, unsigned short __D)
  2318. {
  2319. return __builtin_ia32_crc32hi(__C, __D);
  2320. }
  2321. /// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
  2322. /// the second unsigned integer operand.
  2323. ///
  2324. /// \headerfile <x86intrin.h>
  2325. ///
  2326. /// This intrinsic corresponds to the <c> CRC32L </c> instruction.
  2327. ///
  2328. /// \param __C
  2329. /// An unsigned integer operand to add to the CRC-32C checksum of operand
  2330. /// \a __D.
  2331. /// \param __D
  2332. /// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
  2333. /// \returns The result of adding operand \a __C to the CRC-32C checksum of
  2334. /// operand \a __D.
  2335. static __inline__ unsigned int __DEFAULT_FN_ATTRS
  2336. _mm_crc32_u32(unsigned int __C, unsigned int __D)
  2337. {
  2338. return __builtin_ia32_crc32si(__C, __D);
  2339. }
  2340. #ifdef __x86_64__
  2341. /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
  2342. /// unsigned 64-bit integer operand.
  2343. ///
  2344. /// \headerfile <x86intrin.h>
  2345. ///
  2346. /// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
  2347. ///
  2348. /// \param __C
  2349. /// An unsigned integer operand to add to the CRC-32C checksum of operand
  2350. /// \a __D.
  2351. /// \param __D
  2352. /// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
  2353. /// \returns The result of adding operand \a __C to the CRC-32C checksum of
  2354. /// operand \a __D.
  2355. static __inline__ unsigned long long __DEFAULT_FN_ATTRS
  2356. _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
  2357. {
  2358. return __builtin_ia32_crc32di(__C, __D);
  2359. }
  2360. #endif /* __x86_64__ */
  2361. #undef __DEFAULT_FN_ATTRS
  2362. #ifdef __POPCNT__
  2363. #include <popcntintrin.h>
  2364. #endif
  2365. #endif /* _SMMINTRIN_H */