1target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" 2target triple = "aarch64-linux-android" 3 4;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 5;;;;;;;;; INTRINSICS ;;;;;;;;;; 6;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 7 8declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone 9declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone 10declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 11declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 12declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 13declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 14declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 15declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 16 17declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone 18declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone 19declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 20declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 21declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone 22declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone 23declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 24declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone 25 26declare <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i16>, <8 x i16>) nounwind readnone 27declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i32>, <4 x i32>) nounwind readnone 28declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i64>, <2 x i64>) nounwind readnone 29 
; NOTE(review): upstream LLVM defines the sqshrun shift amount as a scalar i32
; operand; the vector second operand declared here should be confirmed against
; the LLVM version in use before these are called.
declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, <2 x i64>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone

declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; HELPERS ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Broadcast ("smear") helpers: each returns a vector whose every lane equals
; %in.  They place %in in lane 0 and replicate that lane with an all-zero
; shuffle mask.  All are internal and alwaysinline.

; float -> <4 x float>
define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x float> undef, float %in, i32 0
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %2
}

; i32 -> <4 x i32>
define internal <4 x i32> @smear_4i(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %2
}

; i16 -> <4 x i16>
define internal <4 x i16> @smear_4s(i16 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i16> undef, i16 %in, i32 0
  %2 = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
  ret <4 x i16> %2
}

; float -> <2 x float>
define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x float> undef, float %in, i32 0
  %2 = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
  ret <2 x float> %2
}

; i32 -> <2 x i32>
define internal <2 x i32> @smear_2i(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x i32> undef, i32 %in, i32 0
  %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
  ret <2 x i32> %2
}

; i16 -> <2 x i16>
define internal <2 x i16> @smear_2s(i16 %in) nounwind readnone alwaysinline {
  %1 = insertelement <2 x i16> undef, i16 %in, i32 0
  %2 = shufflevector <2 x i16> %1, <2 x i16> undef, <2 x i32> zeroinitializer
  ret <2 x i16> %2
}

; i32 -> <4 x i32>.  Same operation as @smear_4i; the separate name is kept
; because the YUV conversion code calls @smear_4i32.
define internal <4 x i32> @smear_4i32(i32 %in) nounwind readnone alwaysinline {
  %1 = insertelement <4 x i32> undef, i32 %in, i32 0
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; CLAMP ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Every clamp overload computes max(min(value, high), low), lane by lane.
; 3-lane operands are widened to 4 lanes (lane 3 undefined) so the 4-lane
; intrinsic can be used, then narrowed back to 3 lanes.
; None of these functions touches memory, hence readnone.

; clamp(float4 value, float4 low, float4 high)
define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readnone {
  %1 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
  %2 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
  ret <4 x float> %2
}

; clamp(float4 value, float low, float high): broadcasts the bounds and
; forwards to the all-vector overload.
define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readnone
  ret <4 x float> %out
}

; clamp(float3 value, float3 low, float3 high)
define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; clamp(float3 value, float low, float high)
define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readnone {
  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
  %a = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
  %b = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; clamp(float2 value, float2 low, float2 high)
define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readnone {
  %1 = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
  %2 = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
  ret <2 x float> %2
}

; clamp(float2 value, float low, float high)
define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readnone {
  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
  %a = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
  %b = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
  ret <2 x float> %b
}

; clamp(float value, float low, float high), scalar compare+select form.
; The compares are ordered, so whenever an operand is NaN the compare is false
; and the second select operand (%high, then %low) is chosen.
; NOTE(review): NaN handling may differ from the vector intrinsic paths above
; -- confirm whether that matters to callers.
define float @_Z5clampfff(float %value, float %low, float %high) nounwind readnone {
  %1 = fcmp olt float %value, %high
  %2 = select i1 %1, float %value, float %high
  %3 = fcmp ogt float %2, %low
  %4 = select i1 %3, float %2, float %low
  ret float %4
}



; clamp(int4 value, int4 low, int4 high), signed
define <4 x i32> @_Z5clampDv4_iS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(int4 value, int low, int high), signed
define <4 x i32> @_Z5clampDv4_iii(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(int3 value, int3 low, int3 high), signed
define <3 x i32> @_Z5clampDv3_iS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(int3 value, int low, int high), signed
define <3 x i32> @_Z5clampDv3_iii(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %a = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(int2 value, int2 low, int2 high), signed
define <2 x i32> @_Z5clampDv2_iS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %2 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %2
}

; clamp(int2 value, int low, int high), signed
define <2 x i32> @_Z5clampDv2_iii(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %a = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
  %b = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
  ret <2 x i32> %b
}



; clamp(uint4 value, uint4 low, uint4 high), unsigned
define <4 x i32> @_Z5clampDv4_jS_S_(<4 x i32> %value, <4 x i32> %low, <4 x i32> %high) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(uint4 value, uint low, uint high), unsigned
define <4 x i32> @_Z5clampDv4_jjj(<4 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %value, <4 x i32> %_high) nounwind readnone
  %2 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %_low) nounwind readnone
  ret <4 x i32> %2
}

; clamp(uint3 value, uint3 low, uint3 high), unsigned
define <3 x i32> @_Z5clampDv3_jS_S_(<3 x i32> %value, <3 x i32> %low, <3 x i32> %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_low = shufflevector <3 x i32> %low, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = shufflevector <3 x i32> %high, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(uint3 value, uint low, uint high), unsigned
define <3 x i32> @_Z5clampDv3_jjj(<3 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_value = shufflevector <3 x i32> %value, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %_high = tail call <4 x i32> @smear_4i(i32 %high) nounwind readnone
  %_low = tail call <4 x i32> @smear_4i(i32 %low) nounwind readnone
  %a = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %_value, <4 x i32> %_high) nounwind readnone
  %b = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %_low) nounwind readnone
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %c
}

; clamp(uint2 value, uint2 low, uint2 high), unsigned
define <2 x i32> @_Z5clampDv2_jS_S_(<2 x i32> %value, <2 x i32> %low, <2 x i32> %high) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %high) nounwind readnone
  %2 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %1, <2 x i32> %low) nounwind readnone
  ret <2 x i32> %2
}

; clamp(uint2 value, uint low, uint high), unsigned
define <2 x i32> @_Z5clampDv2_jjj(<2 x i32> %value, i32 %low, i32 %high) nounwind readnone {
  %_high = tail call <2 x i32> @smear_2i(i32 %high) nounwind readnone
  %_low = tail call <2 x i32> @smear_2i(i32 %low) nounwind readnone
  %a = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %value, <2 x i32> %_high) nounwind readnone
  %b = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %_low) nounwind readnone
  ret <2 x i32> %b
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; FMAX ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Lane-wise float maximum.  Overloads taking a scalar second operand broadcast
; it first; 3-lane operands are widened to 4 lanes and narrowed afterwards.

; fmax(float4, float4)
define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

; fmax(float4, float)
define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

; fmax(float3, float3)
define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

; fmax(float3, float)
define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; fmax(float2, float2)
define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

; fmax(float2, float)
define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; fmax(float, float): returns %v1 when %v1 > %v2 (ordered compare), else %v2.
; If either operand is NaN the compare is false and %v2 is returned.
define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp ogt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; FMIN ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Lane-wise float minimum; same structure as the FMAX overloads.

; fmin(float4, float4)
define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
  ret <4 x float> %1
}

; fmin(float4, float)
define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %2 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %v1, <4 x float> %1) nounwind readnone
  ret <4 x float> %2
}

; fmin(float3, float3)
define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %4
}

; fmin(float3, float)
define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
  %3 = tail call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %1, <4 x float> %2) nounwind readnone
  %c = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x float> %c
}

; fmin(float2, float2)
define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
  ret <2 x float> %1
}

; fmin(float2, float)
define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
  %2 = tail call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %v1, <2 x float> %1) nounwind readnone
  ret <2 x float> %2
}

; fmin(float, float): returns %v1 when %v1 < %v2 (ordered compare), else %v2.
; If either operand is NaN the compare is false and %v2 is returned.
define float @_Z4fminff(float %v1, float %v2) nounwind readnone {
  %1 = fcmp olt float %v1, %v2
  %2 = select i1 %1, float %v1, float %v2
  ret float %2
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; MAX ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer max overloads.  Scalars use compare+select.  8-bit vectors and
; 2-lane 16-bit vectors are widened to i32 lanes (sext for signed, zext for
; unsigned), run through the i32 NEON max and truncated back.  3- and 4-lane
; 16-bit vectors use the <4 x i16> NEON max directly, which yields the same
; lanes without the widen/truncate round trip.

; max(char, char)
define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp sgt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; max(char2, char2)
define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; max(char3, char3): operands are received as i32 and reinterpreted as
; <4 x i8>; lane 3 is dropped before returning.
define <3 x i8> @_Z3maxDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = sext <4 x i8> %1 to <4 x i32>
  %4 = sext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; max(char4, char4)
define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; max(short, short)
define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp sgt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; max(short2, short2)
define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; max(short3, short3): widened to 4 lanes (lane 3 undefined), narrowed after.
define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

; max(short4, short4)
define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; max(int, int)
define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp sgt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; max(int2, int2)
define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; max(int3, int3)
define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; max(int4, int4)
define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; max(long long, long long)
define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp sgt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; max(uchar, uchar)
define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; max(uchar2, uchar2)
define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; max(uchar3, uchar3): operands are received as i32 and reinterpreted as
; <4 x i8>; lane 3 is dropped before returning.
define <3 x i8> @_Z3maxDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; max(uchar4, uchar4)
define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; max(ushort, ushort)
define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ugt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; max(ushort2, ushort2)
define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; max(ushort3, ushort3): widened to 4 lanes (lane 3 undefined), narrowed after.
define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

; max(ushort4, ushort4)
define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; max(uint, uint)
define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ugt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; max(uint2, uint2)
define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; max(uint3, uint3)
define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; max(uint4, uint4)
define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}


; TODO: long vector types

; Float max overloads: each forwards to the fmax overload of the same shape.

; max(float, float)
define float @_Z3maxff(float %v1, float %v2) nounwind readnone {
  %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
  ret float %1
}

; max(float2, float2)
define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %1
}

; max(float2, float)
define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %1
}

; max(float3, float3)
define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %1
}

; max(float3, float)
define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %1
}

; max(float4, float4)
define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %1
}

; max(float4, float)
define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %1
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; MIN ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Integer min overloads; same structure as the MAX overloads, using the
; signed/unsigned NEON min intrinsics and slt/ult compares.

; min(char, char)
define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {
  %1 = icmp slt i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; min(char2, char2)
define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = sext <2 x i8> %v1 to <2 x i32>
  %2 = sext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; min(char3, char3): operands are received as i32 and reinterpreted as
; <4 x i8>; lane 3 is dropped before returning.
define <3 x i8> @_Z3minDv3_cS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = sext <4 x i8> %1 to <4 x i32>
  %4 = sext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; min(char4, char4)
define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = sext <4 x i8> %v1 to <4 x i32>
  %2 = sext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; min(short, short)
define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {
  %1 = icmp slt i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; min(short2, short2)
define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = sext <2 x i16> %v1 to <2 x i32>
  %2 = sext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; min(short3, short3): widened to 4 lanes (lane 3 undefined), narrowed after.
define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

; min(short4, short4)
define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; min(int, int)
define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp slt i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; min(int2, int2)
define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; min(int3, int3)
define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; min(int4, int4)
define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}

; min(long long, long long)
define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {
  %1 = icmp slt i64 %v1, %v2
  %2 = select i1 %1, i64 %v1, i64 %v2
  ret i64 %2
}

; TODO: long vector types

; min(uchar, uchar)
define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {
  %1 = icmp ult i8 %v1, %v2
  %2 = select i1 %1, i8 %v1, i8 %v2
  ret i8 %2
}

; min(uchar2, uchar2)
define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {
  %1 = zext <2 x i8> %v1 to <2 x i32>
  %2 = zext <2 x i8> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i8>
  ret <2 x i8> %4
}

; min(uchar3, uchar3): operands are received as i32 and reinterpreted as
; <4 x i8>; lane 3 is dropped before returning.
define <3 x i8> @_Z3minDv3_hS_(i32 %v1, i32 %v2) nounwind readnone {
  %1 = bitcast i32 %v1 to <4 x i8>
  %2 = bitcast i32 %v2 to <4 x i8>
  %3 = zext <4 x i8> %1 to <4 x i32>
  %4 = zext <4 x i8> %2 to <4 x i32>
  %5 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %3, <4 x i32> %4) nounwind readnone
  %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %7 = trunc <3 x i32> %6 to <3 x i8>
  ret <3 x i8> %7
}

; min(uchar4, uchar4)
define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {
  %1 = zext <4 x i8> %v1 to <4 x i32>
  %2 = zext <4 x i8> %v2 to <4 x i32>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = trunc <4 x i32> %3 to <4 x i8>
  ret <4 x i8> %4
}

; min(ushort, ushort)
define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {
  %1 = icmp ult i16 %v1, %v2
  %2 = select i1 %1, i16 %v1, i16 %v2
  ret i16 %2
}

; min(ushort2, ushort2)
define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {
  %1 = zext <2 x i16> %v1 to <2 x i32>
  %2 = zext <2 x i16> %v2 to <2 x i32>
  %3 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %1, <2 x i32> %2) nounwind readnone
  %4 = trunc <2 x i32> %3 to <2 x i16>
  ret <2 x i16> %4
}

; min(ushort3, ushort3): widened to 4 lanes (lane 3 undefined), narrowed after.
define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {
  %1 = shufflevector <3 x i16> %v1, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i16> %v2, <3 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %1, <4 x i16> %2) nounwind readnone
  %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i16> %4
}

; min(ushort4, ushort4)
define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {
  %1 = tail call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone
  ret <4 x i16> %1
}

; min(uint, uint)
define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {
  %1 = icmp ult i32 %v1, %v2
  %2 = select i1 %1, i32 %v1, i32 %v2
  ret i32 %2
}

; min(uint2, uint2)
define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {
  %1 = tail call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
  ret <2 x i32> %1
}

; min(uint3, uint3)
define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {
  %1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %1, <4 x i32> %2) nounwind readnone
  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
  ret <3 x i32> %4
}

; min(uint4, uint4)
define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {
  %1 = tail call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
  ret <4 x i32> %1
}


; TODO: long vector types

; Float min overloads: each forwards to the fmin overload of the same shape.

; min(float, float)
define float @_Z3minff(float %v1, float %v2) nounwind readnone {
  %1 = tail call float @_Z4fminff(float %v1, float %v2)
  ret float %1
}

; min(float2, float2)
define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
  ret <2 x float> %1
}

; min(float2, float)
define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
  ret <2 x float> %1
}

; min(float3, float3)
define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
  ret <3 x float> %1
}

; min(float3, float)
define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
  ret <3 x float> %1
}

; min(float4, float4)
define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
  ret <4 x float> %1
}

; min(float4, float)
define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {
  %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
  ret <4 x float> %1
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; YUV ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@yuv_U = internal constant <4 x i32> <i32 0, i32 -100, i32 516, i32 0>, align 16
@yuv_V = internal constant <4 x i32> <i32 409, i32 -208, i32 0, i32 0>, align 16
@yuv_0 = internal constant <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
@yuv_255 = internal constant <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>, align 16


define <4 x i8> @_Z18rsYuvToRGBA_uchar4hhh(i8 %pY, i8 %pU, i8 %pV) nounwind readnone alwaysinline {
  %_sy = zext i8 %pY to i32
  %_su = zext i8 %pU to i32
  %_sv = zext i8 %pV to i32

  %_sy2 = add i32 -16, %_sy
  %_sy3 = mul i32 298, %_sy2
  %_su2 = add i32 -128, %_su
  %_sv2 = add i32 -128, %_sv
  %_y = tail call <4 x i32> @smear_4i32(i32 %_sy3) nounwind readnone
  %_u = tail call <4 x i32>
@smear_4i32(i32 %_su2) nounwind readnone 845 %_v = tail call <4 x i32> @smear_4i32(i32 %_sv2) nounwind readnone 846 847 %mu = load <4 x i32>, <4 x i32>* @yuv_U, align 8 848 %mv = load <4 x i32>, <4 x i32>* @yuv_V, align 8 849 %_u2 = mul <4 x i32> %_u, %mu 850 %_v2 = mul <4 x i32> %_v, %mv 851 %_y2 = add <4 x i32> %_y, %_u2 852 %_y3 = add <4 x i32> %_y2, %_v2 853 854 ; %r1 = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %_y3, <4 x i32> <i32 8, i32 8, i32 8, i32 8>) nounwind readnone 855; %r2 = trunc <4 x i16> %r1 to <4 x i8> 856; ret <4 x i8> %r2 857 858 %c0 = load <4 x i32>, <4 x i32>* @yuv_0, align 8 859 %c255 = load <4 x i32>, <4 x i32>* @yuv_255, align 8 860 %r1 = tail call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %_y3, <4 x i32> %c0) nounwind readnone 861 %r2 = tail call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %r1, <4 x i32> %c255) nounwind readnone 862 %r3 = lshr <4 x i32> %r2, <i32 8, i32 8, i32 8, i32 8> 863 %r4 = trunc <4 x i32> %r3 to <4 x i8> 864 ret <4 x i8> %r4 865} 866 867;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 868;;;;;;;;; half_RECIP ;;;;;;;;;; 869;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 870 871define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone { 872 %1 = tail call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %v) nounwind readnone 873 %2 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %1, <2 x float> %v) nounwind readnone 874 %3 = fmul <2 x float> %1, %2 875 %4 = tail call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %3, <2 x float> %v) nounwind readnone 876 %5 = fmul <2 x float> %4, %3 877 ret <2 x float> %5 878} 879 880define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone { 881 %1 = tail call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %v) nounwind readnone 882 %2 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %1, <4 x float> %v) nounwind readnone 883 %3 = 
fmul <4 x float> %1, %2 884 %4 = tail call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %3, <4 x float> %v) nounwind readnone 885 %5 = fmul <4 x float> %4, %3 886 ret <4 x float> %5 887} 888 889define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone { 890 %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 891 %2 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %1) nounwind readnone 892 %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 893 ret <3 x float> %3 894} 895 896 897;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 898;;;;;;;;; half_RSQRT ;;;;;;;;;; 899;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 900 901define float @_Z10half_rsqrtf(float %v) { 902 %1 = insertelement <2 x float> undef, float %v, i32 0 903 %2 = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %1) nounwind readnone 904 %3 = fmul <2 x float> %2, %2 905 %4 = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %1, <2 x float> %3) nounwind readnone 906 %5 = fmul <2 x float> %2, %4 907 %6 = extractelement <2 x float> %5, i32 0 908 ret float %6 909} 910 911define <2 x float> @_Z10half_rsqrtDv2_f(<2 x float> %v) nounwind readnone { 912 %1 = tail call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %v) nounwind readnone 913 %2 = fmul <2 x float> %1, %1 914 %3 = tail call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v, <2 x float> %2) nounwind readnone 915 %4 = fmul <2 x float> %1, %3 916 ret <2 x float> %4 917} 918 919define <3 x float> @_Z10half_rsqrtDv3_f(<3 x float> %v) nounwind readnone { 920 %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 921 %2 = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %1) nounwind readnone 922 %3 = fmul <4 x float> %2, %2 923 %4 = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %1, <4 x float> %3) 
nounwind readnone 924 %5 = fmul <4 x float> %2, %4 925 %6 = shufflevector <4 x float> %5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 926 ret <3 x float> %6 927} 928 929define <4 x float> @_Z10half_rsqrtDv4_f(<4 x float> %v) nounwind readnone { 930 %1 = tail call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %v) nounwind readnone 931 %2 = fmul <4 x float> %1, %1 932 %3 = tail call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v, <4 x float> %2) nounwind readnone 933 %4 = fmul <4 x float> %1, %3 934 ret <4 x float> %4 935} 936 937;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 938;;;;;;;;; matrix ;;;;;;;;;; 939;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 940 941%struct.rs_matrix4x4 = type { [16 x float] } 942%struct.rs_matrix3x3 = type { [9 x float] } 943%struct.rs_matrix2x2 = type { [4 x float] } 944 945define internal <4 x float> @smear_f(float %in) nounwind readnone alwaysinline { 946 %1 = insertelement <4 x float> undef, float %in, i32 0 947 %2 = insertelement <4 x float> %1, float %in, i32 1 948 %3 = insertelement <4 x float> %2, float %in, i32 2 949 %4 = insertelement <4 x float> %3, float %in, i32 3 950 ret <4 x float> %4 951} 952 953 954define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv3_f(%struct.rs_matrix3x3* nocapture %m, <3 x float> %in) nounwind readonly { 955 %x0 = extractelement <3 x float> %in, i32 0 956 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 957 %y0 = extractelement <3 x float> %in, i32 1 958 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 959 %z0 = extractelement <3 x float> %in, i32 2 960 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 961 962 %px = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0 963 %px2 = bitcast float* %px to <4 x float>* 964 %xm = load <4 x float>, <4 x float>* %px2, align 4 965 966 %py = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 
0, i32 0, i32 3 967 %py2 = bitcast float* %py to <4 x float>* 968 ; %ym = call <4 x float> @llvm.aarch64.neon.ld4.v4f32(i8* %py2, i32 4) nounwind 969 %ym = load <4 x float>, <4 x float>* %py2, align 4 970 971 %pz = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 5 972 %pz2 = bitcast float* %pz to <4 x float>* 973; %zm2 = call <4 x float> @llvm.aarch64.neon.ld4.v4f32(i8* %pz2, i32 4) nounwind 974 %zm2 = load <4 x float>, <4 x float>* %pz2, align 4 975 %zm = shufflevector <4 x float> %zm2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 976 977 %a1 = fmul <4 x float> %x, %xm 978 %a2 = fmul <4 x float> %y, %ym 979 %a3 = fadd <4 x float> %a1, %a2 980 %a4 = fmul <4 x float> %z, %zm 981 %a5 = fadd <4 x float> %a4, %a3 982 %a6 = shufflevector <4 x float> %a5, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 983 ret <3 x float> %a6 984} 985 986define <3 x float> @_Z16rsMatrixMultiplyPK12rs_matrix3x3Dv2_f(%struct.rs_matrix3x3* nocapture %m, <2 x float> %in) nounwind readonly { 987 %x0 = extractelement <2 x float> %in, i32 0 988 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 989 %y0 = extractelement <2 x float> %in, i32 1 990 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 991 992 %px = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0 993 %px2 = bitcast float* %px to <4 x float>* 994 %xm = load <4 x float>, <4 x float>* %px2, align 4 995 %py = getelementptr inbounds %struct.rs_matrix3x3, %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3 996 %py2 = bitcast float* %py to <4 x float>* 997 %ym = load <4 x float>, <4 x float>* %py2, align 4 998 999 %a1 = fmul <4 x float> %x, %xm 1000 %a2 = fmul <4 x float> %y, %ym 1001 %a3 = fadd <4 x float> %a1, %a2 1002 %a4 = shufflevector <4 x float> %a3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 1003 ret <3 x float> %a4 1004} 1005 1006define <4 x float> 
@_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv4_f(%struct.rs_matrix4x4* nocapture %m, <4 x float> %in) nounwind readonly { 1007 %x0 = extractelement <4 x float> %in, i32 0 1008 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 1009 %y0 = extractelement <4 x float> %in, i32 1 1010 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 1011 %z0 = extractelement <4 x float> %in, i32 2 1012 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 1013 %w0 = extractelement <4 x float> %in, i32 3 1014 %w = tail call <4 x float> @smear_f(float %w0) nounwind readnone 1015 1016 %px = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 1017 %px2 = bitcast float* %px to <4 x float>* 1018 %xm = load <4 x float>, <4 x float>* %px2, align 4 1019 %py = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 1020 %py2 = bitcast float* %py to <4 x float>* 1021 %ym = load <4 x float>, <4 x float>* %py2, align 4 1022 %pz = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8 1023 %pz2 = bitcast float* %pz to <4 x float>* 1024 %zm = load <4 x float>, <4 x float>* %pz2, align 4 1025 %pw = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 1026 %pw2 = bitcast float* %pw to <4 x float>* 1027 %wm = load <4 x float>, <4 x float>* %pw2, align 4 1028 1029 %a1 = fmul <4 x float> %x, %xm 1030 %a2 = fmul <4 x float> %y, %ym 1031 %a3 = fadd <4 x float> %a1, %a2 1032 %a4 = fmul <4 x float> %z, %zm 1033 %a5 = fadd <4 x float> %a3, %a4 1034 %a6 = fmul <4 x float> %w, %wm 1035 %a7 = fadd <4 x float> %a5, %a6 1036 ret <4 x float> %a7 1037} 1038 1039define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv3_f(%struct.rs_matrix4x4* nocapture %m, <3 x float> %in) nounwind readonly { 1040 %x0 = extractelement <3 x float> %in, i32 0 1041 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 1042 %y0 = extractelement 
<3 x float> %in, i32 1 1043 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 1044 %z0 = extractelement <3 x float> %in, i32 2 1045 %z = tail call <4 x float> @smear_f(float %z0) nounwind readnone 1046 1047 %px = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 1048 %px2 = bitcast float* %px to <4 x float>* 1049 %xm = load <4 x float>, <4 x float>* %px2, align 4 1050 %py = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 1051 %py2 = bitcast float* %py to <4 x float>* 1052 %ym = load <4 x float>, <4 x float>* %py2, align 4 1053 %pz = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8 1054 %pz2 = bitcast float* %pz to <4 x float>* 1055 %zm = load <4 x float>, <4 x float>* %pz2, align 4 1056 %pw = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 1057 %pw2 = bitcast float* %pw to <4 x float>* 1058 %wm = load <4 x float>, <4 x float>* %pw2, align 4 1059 1060 %a1 = fmul <4 x float> %x, %xm 1061 %a2 = fadd <4 x float> %wm, %a1 1062 %a3 = fmul <4 x float> %y, %ym 1063 %a4 = fadd <4 x float> %a2, %a3 1064 %a5 = fmul <4 x float> %z, %zm 1065 %a6 = fadd <4 x float> %a4, %a5 1066 ret <4 x float> %a6 1067} 1068 1069define <4 x float> @_Z16rsMatrixMultiplyPK12rs_matrix4x4Dv2_f(%struct.rs_matrix4x4* nocapture %m, <2 x float> %in) nounwind readonly { 1070 %x0 = extractelement <2 x float> %in, i32 0 1071 %x = tail call <4 x float> @smear_f(float %x0) nounwind readnone 1072 %y0 = extractelement <2 x float> %in, i32 1 1073 %y = tail call <4 x float> @smear_f(float %y0) nounwind readnone 1074 1075 %px = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0 1076 %px2 = bitcast float* %px to <4 x float>* 1077 %xm = load <4 x float>, <4 x float>* %px2, align 4 1078 %py = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4 1079 %py2 = 
bitcast float* %py to <4 x float>* 1080 %ym = load <4 x float>, <4 x float>* %py2, align 4 1081 %pw = getelementptr inbounds %struct.rs_matrix4x4, %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12 1082 %pw2 = bitcast float* %pw to <4 x float>* 1083 %wm = load <4 x float>, <4 x float>* %pw2, align 4 1084 1085 %a1 = fmul <4 x float> %x, %xm 1086 %a2 = fadd <4 x float> %wm, %a1 1087 %a3 = fmul <4 x float> %y, %ym 1088 %a4 = fadd <4 x float> %a2, %a3 1089 ret <4 x float> %a4 1090} 1091 1092 1093 1094;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1095;;;;;;;;; pixel ops ;;;;;;;;;; 1096;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1097 1098 1099@fc_255.0 = internal constant <4 x float> <float 255.0, float 255.0, float 255.0, float 255.0>, align 16 1100@fc_0.5 = internal constant <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, align 16 1101@fc_0 = internal constant <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, align 16 1102 1103declare <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone 1104declare <4 x float> @_Z14convert_float4Dv4_h(<4 x i8> %in) nounwind readnone 1105 1106; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float4 color) 1107define <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %color) nounwind readnone { 1108 %f255 = load <4 x float>, <4 x float>* @fc_255.0, align 16 1109 %f05 = load <4 x float>, <4 x float>* @fc_0.5, align 16 1110 %f0 = load <4 x float>, <4 x float>* @fc_0, align 16 1111 %v1 = fmul <4 x float> %f255, %color 1112 %v2 = fadd <4 x float> %f05, %v1 1113 %v3 = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %v2, <4 x float> %f0, <4 x float> %f255) nounwind readnone 1114 %v4 = tail call <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %v3) nounwind readnone 1115 ret <4 x i8> %v4 1116} 1117 1118; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float3 color) 1119define <4 x i8> @_Z17rsPackColorTo8888Dv3_f(<4 x i32> %color) nounwind readnone { 1120 %1 = bitcast <4 x 
i32> %color to <4 x float> 1121 %2 = insertelement <4 x float> %1, float 1.0, i32 3 1122 %3 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %2) nounwind readnone 1123 ret <4 x i8> %3 1124} 1125 1126; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b) 1127define <4 x i8> @_Z17rsPackColorTo8888fff(float %r, float %g, float %b) nounwind readnone { 1128 %1 = insertelement <4 x float> undef, float %r, i32 0 1129 %2 = insertelement <4 x float> %1, float %g, i32 1 1130 %3 = insertelement <4 x float> %2, float %b, i32 2 1131 %4 = insertelement <4 x float> %3, float 1.0, i32 3 1132 %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone 1133 ret <4 x i8> %5 1134} 1135 1136; uchar4 __attribute__((overloadable)) rsPackColorTo8888(float r, float g, float b, float a) 1137define <4 x i8> @_Z17rsPackColorTo8888ffff(float %r, float %g, float %b, float %a) nounwind readnone { 1138 %1 = insertelement <4 x float> undef, float %r, i32 0 1139 %2 = insertelement <4 x float> %1, float %g, i32 1 1140 %3 = insertelement <4 x float> %2, float %b, i32 2 1141 %4 = insertelement <4 x float> %3, float %a, i32 3 1142 %5 = tail call <4 x i8> @_Z17rsPackColorTo8888Dv4_f(<4 x float> %4) nounwind readnone 1143 ret <4 x i8> %5 1144} 1145 1146