/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * STRLEN (s) -- length of the NUL-terminated byte string at s.
 *
 * Syntax/target: GNU as, AT&T operand order, 32-bit x86 with SSE2; the file
 * is meant to be run through the C preprocessor first.
 *
 * In:     4(%esp) = s
 * Out:    %eax    = number of bytes before the first NUL
 * Uses:   %ecx, %edx, %xmm0-%xmm5, flags; %edi is pushed and popped around
 *         its only use, so it is preserved.
 *
 * Every macro below is defined only if the includer has not already
 * supplied it, so the symbol name and the entry/CFI boilerplate can be
 * overridden from outside.
 */

#ifndef STRLEN
# define STRLEN strlen_generic
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Unwind bookkeeping for a 4-byte push/pop of REG.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Compare the 16 bytes at MEM with XMM (which must be all-zero on entry),
   leave the per-byte match mask in %ecx and branch to FOUND when any byte
   matched.  When nothing matched XMM is all-zero again afterwards.  */
#define PROBE16(MEM, XMM, FOUND)	\
	pcmpeqb	MEM, XMM;		\
	pmovmskb XMM, %ecx;		\
	testl	%ecx, %ecx;		\
	jnz	FOUND

/* Like PROBE16, but adds STEP to %eax right after the compare is issued
   (MEM is therefore expressed relative to the old %eax).  */
#define PROBE16_ADVANCE(MEM, XMM, STEP, FOUND)	\
	pcmpeqb	MEM, XMM;		\
	addl	STEP, %eax;		\
	pmovmskb XMM, %ecx;		\
	testl	%ecx, %ecx;		\
	jnz	FOUND

/* Return (%eax - s) + index of lowest set bit in %ecx + BIAS.
   Expects %edx = s and %ecx != 0.  */
#define RET_LENGTH_PLUS(BIAS)		\
	subl	%edx, %eax;		\
	bsfl	%ecx, %ecx;		\
	addl	%ecx, %eax;		\
	addl	BIAS, %eax;		\
	ret

	.section .text.sse2,"ax",@progbits
ENTRY (STRLEN)
	movl	4(%esp), %edx			/* edx = s, kept for the whole function */
	movl	%edx, %ecx
	andl	$0x3f, %ecx			/* ecx = offset of s inside its 64-byte block */
	pxor	%xmm0, %xmm0
	cmpl	$0x30, %ecx
	ja	L(head_near_block_end)

	/* Offset <= 0x30: the 16 bytes starting at s end no later than the
	   end of the 64-byte block, so they are loaded directly.  */
	movdqu	(%edx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jnz	L(found_in_head)
	movl	%edx, %eax
	andl	$-16, %eax			/* eax = s rounded down to 16 */
	jmp	L(scan_aligned)

L(head_near_block_end):
	/* Offset > 0x30: read the aligned 16-byte chunk that contains s and
	   discard the match bits belonging to bytes in front of s.  */
	movl	%edx, %eax
	andl	$-16, %eax			/* eax = s rounded down to 16 */
	PUSH	(%edi)
	pcmpeqb	(%eax), %xmm0
	movl	$-1, %edi
	subl	%eax, %ecx			/* only the low 5 bits reach shl: they equal s & 15 */
	shll	%cl, %edi			/* edi = mask with the bits below s cleared */
	pmovmskb %xmm0, %ecx
	andl	%edi, %ecx			/* sets ZF; popl below leaves flags alone */
	POP	(%edi)
	jnz	L(found_in_masked_head)
	pxor	%xmm0, %xmm0			/* may hold matches from bytes before s */

L(scan_aligned):
	/* Here eax is 16-byte aligned and bytes up to eax+15 are known to be
	   non-NUL.  Four unrolled rounds probe eax+16 .. eax+79 each, moving
	   eax forward by 64 between rounds.  */
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	PROBE16 (16(%eax), %xmm0, L(found_plus16))
	PROBE16 (32(%eax), %xmm1, L(found_plus32))
	PROBE16 (48(%eax), %xmm2, L(found_plus48))
	PROBE16 (64(%eax), %xmm3, L(found_plus64))

	PROBE16_ADVANCE (80(%eax), %xmm0, $64, L(found_plus16))
	PROBE16 (32(%eax), %xmm1, L(found_plus32))
	PROBE16 (48(%eax), %xmm2, L(found_plus48))
	PROBE16 (64(%eax), %xmm3, L(found_plus64))

	PROBE16_ADVANCE (80(%eax), %xmm0, $64, L(found_plus16))
	PROBE16 (32(%eax), %xmm1, L(found_plus32))
	PROBE16 (48(%eax), %xmm2, L(found_plus48))
	PROBE16 (64(%eax), %xmm3, L(found_plus64))

	PROBE16_ADVANCE (80(%eax), %xmm0, $64, L(found_plus16))
	PROBE16 (32(%eax), %xmm1, L(found_plus32))
	PROBE16 (48(%eax), %xmm2, L(found_plus48))
	PROBE16 (64(%eax), %xmm3, L(found_plus64))

	/* Step 16 bytes at a time until eax is a multiple of 64, then enter
	   the 64-byte loop.  From here on eax points AT the chunk that was
	   just probed, hence the bias-free exit stub.  */
	testl	$0x3f, %eax
	jz	L(loop64)

	PROBE16_ADVANCE (80(%eax), %xmm0, $80, L(found_at_eax))

	testl	$0x3f, %eax
	jz	L(loop64)

	PROBE16_ADVANCE (16(%eax), %xmm1, $16, L(found_at_eax))

	testl	$0x3f, %eax
	jz	L(loop64)

	PROBE16_ADVANCE (16(%eax), %xmm2, $16, L(found_at_eax))

	/* NOTE(review): eax is 16-byte aligned and has moved 80+16+16 bytes
	   since the first test failed, so this test looks like it always
	   branches and the probe below is never reached -- confirm before
	   removing anything.  */
	testl	$0x3f, %eax
	jz	L(loop64)

	PROBE16_ADVANCE (16(%eax), %xmm3, $16, L(found_at_eax))

	addl	$16, %eax
	.p2align 4
L(loop64):
	/* Byte-wise unsigned minimum of four aligned 16-byte chunks: the
	   result holds a zero byte iff one of the 64 bytes is NUL.  xmm0 is
	   all-zero here because no earlier compare into it matched.  */
	movaps	(%eax), %xmm4
	pminub	16(%eax), %xmm4
	movaps	32(%eax), %xmm5
	pminub	48(%eax), %xmm5
	addl	$64, %eax
	pminub	%xmm4, %xmm5
	pcmpeqb	%xmm0, %xmm5
	pmovmskb %xmm5, %ecx
	testl	%ecx, %ecx
	jz	L(loop64)

	/* A NUL lies in the 64 bytes just consumed.  Rebase eax to 16 bytes
	   in front of that block so the chunks sit at +16/+32/+48/+64 and
	   the shared exit stubs can be reused.  */
	pcmpeqb	-64(%eax), %xmm0
	subl	$80, %eax
	pmovmskb %xmm0, %ecx
	testl	%ecx, %ecx
	jnz	L(found_plus16)

	PROBE16 (32(%eax), %xmm1, L(found_plus32))
	PROBE16 (48(%eax), %xmm2, L(found_plus48))

	/* The first three chunks were clean, so the NUL is in the last one;
	   no test is needed before computing the result.  */
	pcmpeqb	64(%eax), %xmm3
	pmovmskb %xmm3, %ecx
	RET_LENGTH_PLUS ($64)

	/* Exit stubs.  All expect edx = s and ecx = non-zero match mask.  */

	.p2align 4
L(found_at_eax):
	/* Match mask belongs to the chunk at eax itself.  */
	subl	%edx, %eax
	bsfl	%ecx, %ecx
	addl	%ecx, %eax
	ret

L(found_in_head):
	/* Match mask belongs to the unaligned load taken at s.  */
	bsfl	%ecx, %eax
	ret

	.p2align 4
L(found_in_masked_head):
	/* Match mask belongs to the aligned chunk at eax <= s.  */
	subl	%edx, %eax
	bsfl	%ecx, %ecx
	addl	%ecx, %eax
	ret

	.p2align 4
L(found_plus16):
	RET_LENGTH_PLUS ($16)

	.p2align 4
L(found_plus32):
	RET_LENGTH_PLUS ($32)

	.p2align 4
L(found_plus48):
	RET_LENGTH_PLUS ($48)

	.p2align 4
L(found_plus64):
	RET_LENGTH_PLUS ($64)

END (STRLEN)