Le 23/05/2014 13:47, Ben Bacarisse a écrit :
bool table[256] = {0};
bool *tab = table - CHAR_MIN;
do tab[*set] = true; while (*set++);
while (!tab[*str]) ++str;
return str;
gcc compiles this to 20/21 instructions (depending on optimising for
speed or size). That's fewer than your hand-coded assembler version,
though it may, of course, be either more bytes, or slower, or both.
No, at least not on my Mac.
/tmp $ cat strfind.c
#include <limits.h>
#include <stdbool.h>
/*
 * strfind: scan str for the first character that also occurs in set.
 *
 * str  NUL-terminated string to scan.
 * set  NUL-terminated string of characters to look for.
 *
 * Returns a pointer to the first character of str that occurs in set, or
 * to str's terminating NUL when there is none.
 */
char *strfind(char *str,char *set)
{
/* One membership flag per char value; the size assumes 8-bit chars. */
bool table[256] = {0};
/* Bias the base by -CHAR_MIN so tab[] can be indexed directly with a plain
   char, which may be negative where char is signed. */
bool *tab = table - CHAR_MIN;
/* do/while stores before it tests, so set's terminating NUL is marked too. */
do tab[*set] = true; while (*set++);
/* Stop at the first marked character; the marked NUL bounds the scan. */
while (!tab[*str]) ++str;
return str;
}
/tmp $ gcc -Os -c -std=c99 strfind.c
/tmp $ objdump strfind.o
strfind.o:
code: 194 bytes <<<<<<<<<<<<<<<<-------<<<<<<<<<<<<<<<<<
debug: 64 bytes
/tmp $ gcc -Os -c -S -std=c99 strfind.c
/tmp $ cat strfind.s
## ---------------------------------------------------------------------
## _strfind -- compiler output for: char *strfind(char *str, char *set)
## Syntax: AT&T.  Arguments arrive in the SysV AMD64 registers.
## In:     %rdi = str, %rsi = set
## Out:    %rax = address of the first byte of str whose table entry is
##         non-zero.  The first loop also marks set's terminating NUL,
##         so the scan stops at str's terminator at the latest.
## Frame:  272 bytes below %rbp: the 256-byte table at
##         -272(%rbp)..-17(%rbp) and a copy of the stack guard at
##         -8(%rbp).
## Clobb:  %rax, %rcx, %rdx, %rsi, %rdi, %xmm0, flags
## ---------------------------------------------------------------------
        .section __TEXT,__text,regular,pure_instructions
        .globl _strfind
_strfind: ## @strfind
        .cfi_startproc
## BB#0:
        pushq %rbp
Ltmp2:
        .cfi_def_cfa_offset 16
Ltmp3:
        .cfi_offset %rbp, -16
        movq %rsp, %rbp
Ltmp4:
        .cfi_def_cfa_register %rbp
        subq $272, %rsp ## imm = 0x110
        ## %rax = address of the stack guard; it stays live until the
        ## check in BB#4.  The guard value is copied into the frame.
        movq ___stack_chk_guard@GOTPCREL(%rip), %rax
        movq (%rax), %rcx
        movq %rcx, -8(%rbp)
        ## Zero the table: 16 aligned 16-byte stores = 256 bytes.
        xorps %xmm0, %xmm0
        movaps %xmm0, -32(%rbp)
        movaps %xmm0, -48(%rbp)
        movaps %xmm0, -64(%rbp)
        movaps %xmm0, -80(%rbp)
        movaps %xmm0, -96(%rbp)
        movaps %xmm0, -112(%rbp)
        movaps %xmm0, -128(%rbp)
        movaps %xmm0, -144(%rbp)
        movaps %xmm0, -160(%rbp)
        movaps %xmm0, -176(%rbp)
        movaps %xmm0, -192(%rbp)
        movaps %xmm0, -208(%rbp)
        movaps %xmm0, -224(%rbp)
        movaps %xmm0, -240(%rbp)
        movaps %xmm0, -256(%rbp)
        movaps %xmm0, -272(%rbp)
        leaq -272(%rbp), %rcx           ## %rcx = &table[0]
        ## Loop 1: do tab[*set] = true; while (*set++);
LBB0_1: ## =>This Inner Loop Header: Depth=1
        movsbq (%rsi), %rdx             ## %rdx = (signed char)*set
        movb $1, 128(%rdx,%rcx)         ## table[128 + c] = 1; the +128 bias makes a negative c index inside the table
        cmpb $0, (%rsi)                 ## was that byte the terminator?
        leaq 1(%rsi), %rsi              ## set++ via lea, so the flags from cmpb survive
        jne LBB0_1
## BB#2: ## %.preheader.preheader
        decq %rdi                       ## bias str by -1: the loop reads 1(%rdi), then increments
        ## Loop 2: while (!tab[*str]) ++str;
LBB0_3: ## %.preheader
        ## =>This Inner Loop Header: Depth=1
        movsbq 1(%rdi), %rdx            ## %rdx = (signed char) of the next byte of str
        incq %rdi                       ## %rdi now points at the byte just loaded
        cmpb $0, 128(%rdx,%rcx)         ## is its table entry still zero?
        je LBB0_3
## BB#4:
        ## Compare the current guard value with the copy saved on entry.
        movq (%rax), %rax
        cmpq -8(%rbp), %rax
        jne LBB0_6
## BB#5:
        movq %rdi, %rax                 ## return value = final str pointer
        addq $272, %rsp ## imm = 0x110
        popq %rbp
        ret
LBB0_6:
        callq ___stack_chk_fail         ## reached only when the guard comparison fails
        .cfi_endproc
.subsections_via_symbols
/tmp $ gcc -v
Configured with: --prefix=/Applications/Xcode.app/Contents/Developer/usr
--with-gxx-include-dir=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/usr/include/c++/4.2.1
Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn)
Target: x86_64-apple-darwin13.2.0
Thread model: posix
Analysis:
Instead of using the stosq instruction, gcc repeats 16 times the store
of 16 bytes using xmm0 (16 x 16 = 256 bytes). In my opinion this is a misguided optimisation.