[Trolling] assembly vs C language

RoSsIaCrIiLoIA · Feb 8, 2005

d_0=32.000000
d_1=38.000000
Test success
d_0=34.000000
d_1=42.000000
Test success
d_0=1.000000
d_1=0.000000
Test success

d_0=32.000000
d_1=38.000000
Test success
d_0=7.000000
d_1=10.000000
Test success
d_0=0.000000
d_1=0.000000
Test success

C:a>ab
d_0=27.000000
d_1=33.000000
Test success
d_0=17.000000
d_1=21.000000
Test success
d_0=0.000000
d_1=0.000000
Test success
(33-27=6) 6/33=x/100 => x=+18%
(21-17=4) 4/21=x/100 => x=+19%
Some people say that only a good assembly programmer can surpass a C
compiler. I'm a novice in the programming world, so:
Why is my C version of strlcpy slower than the assembly version?
Why is the C version of strlcpy written by a senior C programmer slower than
the assembly version written by a novice like me? (It doesn't seem right.)

--------------------
; nasmw -f obj this_file.asm
section _DATA public align=4 class=DATA use32
section _TEXT public align=1 class=CODE use32
global _asm_strlcpy_m
extern _strlen
;size_t
;strlcpy_m(char* dst, const char* src, size_t sz)
;{size_t z=sz;
; char a;
;----------------------------*/
; if( sz )
; {if( src )
; {--z;
; l1: --sz;
; if(sz==0) goto l2;
; a=*src; ++src;
; *dst=a; ++dst;
; if(a) goto l1;
; return z-sz;
; l2: *dst=0;
; l0: z+=strlen(src);
; }
; else if(dst) *dst=0;
; }
; else if(src) goto l0;
; return z-sz;
;} /* strlcpy */

; size_t asm_strlcpy_m(char *dst, const char *src, size_t sz)
; Hand-written counterpart of the C routine strlcpy_m (see the C listing in
; the comment block above): copies src into dst, storing at most sz bytes
; including the terminating 0, and returns z - sz in eax.
;
; Register use:
;   ebx = sz  (bytes that may still be stored)
;   esi = src cursor
;   edi = dst cursor
;   al  = byte being copied
; The stack slot of the sz argument is reused as the C variable z.
; ebx, esi and edi are saved on entry and restored before ret.
; NOTE(review): _strlen is an external routine; which other registers it
; clobbers is not visible here.
;
; All labels below are NASM local labels (single leading dot), matching the
; jump targets used by the code.
_asm_strlcpy_m:
push ebx
push esi
push edi
; three saved registers (12 bytes) + return address (4 bytes)
; => the first argument lives at esp+16
%define @dst [esp+16]
%define @src [esp+20]
%define @sz [esp+24]
mov ebx, @sz ; ebx = sz; the @sz slot itself now plays the role of z
mov edi, @dst
mov esi, @src
cmp ebx, 0
je .l5 ; sz == 0: nothing may be stored
cmp esi, 0
je .l3 ; src == NULL
dec dword @sz ; z = sz - 1
.l0:
dec ebx ; --sz
jz .l1 ; no room left: terminate dst, then measure the rest of src
mov al, [esi]
inc esi
mov [edi], al
inc edi
cmp al, 0
jne .l0 ; loop until the terminating 0 has been stored
.le:
mov eax, @sz ; return z - sz
sub eax, ebx
pop edi
pop esi
pop ebx
ret
.l1:
mov byte[edi], 0 ; truncated copy: force termination of dst
.l2:
push esi ; strlen(remaining part of src)
call _strlen
add esp, 4 ; drop the argument, so the @-offsets are valid again
add @sz, eax ; z += strlen(src)
jmp short .le
.l3:
cmp edi, 0
je .le ; dst == NULL too: nothing to do
mov byte[edi], 0 ; src == NULL: make dst the empty string
jmp short .le
.l5:
cmp esi, 0
jne .l2 ; sz == 0 but src given: still add strlen(src) to z
jmp short .le
%undef @dst
%undef @src
%undef @sz
; end file g.asm
--------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <assert.h>

/* Yes, I know strlcat and strlcpy are in the implementors
namespace. Change them if you must.
See <http://cbfalconer.home.att.net/download/strlcpy.zip>
for documentation and rationale.

The objective is to detect whether a given string terminates
another string. This is a fairly tortuous way of doing
"endswith", but is fairly clear.

By C.B.Falconer. Released to public domain
*/

/* ---------------------- */
/* Forward declarations for the routines defined later in this file. */

/* Declared static here: both have internal linkage. */
static int endswith_m(const char* , const char* );
static size_t revstring_m(char* string);
/* Test-string generator used by test1/test3. */
void rand_phrase(char* a, size_t le);
/* C implementation of the bounded copy. */
size_t strlcpy_m(char* dst, const char* src, size_t sz);
/* Not defined in this C file; presumably resolved by the linker to the
   _asm_strlcpy_m symbol exported by the NASM listing - TODO confirm for
   the toolchain in use. */
size_t asm_strlcpy_m(char* dst, const char* src, size_t sz);

/* Bounded concatenation built on strlcpy_m. */
size_t strlcat_m(char* dst, const char* src, size_t sz);

/* Compares and times two copy routines with the strlcpy signature. */
int test3(unsigned cicli,unsigned len_s0,unsigned len_s1,
size_t (*f0)(char*, const char*, size_t),
size_t (*f1)(char*, const char*, size_t) );

/* Compares and times two in-place routines taking a single string. */
int test1(unsigned cicli, unsigned len_s0,
size_t (*f0)(char*), size_t (*f1)(char*) );

/* Reverse a NUL-terminated string in place.
 * Returns the length of the string (which the reversal does not change). */
static size_t revstring(char *string)
{
    size_t len = strlen(string);
    size_t lo, hi;

    /* Empty and one-character strings are already their own reverse. */
    if (len < 2)
        return len;

    /* Walk two indices towards each other, exchanging as they go. */
    for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
        char held = string[lo];

        string[lo] = string[hi];
        string[hi] = held;
    }
    return len;
} /* revstring */

/* ---------------------- */

/* Bounded string copy.
 *
 * Copies src into dst, storing at most sz bytes including the terminating
 * '\0' (nothing is stored when sz is 0).
 * Returns the full length of src, so a result >= sz signals truncation.
 * A NULL src yields 0; in that case dst is set to the empty string when
 * sz is non-zero.
 */
static size_t strlcpy(char *dst, const char *src, size_t sz)
{
    const char *cursor;

    if (src == NULL) {
        if (sz != 0)
            *dst = '\0';
        return 0;
    }

    cursor = src;
    if (sz != 0) {
        size_t room = sz - 1;      /* characters allowed before the '\0' */

        for (;;) {
            *dst = *cursor;
            if (*cursor == '\0')
                break;             /* whole string fitted */
            if (room == 0) {
                *dst = '\0';       /* out of room: replace with terminator */
                break;
            }
            --room;
            ++cursor;
            ++dst;
        }
    }

    /* Measure whatever part of src was not consumed above. */
    while (*cursor != '\0')
        ++cursor;
    return (size_t)(cursor - src);
} /* strlcpy */

/* ---------------------- */

/* Bounded concatenation: appends src to the string already in dst.
 *
 * sz is the capacity of dst; the routine assumes dst holds a well formed
 * string no longer than sz.  The space left for the copy is sz minus the
 * current length of dst (never below 0).
 * Returns the current length of dst plus the value returned by strlcpy.
 */
static size_t strlcat(char *dst, const char *src, size_t sz)
{
    size_t used = 0;

    /* Skip the existing contents, shrinking the budget as we go. */
    while (dst[used] != '\0') {
        if (sz != 0)
            --sz;
        ++used;
    }
    return used + strlcpy(dst + used, src, sz);
} /* strlcat */

/* ---------------------- */

/* Does searchme end with the phrase phrase?
 *
 * Works by reversing both strings in place, comparing the first
 * strlen(phrase) characters, and reversing both back again, so both
 * arguments must be writable.  (Illustrates the power of reversing things,
 * if not the efficacy.)
 *
 * Returns 1 when searchme ends with phrase, 0 otherwise.
 */
static int endswith(char *phrase, char *searchme)
{
    size_t lgh, i;
    int result = 1;

    lgh = revstring(phrase);
    revstring(searchme);

    /* Compare character by character, not the two pointers.  A shorter
       searchme stops the loop at its '\0', which never matches a phrase
       character, so nothing past either terminator is read. */
    for (i = 0; i < lgh; i++) {
        if (phrase[i] != searchme[i]) {
            result = 0;
            break;
        }
    }

    /* Restore both strings to their original order. */
    revstring_m(phrase);
    revstring_m(searchme);
    return result;
} /* endswith */

/*End of By C.B.Falconer. */

/* Single-letter shorthands used by the code below: each one expands to a
   C keyword, except P which expands to printf.  Any identifier spelled
   G, U, R, W, F, P or B after this point is replaced by the preprocessor. */
#define G goto
#define U unsigned
#define R return
#define W while
#define F for
#define P printf
#define B break

/* Benchmark driver: seeds the random generator from the clock, then runs
 * the assembly copy against each C copy routine and finally the two
 * string reversers against each other.  The results of the individual
 * tests are printed by the tests themselves and are not examined here.
 */
int main(void)
{
    const unsigned copy_rounds = 50000000;
    const unsigned copy_buffer = 200;
    const unsigned rev_rounds = 300000;
    const unsigned rev_buffer = 100;

    srand((unsigned)time(NULL));

    (void)test3(copy_rounds, copy_buffer, copy_buffer, asm_strlcpy_m, strlcpy);
    (void)test3(copy_rounds, copy_buffer, copy_buffer, asm_strlcpy_m, strlcpy_m);
    (void)test1(rev_rounds, rev_buffer, revstring, revstring_m);
    return 0;
} /* main */

/* Maps any unsigned value onto one character of a fixed alphabet.
 *
 * The alphabet holds 25 lower-case letters, 25 upper-case letters (the
 * tables contain no 'x' / 'X') and the 10 digits: 25 + 25 + 10 = 60
 * symbols, indices 0..59.  j is reduced modulo the alphabet size, so every
 * input is valid.
 */
char lettere(unsigned j)
{
    /* const array: string literals must not be reachable through a
       writable pointer */
    static const char alphabet[] =
        "abcdefghijklmnopqrstuvwyz"
        "ABCDEFGHIJKLMNOPQRSTUVWYZ"
        "0123456789";

    /* sizeof counts the terminating '\0'; derive the modulus from the
       table so the two cannot drift apart */
    return alphabet[j % (sizeof alphabet - 1)];
}

/* Fills a with le characters drawn through lettere(rand()) and terminates
 * the result with '\0'.
 *
 * a  - destination buffer; must not be NULL and must have room for
 *      le + 1 bytes.
 * le - number of characters to generate (0 produces the empty string).
 */
void rand_phrase(char* a, size_t le)
{
    size_t i;

    assert(a != 0);
    /* Store into the elements a[i]; the pointer itself is never changed. */
    for (i = 0; i < le; ++i)
        a[i] = lettere((unsigned)rand());
    a[i] = '\0';
}

/* Does searchme end with the phrase phrase?
 * By RoSsIaCrIiLoIA
 *
 * Returns 1 when the last strlen(phrase) characters of searchme are equal
 * to phrase, 0 otherwise.  An empty phrase ends every string; a phrase
 * longer than searchme never matches.  Neither string is modified.
 * Both arguments must be non-NULL (checked with assert).
 *
 * Defined static to agree with the static prototype earlier in the file.
 */
static int endswith_m(const char* phrase, const char* searchme)
{
    size_t phrase_len, search_len;

    assert(phrase != 0 && searchme != 0);
    phrase_len = strlen(phrase);
    search_len = strlen(searchme);

    /* The tail of searchme can only match if it is long enough. */
    if (phrase_len > search_len)
        return 0;
    return memcmp(searchme + (search_len - phrase_len),
                  phrase, phrase_len) == 0;
}

/* Reverse a NUL-terminated string in place.  Returns its length.
 *
 * Strings shorter than two characters are left untouched.  Defined static
 * to agree with the static prototype earlier in the file.
 */
static size_t revstring_m(char *string)
{
    size_t lgh = strlen(string);

    if (lgh > 1) {
        char *first = string;
        char *last = string + lgh - 1;   /* last character before '\0' */

        /* Exchange the outermost pair, then move both ends inwards. */
        do {
            char temp = *first;

            *first = *last;
            *last = temp;
            ++first;
            --last;
        } while (last > first);
    }
    return lgh;
} /* revstring_m */

/* Bounded string copy.
 *
 * Copies src into dst, storing at most sz bytes including the terminating
 * '\0'; nothing is stored when sz is 0.
 *
 * Returns strlen(src), so a result >= sz means the copy was truncated.
 * When src is NULL the result is 0 and, if sz is non-zero and dst is not
 * NULL, dst becomes the empty string.
 */
size_t
strlcpy_m(char* dst, const char* src, size_t sz)
{
    size_t copied;

    if (src == NULL) {
        if (sz != 0 && dst != NULL)
            *dst = '\0';
        return 0;
    }
    if (sz == 0)
        return strlen(src);     /* no room at all: only report the length */

    /* Copy while at least one byte stays in reserve for the terminator. */
    for (copied = 0; copied + 1 < sz; ++copied) {
        char a = src[copied];

        dst[copied] = a;
        if (a == '\0')
            return copied;      /* everything fitted */
    }

    /* Out of room: terminate dst and count what is left of src. */
    dst[copied] = '\0';
    return copied + strlen(src + copied);
} /* strlcpy_m */

/* Bounded concatenation built on strlcpy_m.
 *
 * When the string in dst plus its terminator fits inside sz, src is
 * appended with strlcpy_m using the remaining space, and the result is
 * the old length of dst plus the value strlcpy_m returns.
 *
 * Otherwise dst is cut to sz - 1 characters (when sz is non-zero) and the
 * result is the old length of dst plus strlen(src), with a NULL src
 * counting as length 0.
 */
size_t
strlcat_m(char* dst, const char* src, size_t sz)
{
    const size_t used = strlen(dst);

    if (used + 1 > sz) {
        /* dst already fills (or overfills) the stated capacity. */
        if (sz != 0)
            dst[sz - 1] = '\0';
        return src != NULL ? used + strlen(src) : used;
    }
    return used + strlcpy_m(dst + used, src, sz - used);
} /* strlcat_m */

/* Times cicli calls of the bounded-copy routine f (at least one call is
 * always made).
 *
 * Before every call the first character of src is incremented, presumably
 * so that successive calls do not see identical input.  The comparison
 * against 1234567 only consumes the result of the call.
 * Returns the elapsed wall-clock time in seconds, measured with time().
 */
static double test3_time_copy(size_t (*f)(char*, const char*, size_t),
                              char *dst, char *src, size_t limit,
                              unsigned cicli)
{
    size_t i = 0;
    time_t t0, t1;

    t0 = time(0);
    do {
        ++*src;
        if (f(dst, src, limit) == 1234567)
            ++i;
    } while (++i < cicli);
    t1 = time(0);
    return difftime(t1, t0);
}

/* Cross-checks and then times two routines with the strlcpy signature.
 *
 * cicli  - number of timed calls per routine; cicli/500 (at least one)
 *          randomised agreement checks are run first.
 * len_s0 - size in bytes of the destination buffers (1..100000).
 * len_s1 - size in bytes of the source buffer (2..100000; a size of 1
 *          can only hold the empty string, which the timing phase
 *          rejects).
 * f0, f1 - the routines under test.
 *
 * Agreement phase: both routines copy the same random source into equal
 * destinations with the same limit; return values and resulting strings
 * must match, otherwise a "test failure" line is printed.
 * Timing phase: prints "d_0=" and "d_1=" (seconds for f0 and f1) followed
 * by "Test success".
 *
 * Returns 1 on success, 0 on invalid arguments, allocation failure or
 * disagreement between the routines.
 */
int test3(unsigned cicli, unsigned len_s0, unsigned len_s1,
          size_t (*f0)(char*, const char*, size_t),
          size_t (*f1)(char*, const char*, size_t))
{
    char *s0 = NULL, *s1 = NULL, *s2 = NULL, *s3 = NULL;
    size_t i, k0, k1, h0, h1, h3, limit;
    int ok = 0;

    if (cicli == 0 || len_s0 == 0 || len_s1 == 0 || f0 == 0 || f1 == 0)
        return 0;
    if (len_s0 > 100000 || len_s1 > 100000)
        return 0;
    if (len_s1 < 2)     /* the timing phase needs a non-empty source */
        return 0;

    if ((s0 = malloc(len_s0)) == NULL || (s1 = malloc(len_s1)) == NULL ||
        (s2 = malloc(len_s0)) == NULL || (s3 = malloc(len_s0)) == NULL)
        goto done;

    /* --- agreement phase --- */
    i = 0;
    do {
        h0 = (size_t)rand() % len_s0;
        h1 = (size_t)rand() % len_s1;
        /* copy limit in h0+1 .. len_s0, so it never exceeds the buffers */
        h3 = h0 + 1 + (size_t)rand() % (len_s0 - h0);
        rand_phrase(s0, h0);
        rand_phrase(s1, h1);

        strcpy(s3, s0);     /* s3 keeps the original destination contents */
        strcpy(s2, s0);
        k0 = f0(s0, s1, h3);
        k1 = f1(s2, s1, h3);
        if (k0 != k1 || strcmp(s0, s2) != 0) {
            printf("test failure:"
                   "s3=%s# s1=%s# s0=%s# s2=%s# k0=%u# k1=%u# "
                   "len(s0)=%u h0=%u h1=%u giri=%u\n",
                   s3, s1, s0, s2, (unsigned)k0, (unsigned)k1,
                   (unsigned)strlen(s0), (unsigned)h0, (unsigned)h1,
                   (unsigned)i);
            goto done;
        }
    } while (++i < cicli / 500);

    /* --- timing phase: one fixed, non-empty random source --- */
    do {
        h1 = (size_t)rand() % len_s1;
        rand_phrase(s1, h1);
    } while (h1 == 0);

    /* Never let the routines write past the end of s0. */
    limit = h1 + 1;
    if (limit > len_s0)
        limit = len_s0;

    printf("d_0=%f\n", test3_time_copy(f0, s0, s1, limit, cicli));
    printf("d_1=%f\n", test3_time_copy(f1, s0, s1, limit, cicli));
    printf("Test success\n");
    ok = 1;

done:
    free(s0);
    free(s1);
    free(s2);
    free(s3);
    return ok;
}

/* Times cicli calls of the in-place string routine f on s (at least one
 * call is always made).
 *
 * Before every call the first character of s is incremented, presumably
 * so that successive calls do not see identical input.  The comparison
 * against 1234567 only consumes the result of the call.
 * Returns the elapsed wall-clock time in seconds, measured with time().
 */
static double test1_time_call(size_t (*f)(char*), char *s, unsigned cicli)
{
    size_t i = 0;
    time_t t0, t1;

    t0 = time(0);
    do {
        ++*s;
        if (f(s) == 1234567)
            ++i;
    } while (++i < cicli);
    t1 = time(0);
    return difftime(t1, t0);
}

/* Cross-checks and then times two in-place string routines.
 *
 * cicli  - number of timed calls per routine; cicli/50 (at least one)
 *          randomised agreement checks are run first.
 * len_s0 - size in bytes of the work buffers (2..100000; a size of 1 can
 *          only hold the empty string, which the timing phase rejects).
 * f0, f1 - the routines under test.
 *
 * Agreement phase: both routines are applied to identical random strings;
 * return values and resulting strings must match, otherwise a
 * "test failure" line is printed.  Only values that have actually been
 * computed are printed.
 * Timing phase: prints "d_0=" and "d_1=" (seconds for f0 and f1) followed
 * by "Test success".
 *
 * Returns 1 on success, 0 on invalid arguments, allocation failure or
 * disagreement between the routines.
 */
int test1(unsigned cicli, unsigned len_s0,
          size_t (*f0)(char*), size_t (*f1)(char*))
{
    char *s0 = NULL, *s1 = NULL;
    size_t i, k0, k1, h0, h1;
    int ok = 0;

    if (cicli == 0 || len_s0 == 0 || f0 == 0 || f1 == 0)
        return 0;
    if (len_s0 > 100000)
        return 0;
    if (len_s0 < 2)     /* the timing phase needs a non-empty string */
        return 0;

    if ((s0 = malloc(len_s0)) == NULL || (s1 = malloc(len_s0)) == NULL)
        goto done;

    /* --- agreement phase --- */
    i = 0;
    do {
        h0 = (size_t)rand() % len_s0;
        rand_phrase(s0, h0);
        strcpy(s1, s0);
        k0 = f0(s0);
        k1 = f1(s1);
        if (k0 != k1 || strcmp(s0, s1) != 0) {
            printf("test failure:"
                   "s1=%s# s0=%s# k0=%u# k1=%u# len(s0)=%u h0=%u "
                   "giri=%u\n",
                   s1, s0, (unsigned)k0, (unsigned)k1,
                   (unsigned)strlen(s0), (unsigned)h0, (unsigned)i);
            goto done;
        }
    } while (++i < cicli / 50);

    /* --- timing phase: one non-empty random string --- */
    do {
        h1 = (size_t)rand() % len_s0;
        rand_phrase(s1, h1);
    } while (h1 == 0);

    printf("d_0=%f\n", test1_time_call(f0, s1, cicli));
    printf("d_1=%f\n", test1_time_call(f1, s1, cicli));
    printf("Test success\n");
    ok = 1;

done:
    free(s0);
    free(s1);
    return ok;
}

Thomas Matthews · Feb 8, 2005

RoSsIaCrIiLoIA wrote:
[snip]

Someone says that only a good assembly programmer can surpass a C
compiler.

Good C programmers can direct the compiler to produce
excellent assembly code.

I'm a novice in the programming world so
Why my C version of strlcpy is slower than assembly version?
Why the C version of strlcpy of a senior C programmer is slower than
the assembly version of a novice as I am? (It does't seem suitable)

Before diving into assembly language, make sure that your
C language function is optimized for speed. Changes to
a high level language are often faster and easier than
writing in assembly language.

If you _really_ need the speed, take the assembly language
generated by the compiler and optimize it or use it as
a foundation. This is how good assembly programmers work
their majick.

If you need the speed, get an assembly language book for the
processor of your system and take advantage of any specialized
instructions. For example, some Intel processors have special
instructions for processing strings. The ARM processor has
specialized instructions for loading registers from memory.

Also, you will need to break your coding style habits and
adopt a new one: elaborate commenting. Many assembly language
functions are trashed because the original author did not
adequately comment the code. One should document the reasons
behind the register usage, for example.

--
Thomas Matthews

C++ newsgroup welcome message:
http://www.slack.net/~shiva/welcome.txt
C++ Faq: http://www.parashift.com/c++-faq-lite
C Faq: http://www.eskimo.com/~scs/c-faq/top.html
alt.comp.lang.learn.c-c++ faq:
http://www.comeaucomputing.com/learn/faq/
Other sites:
http://www.josuttis.com -- C++ STL Library book
http://www.sgi.com/tech/stl -- Standard Template Library

Phil Carmody · Feb 8, 2005

Thomas Matthews said:
If you _really_ need the speed, take the assembly language
generated by the compiler and optimize it or use it as
a foundation. This is how good assembly programmers work
their majick.

Be warned, though - thinking like a compiler will limit your
possibilities. If you want real speed, then do _not_ base
what you do on what a compiler does.

Phil

RoSsIaCrIiLoIA · Feb 9, 2005

; Quoted copy of the routine from the first post.
; Bounded copy of src into dst: stores at most sz bytes including the
; terminating 0 and returns z - sz in eax.
; Register use: ebx = sz (bytes that may still be stored), esi = src cursor,
; edi = dst cursor, al = byte being copied.  The stack slot of the sz
; argument is reused as the variable z.
; ebx, esi and edi are saved on entry and restored before ret.
_asm_strlcpy_m:
push ebx
push esi
push edi
; three saved registers (12 bytes) + return address (4 bytes)
; => the first argument lives at esp+16
%define @dst [esp+16]
%define @src [esp+20]
%define @sz [esp+24]
mov ebx, @sz ; ebx = sz; the @sz slot itself now plays the role of z
mov edi, @dst
mov esi, @src
cmp ebx, 0
je .l5 ; sz == 0: nothing may be stored
cmp esi, 0
je .l3 ; src == NULL
dec dword @sz ; z = sz - 1
.l0:
dec ebx ; --sz
jz .l1 ; no room left: terminate dst, then measure the rest of src
mov al, [esi]
inc esi
mov [edi], al
inc edi
cmp al, 0
jne .l0 ; loop until the terminating 0 has been stored
.le:
mov eax, @sz ; return z - sz
sub eax, ebx
pop edi
pop esi
pop ebx
ret
.l1:
mov byte[edi], 0 ; truncated copy: force termination of dst
.l2:
push esi ; strlen(remaining part of src)
call _strlen
add esp, 4 ; drop the argument, so the @-offsets are valid again
add @sz, eax ; z += strlen(src)
jmp short .le
.l3:
cmp edi, 0
je .le ; dst == NULL too: nothing to do
mov byte[edi], 0 ; src == NULL: make dst the empty string
jmp short .le
.l5:
cmp esi, 0
jne .l2 ; sz == 0 but src given: still add strlen(src) to z
jmp short .le
%undef @dst
%undef @src
%undef @sz
; end file g.asm

Only strlcpy seems 25% slower than _asm_strlcpy_m.
How can that 25% be increased?

; Revised version of _asm_strlcpy_m with a reordered copy loop.
; Bounded copy of src into dst: stores at most sz bytes including the
; terminating 0 and returns z - sz in eax.
;
; Register use:
;   ebx = sz  (bytes that may still be stored)
;   esi = src cursor
;   edi = dst cursor
;   eax = byte being copied (only al is ever loaded)
; The stack slot of the sz argument is reused as the variable z.
; ebx, esi and edi are saved on entry and restored before ret.
;
; All labels below are NASM local labels (single leading dot), matching the
; jump targets used by the code.
_asm_strlcpy_m:
push ebx
push esi
push edi
; three saved registers (12 bytes) + return address (4 bytes)
; => the first argument lives at esp+16
%define @dst [esp+16]
%define @src [esp+20]
%define @sz [esp+24]
mov ebx, @sz ; ebx = sz; the @sz slot itself now plays the role of z
mov edi, @dst
mov esi, @src
xor eax, eax ; clear bits 8..31 so "cmp eax, 0" below tests just the byte in al
cmp ebx, 0
je .l5 ; sz == 0: nothing may be stored
cmp esi, 0
je .l3 ; src == NULL
dec dword @sz ; z = sz - 1
.l0:
mov al, [esi]
inc esi
dec ebx ; --sz; sets ZF for the jz below (the mov in between leaves flags alone)
mov [edi], al ; stored before the room check; overwritten at .l1 if it did not fit
jz .l1 ; no room left
inc edi
cmp eax, 0
jne .l0 ; loop until the terminating 0 has been stored
.le:
mov eax, @sz ; return z - sz
sub eax, ebx
pop edi
pop esi
pop ebx
ret
.l1:
mov byte[edi], 0 ; truncated copy: replace the last stored byte by the terminator
dec esi ; step back to the byte that did not fit so strlen counts it
.l2:
push esi ; strlen(remaining part of src)
call _strlen
add esp, 4 ; drop the argument, so the @-offsets are valid again
add @sz, eax ; z += strlen(src)
jmp short .le
.l3:
cmp edi, 0
je .le ; dst == NULL too: nothing to do
mov byte[edi], 0 ; src == NULL: make dst the empty string
jmp short .le
.l5:
cmp esi, 0
jne .l2 ; sz == 0 but src given: still add strlen(src) to z
jmp short .le
%undef @dst
%undef @src
%undef @sz

Thomas Matthews · Feb 9, 2005

Phil said:
Be warned, though - thinking like a compiler will limit your
possibilities. If you want real speed, then do _not_ base
what you do on what a compiler does.

Phil

I understand that speed optimizations should come
from requirements (removing some), design (change
the design to be more efficient), high level code,
then assembly language.

Some small functions take up more time with the
calling protocol and should be instead inlined.
This is a case where converting to assembly language
won't help much.

There are other techniques, such as loop unrolling
and branch minimization, which can be applied to the
high level language.

When the only option left is to optimize the function,
then starting with the compiler's version as a stencil
or a foundation is a better step than writing from
scratch.

--
Thomas Matthews

C++ newsgroup welcome message:
http://www.slack.net/~shiva/welcome.txt
C++ Faq: http://www.parashift.com/c++-faq-lite
C Faq: http://www.eskimo.com/~scs/c-faq/top.html
alt.comp.lang.learn.c-c++ faq:
http://www.comeaucomputing.com/learn/faq/
Other sites:
http://www.josuttis.com -- C++ STL Library book
http://www.sgi.com/tech/stl -- Standard Template Library

\\\\o//annabee · Feb 9, 2005

På Wed, 09 Feb 2005 20:21:47 GMT, skrev Thomas Matthews

I understand that speed optimizations should come
from requirements (removing some), design (change
the design to be more efficient), high level code,
then assembly language.

Some small functions take up more time with the
calling protocol and should be instead inlined.
This is a case where converting to assembly language
won't help much.

Using only asm will always be a win-win situation:
1. After some time, what was initially hard gets much easier.
2. After some time, many optimization tricks come naturally as part of
your style.
3. After some time, you will see the optimization points more easily.

I swear to you. When I first started asm programming, a year ago, I wrote
several routines, like string searching, string cat/copy functions and the
like, so as to have something to help me along. (This was HLL thinking.)
These days, I rarely call those functions, because it's simply faster to
write them out by hand than to find the old code and refresh how to use
it.

An easy way to duplicate a string: this is not optimized code, but in
many situations it is simply more than adequate. Most string copies are
small strings:

while B$eax <> 0
push w$eax | pop w$ebx
add ebx 2 | cmp B$eax + 1 0 | je L0>
add eax 2
End_While

I write this on one line usually, but there is no space for that here.

This took 5 seconds to write. The logic is so simple and straight forward
that noone will miss it. It will take me longer to recap how to use my
PcharCopyString function. And there, I will also have to recap what
registers gets lost, and maybe even preserve them ...wheras this one, I
can change the two registers used at will, and seemlessly integrate it
into the current code.

After spending time with asm, many situation arise when you can just copy
and paste allready written code and rewrite it, as simple as this, or just
write it out blindly like I just did. In many cases, once you wrote the
logic for one aspect of a solution, the other "octants", come at no cost,
by copy and paste or a macro replacement.

My first IntegertoString function, was two - 3 pages of insanities.
Wolfgang, who has some experience (30) years, wrote his SLOW version in
something like 4-5 lines, and his _SLOW_ version is 16 times faster then
the Delphi intToStr function. Mine, the nuub, version, was allready 4
times as fast as the Delphi version. But this is not the bigger point at
all. The bigger point is _coding_ efficiency. *Programmer time*.

I have little to report yet, that would be any news to asm'ers, but I do
have this to report: Asm is simpler to work with than HLLs. Much clearer,
and its even more effective in terms of developmenttime. I guess most of
the people reading this NG, think I just blindy translated my Delphi stuff
into asm, line by line, but no, I rewrote all of it from scratch. Short of
two-three routines! And will be happy to prove it. And in all fairness, I
spent mostly less than 5 months of the whole year with this. Since RosAsm,
each day has just been a marvellous vacation, with a few dense 3 weeks
sessions now and then.

I have developed a much better eye, and stamina for reading asm, and it
comes much easier to me now. I have literally no problems rewriting 2
pages long routines because of it. And I used to think asm was simply out
of my reach !!

Do not buy into the crap coming from HLL people. Asm is not only simpler
to learn, but is a more effective tool to use as well. In DEVELOPMENT TIME!
In terms of _money_ spent. To feed / pay the programmer. If I had started
RosAsm programming 5 years ago, I would have been five years ahead by now.
It makes me want to scream and shout! How stupid I have been for ignoring
the marvellous experience of assembly!

There are other techniques, such as loop unrolling
and branch minimization, which can be applied to the
high level language.

I have never needed any of those, though they are surely useful somewhere.
But the point is that most asm code doesn't need to be superbly written;
even newbie code is better than HLL code. And while you learn, it gets better
and better: you get better, and so the code gets better as well.

asm is win-win. you can write asm fast and easy, with terrible waste,
compared to Wolfgang, and still the code will run like nobodies business,
in 80% of the cases anyway. The 80% cases count more, wouldnt you say ?
The rest will be more timeconsuming, but this will be true for HLL as
well. As the logic of the program itself is simply the same for HLLers
when they need to gain the serious speed.

When the only option left is to optimize the function,
then starting with the compiler's version as a stencil
or a foundation is a better step than writing from
scratch.

Not really true. Because the exersize of applying asm to everyday work,
will teach you to better see where and when to optimize the strategic side
of things as well. Using inline asm, now and then, will never teach you
the important part of the asm experience, the confidence, that long time
experience provide.

I am afraid this argument will never end. Because you just need to make
the jump to see it. There no argument that would have convinced me 10
years ago..... I was absolutely saying about the same things you are
saying now. .....

:-(( And I am very ashamed of it.

NoDot · Feb 9, 2005

\\o//annabee said:
[snip]

Regulars of a.l.a are aware of the fact that this blathering mess is
coming from the verbose supporter of the biggest jerk known to man.

I'm posting this only to comp.lang.c to tell you: look at this text
objectively.

Do you like my strlcpy strlcat etc?	13	Feb 4, 2005
In C, the longest palindromic subsequence multithread exists	0	Nov 23, 2022
Binary Search in C	7	Dec 27, 2010
trim whitespace v3	170	Aug 23, 2010
linux <--> windows strcpy etc performance	5	Aug 29, 2010
Collect Excel Data from Website	5	Apr 30, 2022
wcstombs() problem	16	Feb 23, 2012
comment on my solution to exercise 3.3 in K&R's The C programming language	12	Apr 29, 2007

[Trolling] assembly vs C language

RoSsIaCrIiLoIA

Thomas Matthews

Phil Carmody

RoSsIaCrIiLoIA

Thomas Matthews

\\\\o//annabee

NoDot

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads