The kind1/kind2 variables aren't even going into registers and at
least one test+branch and a jump are executed for every character. Two
tests for 2 and 4 byte kinds. len1 and len2 don't get to go into
registers either.
Here's the full assembler output for unicode_compare:
; COMDAT _unicode_compare
_TEXT SEGMENT
_kind2$ = -20 ; size = 4
_kind1$ = -16 ; size = 4
_len2$ = -12 ; size = 4
_len1$ = -8 ; size = 4
_data2$ = -4 ; size = 4
_unicode_compare PROC ; COMDAT
; _str1$ = ecx
; _str2$ = eax
; 10417: {
push ebp
mov ebp, esp
sub esp, 20 ; 00000014H
push ebx
push esi
mov esi, eax
; 10418: int kind1, kind2;
; 10419: void *data1, *data2;
; 10420: Py_ssize_t len1, len2, i;
; 10421:
; 10422: kind1 = PyUnicode_KIND(str1);
mov eax, DWORD PTR [ecx+16]
mov edx, eax
shr edx, 2
and edx, 7
push edi
mov DWORD PTR _kind1$[ebp], edx
; 10423: kind2 = PyUnicode_KIND(str2);
mov edx, DWORD PTR [esi+16]
mov edi, edx
shr edi, 2
and edi, 7
mov DWORD PTR _kind2$[ebp], edi
; 10424: data1 = PyUnicode_DATA(str1);
test al, 32 ; 00000020H
je SHORT $LN9@unicode_co@2
test al, 64 ; 00000040H
je SHORT $LN7@unicode_co@2
lea ebx, DWORD PTR [ecx+24]
jmp SHORT $LN10@unicode_co@2
$LN7@unicode_co@2:
lea ebx, DWORD PTR [ecx+36]
jmp SHORT $LN10@unicode_co@2
$LN9@unicode_co@2:
mov ebx, DWORD PTR [ecx+36]
$LN10@unicode_co@2:
; 10425: data2 = PyUnicode_DATA(str2);
test dl, 32 ; 00000020H
je SHORT $LN13@unicode_co@2
test dl, 64 ; 00000040H
je SHORT $LN11@unicode_co@2
lea edx, DWORD PTR [esi+24]
jmp SHORT $LN30@unicode_co@2
$LN11@unicode_co@2:
lea eax, DWORD PTR [esi+36]
mov DWORD PTR _data2$[ebp], eax
mov edx, eax
jmp SHORT $LN14@unicode_co@2
$LN13@unicode_co@2:
mov edx, DWORD PTR [esi+36]
$LN30@unicode_co@2:
mov DWORD PTR _data2$[ebp], edx
$LN14@unicode_co@2:
; 10426: len1 = PyUnicode_GET_LENGTH(str1);
mov edi, DWORD PTR [ecx+8]
; 10427: len2 = PyUnicode_GET_LENGTH(str2);
mov ecx, DWORD PTR [esi+8]
; 10428:
; 10429: for (i = 0; i < len1 && i < len2; ++i) {
xor eax, eax
mov DWORD PTR _len1$[ebp], edi
mov DWORD PTR _len2$[ebp], ecx
test edi, edi
jle SHORT $LN2@unicode_co@2
; 10426: len1 = PyUnicode_GET_LENGTH(str1);
mov esi, edx
mov edi, edx
; 10428:
; 10429: for (i = 0; i < len1 && i < len2; ++i) {
sub ebx, edx
jmp SHORT $LN4@unicode_co@2
$LL28@unicode_co@2:
mov edx, DWORD PTR _data2$[ebp]
$LN4@unicode_co@2:
cmp eax, ecx
jge SHORT $LN29@unicode_co@2
; 10430: Py_UCS4 c1, c2;
; 10431: c1 = PyUnicode_READ(kind1, data1, i);
mov ecx, DWORD PTR _kind1$[ebp]
cmp ecx, 1
jne SHORT $LN17@unicode_co@2
lea ecx, DWORD PTR [ebx+eax]
movzx edx, BYTE PTR [ecx+edx]
jmp SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
cmp ecx, 2
jne SHORT $LN15@unicode_co@2
movzx edx, WORD PTR [ebx+edi]
jmp SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
mov edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:
; 10432: c2 = PyUnicode_READ(kind2, data2, i);
mov ecx, DWORD PTR _kind2$[ebp]
cmp ecx, 1
jne SHORT $LN21@unicode_co@2
mov ecx, DWORD PTR _data2$[ebp]
movzx ecx, BYTE PTR [eax+ecx]
jmp SHORT $LN20@unicode_co@2
$LN21@unicode_co@2:
cmp ecx, 2
jne SHORT $LN19@unicode_co@2
movzx ecx, WORD PTR [edi]
jmp SHORT $LN20@unicode_co@2
$LN19@unicode_co@2:
mov ecx, DWORD PTR [esi]
$LN20@unicode_co@2:
; 10433:
; 10434: if (c1 != c2)
cmp edx, ecx
jne SHORT $LN31@unicode_co@2
mov ecx, DWORD PTR _len2$[ebp]
inc eax
add edi, 2
add esi, 4
cmp eax, DWORD PTR _len1$[ebp]
jl SHORT $LL28@unicode_co@2
$LN29@unicode_co@2:
mov edi, DWORD PTR _len1$[ebp]
$LN2@unicode_co@2:
; 10436: }
; 10437:
; 10438: return (len1 < len2) ? -1 : (len1 != len2);
cmp edi, ecx
jge SHORT $LN23@unicode_co@2
pop edi
pop esi
or eax, -1
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
$LN31@unicode_co@2:
; 10435: return (c1 < c2) ? -1 : 1;
sbb eax, eax
pop edi
and eax, -2 ; fffffffeH
pop esi
inc eax
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
$LN23@unicode_co@2:
; 10436: }
; 10437:
; 10438: return (len1 < len2) ? -1 : (len1 != len2);
xor eax, eax
cmp edi, ecx
pop edi
pop esi
setne al
pop ebx
; 10439: }
mov esp, ebp
pop ebp
ret 0
_unicode_compare ENDP
Neil