Thanks for the advice guys,
Combuster: I tried your option, and tbh I can't work out which is faster. The BSR/BSF instructions do seem to be the right instructions for the job, but inserting them inline into the C code really didn't cause a huge drop in function size...
Also I have noticed that those instructions are heavily microcoded and have a latency of 8 cycles - pretty high. I've attached disassembly of both before the BSR instructions and after. I really have no idea which one is faster, I willl have to do some major testing, I think!
I don't know exactly how much can be deduced from this output, most of it seems standard function skeleton code...
---------------------------------------
BEFORE
---------------------------------------
Code: Select all
08048fb0 <_Z14rankComparisonhhi>:
8048fb0: 55 push %ebp
8048fb1: 89 e5 mov %esp,%ebp
8048fb3: 56 push %esi
8048fb4: 53 push %ebx
8048fb5: 83 ec 04 sub $0x4,%esp
8048fb8: 0f b6 45 08 movzbl 0x8(%ebp),%eax
8048fbc: 8b 75 10 mov 0x10(%ebp),%esi
8048fbf: 88 45 f7 mov %al,0xfffffff7(%ebp)
8048fc2: 0f b6 45 0c movzbl 0xc(%ebp),%eax
8048fc6: 8d 4e 01 lea 0x1(%esi),%ecx
8048fc9: 83 f9 07 cmp $0x7,%ecx
8048fcc: 88 45 f6 mov %al,0xfffffff6(%ebp)
8048fcf: 7f 7e jg 804904f <_Z14rankComparisonhhi+0x9f>
8048fd1: 0f b6 5d f7 movzbl 0xfffffff7(%ebp),%ebx
8048fd5: 31 d2 xor %edx,%edx
8048fd7: 89 d8 mov %ebx,%eax
8048fd9: d3 e0 shl %cl,%eax
8048fdb: 84 c0 test %al,%al
8048fdd: 79 0a jns 8048fe9 <_Z14rankComparisonhhi+0x39>
8048fdf: eb 57 jmp 8049038 <_Z14rankComparisonhhi+0x88>
8048fe1: 89 d8 mov %ebx,%eax
8048fe3: d3 e0 shl %cl,%eax
8048fe5: 84 c0 test %al,%al
8048fe7: 78 51 js 804903a <_Z14rankComparisonhhi+0x8a>
8048fe9: b8 80 00 00 00 mov $0x80,%eax
8048fee: d3 f8 sar %cl,%eax
8048ff0: 41 inc %ecx
8048ff1: 08 c2 or %al,%dl
8048ff3: 83 f9 08 cmp $0x8,%ecx
8048ff6: 75 e9 jne 8048fe1 <_Z14rankComparisonhhi+0x31>
8048ff8: 89 f1 mov %esi,%ecx
8048ffa: 49 dec %ecx
8048ffb: 78 33 js 8049030 <_Z14rankComparisonhhi+0x80>
8048ffd: 0f b6 5d f7 movzbl 0xfffffff7(%ebp),%ebx
8049001: eb 0f jmp 8049012 <_Z14rankComparisonhhi+0x62>
8049003: b8 80 00 00 00 mov $0x80,%eax
8049008: d3 f8 sar %cl,%eax
804900a: 49 dec %ecx
804900b: 08 c2 or %al,%dl
804900d: 83 f9 ff cmp $0xffffffff,%ecx
8049010: 74 1e je 8049030 <_Z14rankComparisonhhi+0x80>
8049012: 89 d8 mov %ebx,%eax
8049014: d3 e0 shl %cl,%eax
8049016: 84 c0 test %al,%al
8049018: 79 e9 jns 8049003 <_Z14rankComparisonhhi+0x53>
804901a: 0f b6 45 f6 movzbl 0xfffffff6(%ebp),%eax
804901e: d3 e0 shl %cl,%eax
8049020: 84 c0 test %al,%al
8049022: 79 0c jns 8049030 <_Z14rankComparisonhhi+0x80>
8049024: b8 80 00 00 00 mov $0x80,%eax
8049029: d3 f8 sar %cl,%eax
804902b: 08 c2 or %al,%dl
804902d: 8d 76 00 lea 0x0(%esi),%esi
8049030: 0f b6 c2 movzbl %dl,%eax
8049033: 5a pop %edx
8049034: 5b pop %ebx
8049035: 5e pop %esi
8049036: 5d pop %ebp
8049037: c3 ret
8049038: 31 d2 xor %edx,%edx
804903a: 0f b6 45 f6 movzbl 0xfffffff6(%ebp),%eax
804903e: d3 e0 shl %cl,%eax
8049040: 84 c0 test %al,%al
8049042: 79 b4 jns 8048ff8 <_Z14rankComparisonhhi+0x48>
8049044: b8 80 00 00 00 mov $0x80,%eax
8049049: d3 f8 sar %cl,%eax
804904b: 08 c2 or %al,%dl
804904d: eb a9 jmp 8048ff8 <_Z14rankComparisonhhi+0x48>
804904f: 31 d2 xor %edx,%edx
8049051: eb a5 jmp 8048ff8 <_Z14rankComparisonhhi+0x48>
8049053: 90 nop
8049054: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
804905a: 8d bf 00 00 00 00 lea 0x0(%edi),%edi
----------------------------------------
AFTER
----------------------------------------
Code: Select all
080493a0 <_Z14rankComparisonhhi>:
80493a0: 55 push %ebp
80493a1: 89 e5 mov %esp,%ebp
80493a3: 83 ec 10 sub $0x10,%esp
80493a6: 89 7d fc mov %edi,0xfffffffc(%ebp)
80493a9: 8b 7d 10 mov 0x10(%ebp),%edi
80493ac: 0f b6 45 0c movzbl 0xc(%ebp),%eax
80493b0: 89 5d f4 mov %ebx,0xfffffff4(%ebp)
80493b3: 0f b6 5d 08 movzbl 0x8(%ebp),%ebx
80493b7: 89 75 f8 mov %esi,0xfffffff8(%ebp)
80493ba: be ff 00 00 00 mov $0xff,%esi
80493bf: 8d 4f 01 lea 0x1(%edi),%ecx
80493c2: 89 f2 mov %esi,%edx
80493c4: 88 45 f2 mov %al,0xfffffff2(%ebp)
80493c7: d3 fa sar %cl,%edx
80493c9: 88 d8 mov %bl,%al
80493cb: 20 d0 and %dl,%al
80493cd: 75 44 jne 8049413 <_Z14rankComparisonhhi+0x73>
80493cf: 88 55 f3 mov %dl,0xfffffff3(%ebp)
80493d2: b9 08 00 00 00 mov $0x8,%ecx
80493d7: be ff 00 00 00 mov $0xff,%esi
80493dc: 29 f9 sub %edi,%ecx
80493de: 89 f7 mov %esi,%edi
80493e0: d3 e7 shl %cl,%edi
80493e2: 88 da mov %bl,%dl
80493e4: 89 f8 mov %edi,%eax
80493e6: 20 c2 and %al,%dl
80493e8: 74 16 je 8049400 <_Z14rankComparisonhhi+0x60>
80493ea: 0f b6 c2 movzbl %dl,%eax
80493ed: 0f bc c8 bsf %eax,%ecx
80493f0: 0f b6 45 f2 movzbl 0xfffffff2(%ebp),%eax
80493f4: d3 f8 sar %cl,%eax
80493f6: a8 01 test $0x1,%al
80493f8: 74 46 je 8049440 <_Z14rankComparisonhhi+0xa0>
80493fa: d3 fe sar %cl,%esi
80493fc: 89 f8 mov %edi,%eax
80493fe: 21 f0 and %esi,%eax
8049400: 0a 45 f3 or 0xfffffff3(%ebp),%al
8049403: 8b 5d f4 mov 0xfffffff4(%ebp),%ebx
8049406: 8b 75 f8 mov 0xfffffff8(%ebp),%esi
8049409: 8b 7d fc mov 0xfffffffc(%ebp),%edi
804940c: 89 ec mov %ebp,%esp
804940e: 0f b6 c0 movzbl %al,%eax
8049411: 5d pop %ebp
8049412: c3 ret
8049413: 0f b6 c0 movzbl %al,%eax
8049416: 0f bd c8 bsr %eax,%ecx
8049419: 0f b6 45 f2 movzbl 0xfffffff2(%ebp),%eax
804941d: d3 f8 sar %cl,%eax
804941f: a8 01 test $0x1,%al
8049421: 74 0d je 8049430 <_Z14rankComparisonhhi+0x90>
8049423: d3 e6 shl %cl,%esi
8049425: 89 f0 mov %esi,%eax
8049427: 20 d0 and %dl,%al
8049429: 88 45 f3 mov %al,0xfffffff3(%ebp)
804942c: eb a4 jmp 80493d2 <_Z14rankComparisonhhi+0x32>
804942e: 89 f6 mov %esi,%esi
8049430: 41 inc %ecx
8049431: d3 e6 shl %cl,%esi
8049433: 89 f0 mov %esi,%eax
8049435: 20 d0 and %dl,%al
8049437: 88 45 f3 mov %al,0xfffffff3(%ebp)
804943a: eb 96 jmp 80493d2 <_Z14rankComparisonhhi+0x32>
804943c: 8d 74 26 00 lea 0x0(%esi),%esi
8049440: 49 dec %ecx
8049441: eb b7 jmp 80493fa <_Z14rankComparisonhhi+0x5a>
8049443: 90 nop
8049444: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
804944a: 8d bf 00 00 00 00 lea 0x0(%edi),%edi