I'm familiar with, but by no means an expert in, x86 protected mode assembly - I have a routine, written in C++, that needs to be fast. It's fairly simple and monolithic (no jumps), and I ran it through G++ 4.3.2 with -O3 (pretty much full optimisation). I have the original source code and the generated assembly. I'm not asking anyone to spend their precious time actually writing a better version of this - all I'm asking is: with hand-crafted assembly, do you think this target code could be improved?
I ask because there seem to be an *awful* lot of redundant moves in there - granted it needs a fair few, since the variable shift counts all have to go through %cl and one-operand imul is tied to %edx:%eax, but still...
Anyway, without further ado, the source (I should also mention that this has been simplified to get more out of G++'s optimiser - there's a clamp missing, but adding it would introduce jumps):
Code:
uint32_t FreetypeXterm::interpolateColour(uint32_t f, uint32_t b, uint8_t a)
{
    Display::PixelFormat pf = m_Mode.pf;

    // Extract the red field.
    uint8_t range = 1 << pf.mRed;
    uint8_t fr = (f >> pf.pRed) & (range-1);
    uint8_t br = (b >> pf.pRed) & (range-1);
    uint8_t r = (fr * a + br * (256-a)) / 256;

    // Green
    range = 1 << pf.mGreen;
    uint8_t fg = (f >> pf.pGreen) & (range-1);
    uint8_t bg = (b >> pf.pGreen) & (range-1);
    uint8_t g = (fg * a + bg * (256-a)) / 256;

    // Blue
    range = 1 << pf.mBlue;
    uint8_t fb = (f >> pf.pBlue) & (range-1);
    uint8_t bb = (b >> pf.pBlue) & (range-1);
    uint8_t bl = (fb * a + bb * (256-a)) / 256;

    return 0 |
           (static_cast<uint32_t>(r) << pf.pRed) |
           (static_cast<uint32_t>(g) << pf.pGreen) |
           (static_cast<uint32_t>(bl) << pf.pBlue);
}
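As an aside: if I were willing to fix the pixel format at 8:8:8 (pRed=16, pGreen=8, pBlue=0, 8-bit fields - which the routine above deliberately doesn't assume, since it has to follow whatever pf describes), the usual trick of blending red and blue together in a single 32-bit multiply would cut the multiplies right down. This is only a rough sketch for comparison - blend8888 is a made-up name, not anything in the codebase - and it keeps the same a/256 weighting as the code above:

Code:
#include <stdint.h>

// Sketch: blend two 0x00RRGGBB pixels using two multiplies per *pair* of
// channels instead of two per channel. Assumes pRed=16, pGreen=8, pBlue=0
// with 8-bit fields, which the generic routine above does NOT assume.
static uint32_t blend8888(uint32_t f, uint32_t b, uint32_t a) // a in 0..255
{
    uint32_t na = 256 - a;  // background weight
    // Red and blue never carry into each other: each weighted sum is at most
    // 0xFF * 256 = 0xFF00, well inside the 16 bits of spacing in the mask.
    uint32_t rb = (((f & 0x00FF00FFu) * a + (b & 0x00FF00FFu) * na) >> 8) & 0x00FF00FFu;
    uint32_t g  = (((f & 0x0000FF00u) * a + (b & 0x0000FF00u) * na) >> 8) & 0x0000FF00u;
    return rb | g;
}

Anyway, here's the assembly G++ actually produced for the generic version above: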
Code:
00000000 <_ZN13FreetypeXterm17interpolateColourEmmh>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 57 push %edi
4: 56 push %esi
5: 53 push %ebx
6: 83 ec 24 sub $0x24,%esp
9: 8d 7d e4 lea -0x1c(%ebp),%edi
c: 8b 75 08 mov 0x8(%ebp),%esi
f: 83 c6 18 add $0x18,%esi
12: b9 04 00 00 00 mov $0x4,%ecx
17: f3 a5 rep movsl %ds:(%esi),%es:(%edi)
19: 31 c0 xor %eax,%eax
1b: 8a 45 e5 mov -0x1b(%ebp),%al
1e: 89 45 d4 mov %eax,-0x2c(%ebp)
21: 31 c9 xor %ecx,%ecx
23: 8a 4d e4 mov -0x1c(%ebp),%cl
26: bb 01 00 00 00 mov $0x1,%ebx
2b: 89 d8 mov %ebx,%eax
2d: d3 e0 shl %cl,%eax
2f: 8d 48 ff lea -0x1(%eax),%ecx
32: 88 4d db mov %cl,-0x25(%ebp)
35: 31 c0 xor %eax,%eax
37: 8a 45 14 mov 0x14(%ebp),%al
3a: 89 45 dc mov %eax,-0x24(%ebp)
3d: bf 00 01 00 00 mov $0x100,%edi
42: 29 c7 sub %eax,%edi
44: 8a 55 e7 mov -0x19(%ebp),%dl
47: 81 e2 ff 00 00 00 and $0xff,%edx
4d: 89 d6 mov %edx,%esi
4f: 31 c9 xor %ecx,%ecx
51: 8a 4d e6 mov -0x1a(%ebp),%cl
54: 89 d8 mov %ebx,%eax
56: d3 e0 shl %cl,%eax
58: 8d 40 ff lea -0x1(%eax),%eax
5b: 88 45 d3 mov %al,-0x2d(%ebp)
5e: 31 d2 xor %edx,%edx
60: 8a 55 e9 mov -0x17(%ebp),%dl
63: 89 55 e0 mov %edx,-0x20(%ebp)
66: 31 c0 xor %eax,%eax
68: 8a 45 e8 mov -0x18(%ebp),%al
6b: 88 c1 mov %al,%cl
6d: d3 e3 shl %cl,%ebx
6f: 4b dec %ebx
70: 8b 45 10 mov 0x10(%ebp),%eax
73: 89 f1 mov %esi,%ecx
75: d3 e8 shr %cl,%eax
77: 22 45 d3 and -0x2d(%ebp),%al
7a: 25 ff 00 00 00 and $0xff,%eax
7f: 0f af c7 imul %edi,%eax
82: 8b 55 0c mov 0xc(%ebp),%edx
85: d3 ea shr %cl,%edx
87: 20 55 d3 and %dl,-0x2d(%ebp)
8a: 31 c9 xor %ecx,%ecx
8c: 8a 4d d3 mov -0x2d(%ebp),%cl
8f: 0f af 4d dc imul -0x24(%ebp),%ecx
93: 01 c8 add %ecx,%eax
95: 0f b6 c4 movzbl %ah,%eax
98: 89 f1 mov %esi,%ecx
9a: d3 e0 shl %cl,%eax
9c: 8b 55 10 mov 0x10(%ebp),%edx
9f: 8a 4d d4 mov -0x2c(%ebp),%cl
a2: d3 ea shr %cl,%edx
a4: 22 55 db and -0x25(%ebp),%dl
a7: 81 e2 ff 00 00 00 and $0xff,%edx
ad: 0f af d7 imul %edi,%edx
b0: 8b 75 0c mov 0xc(%ebp),%esi
b3: d3 ee shr %cl,%esi
b5: 89 f1 mov %esi,%ecx
b7: 20 4d db and %cl,-0x25(%ebp)
ba: 31 c9 xor %ecx,%ecx
bc: 8a 4d db mov -0x25(%ebp),%cl
bf: 0f af 4d dc imul -0x24(%ebp),%ecx
c3: 01 ca add %ecx,%edx
c5: 0f b6 d6 movzbl %dh,%edx
c8: 8a 4d d4 mov -0x2c(%ebp),%cl
cb: d3 e2 shl %cl,%edx
cd: 09 d0 or %edx,%eax
cf: 8a 4d e0 mov -0x20(%ebp),%cl
d2: d3 6d 10 shrl %cl,0x10(%ebp)
d5: 8a 55 10 mov 0x10(%ebp),%dl
d8: 21 da and %ebx,%edx
da: 81 e2 ff 00 00 00 and $0xff,%edx
e0: 0f af d7 imul %edi,%edx
e3: d3 6d 0c shrl %cl,0xc(%ebp)
e6: 23 5d 0c and 0xc(%ebp),%ebx
e9: 81 e3 ff 00 00 00 and $0xff,%ebx
ef: 0f af 5d dc imul -0x24(%ebp),%ebx
f3: 01 da add %ebx,%edx
f5: 0f b6 d6 movzbl %dh,%edx
f8: d3 e2 shl %cl,%edx
fa: 09 d0 or %edx,%eax
fc: 83 c4 24 add $0x24,%esp
ff: 5b pop %ebx
100: 5e pop %esi
101: 5f pop %edi
102: 5d pop %ebp
103: c3 ret
104: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
10a: 8d bf 00 00 00 00 lea 0x0(%edi),%edi
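Looking at the dump, a fair chunk of it is the rep movsl copying m_Mode.pf into the stack frame and the masks/shift counts being rebuilt and spilled to locals on every call. One thing I've been toying with (sketch only - CachedFormat and its field names are made up for illustration, they're not the real Display::PixelFormat) is precomputing the per-channel masks once when the mode is set, so each call only has to load a shift and a mask per channel:

Code:
#include <stdint.h>

// Hypothetical cached format, built once when the video mode changes rather
// than recomputed on every call. maskR would be (1u << mRed) - 1, and so on.
struct CachedFormat
{
    uint32_t maskR, maskG, maskB;   // per-channel masks, already expanded
    uint8_t  pRed, pGreen, pBlue;   // per-channel bit positions
};

static uint32_t interpolateCached(const CachedFormat &c,
                                  uint32_t f, uint32_t b, uint8_t a)
{
    uint32_t na = 256 - a;  // background weight, hoisted once
    uint32_t r  = (((f >> c.pRed)   & c.maskR) * a + ((b >> c.pRed)   & c.maskR) * na) >> 8;
    uint32_t g  = (((f >> c.pGreen) & c.maskG) * a + ((b >> c.pGreen) & c.maskG) * na) >> 8;
    uint32_t bl = (((f >> c.pBlue)  & c.maskB) * a + ((b >> c.pBlue)  & c.maskB) * na) >> 8;
    return (r << c.pRed) | (g << c.pGreen) | (bl << c.pBlue);
}

Whether that, or hand-written assembly on top of it, would actually buy anything measurable is exactly what I'm unsure of - hence the question.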
James