Hi,
Aplogies to all for this "slightly large" post....
pcmattman wrote:Brendan wrote:
draw things in the double-buffer (e.g. "setPixel()", "fillRectange()", "drawCharacter()", "doHorizontalLine()", "doVerticalLine()", etc), and the code that copies the double-buffer to video display memory is mostly irrelevant
I can tell you now, the slow functions are not SetPixel, but DrawBitmap (not a BMP, but an already loaded array of pixels), DrawRectangle et al... but I have no idea how to optimize those functions.
Is
this the right code?
If it is, I'm not surprised it takes a minute or so to draw a screen...
First, there's a few places where you do something like this:
Code: Select all
// do not draw if it's already there!
if( *((uchar_t*) addr) == (uchar_t) color )
return;
// draw it
*((uchar_t*) addr ) = (uchar_t) color;
This doesn't help performance. Reading from the video card is slower than writing to it, and the conditional branch will be mispredicted by the CPU fairly often (causing expensive pipeline flushes). Even when it doesn't cause a pipeline flush the CPU won't be able to do the write until the read completes. Your video code would be at least twice as fast if you write to the video card regardless of whether you need to or not in this case.
For example, "SetPixel()" should be:
Code: Select all
void VGA::SetPixel( uint_t x, uint_t y, uint_t color )
{
// get the address
unsigned int addr = m_dbuff + ( ( m_width * y ) + x );
// sets a pixel
switch( m_bpp )
{
case 8: /** 8-bit **/
*((uchar_t*) addr ) = (uchar_t) color;
break;
case 16: /** 16-bit **/
*((ushort_t*) addr ) = (ushort_t) color;
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
// WRONG -> *((uint_t*) addr ) = color & 0x00FFFFFF;
*((uchar_t*) addr ) = (uchar_t) color;
*((uchar_t*) (addr + 1) ) = (uchar_t) (color >> 8);
*((uchar_t*) (addr + 2) ) = (uchar_t) (color >> 16);
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
// TWICE? *((uint_t*) m_dbuff + ( ( m_width * y ) + x ) ) = color;
*((uint_t*) addr ) = color;
break;
}
}
Next, comment out the "SetPixel()" function - it should never be used by any of the other functions (all of the other functions should access video display memory directly). For an example, consider your "Rectangle()" function:
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
for( int _y = y; _y < (y+h); _y++ )
{
for( int _x = x; _x < (x+w); _x++ )
{
SetPixel( _x, _y, color );
}
}
}
Now insert the (modified) "SetPixel()" into it to see how silly it looks:
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
for( int _y = y; _y < (y+h); _y++ )
{
for( int _x = x; _x < (x+w); _x++ )
{
// get the address
unsigned int addr = m_dbuff + ( ( m_width * y ) + x );
// sets a pixel
switch( m_bpp )
{
case 8: /** 8-bit **/
*((uchar_t*) addr ) = (uchar_t) color;
break;
case 16: /** 16-bit **/
*((ushort_t*) addr ) = (ushort_t) color;
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
*((uchar_t*) addr ) = (uchar_t) color;
*((uchar_t*) (addr + 1) ) = (uchar_t) (color >> 8);
*((uchar_t*) (addr + 2) ) = (uchar_t) (color >> 16);
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
*((uint_t*) addr ) = color;
break;
}
}
}
}
Now optimize to remove the sillyness:
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
unsigned int addr;
switch( m_bpp )
{
case 8: /** 8-bit **/
for( int _y = y; _y < (y+h); _y++ )
{
addr = m_dbuff + ( ( m_width * y ) + x );
for( int _x = x; _x < (x+w); _x++ )
{
*((uchar_t*) addr ) = (uchar_t) color;
addr += 1;
}
}
break;
case 16: /** 16-bit **/
for( int _y = y; _y < (y+h); _y++ )
{
addr = m_dbuff + ( ( m_width * y ) + x );
for( int _x = x; _x < (x+w); _x++ )
{
*((ushort_t*) addr ) = (ushort_t) color;
addr += 2;
}
}
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
for( int _y = y; _y < (y+h); _y++ )
{
addr = m_dbuff + ( ( m_width * y ) + x );
for( int _x = x; _x < (x+w); _x++ )
{
*((uchar_t*) addr ) = (uchar_t) color;
*((uchar_t*) (addr + 1) ) = (uchar_t) (color >> 8);
*((uchar_t*) (addr + 2) ) = (uchar_t) (color >> 16);
addr += 3;
}
}
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
for( int _y = y; _y < (y+h); _y++ )
{
addr = m_dbuff + ( ( m_width * y ) + x );
for( int _x = x; _x < (x+w); _x++ )
{
*((uint_t*) addr ) = color;
addr += 4;
}
}
break;
}
}
Notice here that the switch/case thing is only done once on the outside of the loops (instead of for every single pixel on the inside of both loops), and that "addr" is only calculated once per horizontal line (not recalculated for every single pixel).
Now we can optimize it more. We can simplify the calculation for "addr" so there's no multiplication in the loop. For 24-bit colour it's splitting the colour into bytes on the inside of the loop, which could be done on the outside of the loop.
The code becomes something like:
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
void *addr, _addr;
uchar_t red, green, blue;
addr = m_dbuff + ( m_width * y );
switch( m_bpp )
{
case 8: /** 8-bit **/
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
for( int _x = x; _x < (x+w); _x++ )
{
*((uchar_t*) _addr ) = (uchar_t) color;
_addr += 1;
}
addr += m_width;
}
break;
case 16: /** 16-bit **/
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
for( int _x = x; _x < (x+w); _x++ )
{
*((ushort_t*) _addr ) = (ushort_t) color;
_addr += 2;
}
addr += m_width;
}
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
blue = (uchar_t) color;
green = (uchar_t) (color >> 8);
red = (uchar_t) (color >> 16);
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
for( int _x = x; _x < (x+w); _x++ )
{
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
_addr += 3;
}
addr += m_width;
}
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
for( int _x = x; _x < (x+w); _x++ )
{
*((uint_t*) _addr ) = color;
_addr += 4;
}
addr += m_width;
}
break;
}
}
Next, we're using "_x" to control the inner loops and "_x" isn't being used, and we're also using "_addr". It makes more sense to use "_addr" to control the loop, and get rid of "_x" completely.
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
void *addr, _addr;
uchar_t red, green, blue;
addr = m_dbuff + ( m_width * y );
switch( m_bpp )
{
case 8: /** 8-bit **/
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + w; _addr += 1 )
{
*((uchar_t*) _addr ) = (uchar_t) color;
}
addr += m_width;
}
break;
case 16: /** 16-bit **/
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 2); _addr += 2 )
{
*((ushort_t*) _addr ) = (ushort_t) color;
}
addr += m_width;
}
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
blue = (uchar_t) color;
green = (uchar_t) (color >> 8);
red = (uchar_t) (color >> 16);
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 3); _addr += 3 )
{
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
}
addr += m_width;
}
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 4); _addr += 4 )
{
*((uint_t*) _addr ) = color;
}
addr += m_width;
}
break;
}
}
Now, for 8-bit it's doing 4 writes to display memory when it could do 4 pixels at a time as 32-bit writes. This is a bit tricky because you'd want to make sure 32-bit writes are aligned. Similarly, 16-bit could do 2 pixels at a time. 24-bit is a little trickier because a pixel is 3 bytes, so we need to do 12 bytes at a time (4 pixels as three 32-bit writes) to keep the 32-bit writes aligned.
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
void *addr, _addr;
uchar_t red, green, blue;
uint_t temp1, temp2, temp3;
addr = m_dbuff + ( m_width * y );
switch( m_bpp )
{
case 8: /** 8-bit **/
temp1 = color & 0xFF;
temp1 |= (temp1 << 8) | (temp1 << 16) | (temp1 << 24);
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
while( ( _addr < addr_x + w ) && ( (_addr & 3) != 0) ) {
*((uchar_t*) _addr ) = (uchar_t) color;
_addr++;
}
while( _addr + 3 < addr_x + w ) {
*((uint_t*) _addr ) = temp1;
_addr += 4;
}
while( _addr < addr_x + w ) {
*((uchar_t*) _addr ) = (uchar_t) color;
_addr++;
}
addr += m_width;
}
break;
case 16: /** 16-bit **/
temp1 = color & 0xFFFF;
temp1 |= temp1 << 16;
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
while( ( _addr < addr + (w * 2) ) && ( (_addr & 3) != 0) ) {
*((ushort_t*) _addr ) = (ushort_t) color;
_addr += 2;
}
while( _addr + 3 < addr + w ) {
*((uint_t*) _addr ) = temp1;
_addr += 4;
}
while( _addr < addr + (w * 2) ) {
*((ushort_t*) _addr ) = (ushort_t) color;
_addr += 2;
}
addr += m_width;
}
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
blue = (uchar_t) color;
green = (uchar_t) (color >> 8);
red = (uchar_t) (color >> 16);
temp1 = blue | (green << 8) | (red << 16) | (blue << 24);
temp2 = green | (red << 8) | (blue << 16) | (green << 24);
temp3 = red | (blue << 8) | (green << 16) | (red << 24);
for( int _y = y; _y < (y+h); _y++ )
{
_addr = addr;
while( ( _addr < addr + (w * 3) ) && ( (_addr % 12) != 0) ) {
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
_addr += 3;
}
while( _addr + 11 < addr + w ) {
*((uint_t*) _addr ) = temp1;
*((uint_t*) (_addr + 4) ) = temp2;
*((uint_t*) (_addr + 8) ) = temp3;
_addr += 12;
}
while( _addr < addr + (w * 3) ) {
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
_addr += 3;
}
addr += m_width;
}
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 4); _addr += 4 )
{
*((uint_t*) _addr ) = color;
}
addr += m_width;
}
break;
}
}
Lastly, if the width of the rectange is small it'll only need the first "while" loop, so we can do a special case to make that faster.
If the starting address of one horizontal line is aligned on a 4 byte boundary, then we could assume that the starting address for the next horizontal line will also be aligned. In this case, if the assumption is true, then we could do a special case to make it faster (by skipping the first "while" loop). We can make sure this assumption is true by only ever allowing buffers that are suitable sizes. For 8-bit pixels the horizontal resolution must be a multiple of 4, for 16-bit pixels the horizontal resolution must be a multiple of 2, and for 24-bit pixels the horizontal resolution must be a multiple of 4. That sounds acceptable to me...
The same thinking applies to the ending addresses - if the ending address of one horizontal line is aligned on a 4 byte boundary, then we could assume that the ending address for the next horizontal line will also be aligned, and we could skip the last "while" loop to make that faster.
This gives 5 special cases - small width, left and right edges aligned, left only aligned, right only aligned, and left and right edges unaligned:
Code: Select all
void VGA::Rectangle( uint_t x, uint_t y, uint_t w, uint_t h, uint_t color )
{
void *addr, _addr;
void *addr1, addr2, addr3, addr4;
uchar_t red, green, blue;
uint_t temp1, temp2, temp3;
addr = m_dbuff + ( m_width * y );
switch( m_bpp )
{
case 8: /** 8-bit **/
if(w < 7) {
// Small width
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + w; _addr += 1 )
{
*((uchar_t*) _addr ) = (uchar_t) color;
}
addr += m_width;
}
} else {
addr1 = addr; // Address of first byte at start of line
addr2 = (addr + 3) & ~3; // Address of first aligned dword at start of line
addr3 = (addr + w) & 3; // Address of first aligned dword after end of line
addr4 = addr + w; // Address of byte after end of line
temp1 = color & 0xFF;
temp1 |= (temp1 << 8) | (temp1 << 16) | (temp1 << 24);
if(addr1 == addr2) {
if(addr3 == addr4) {
// Both edges aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
addr2 += m_width;
addr3 += m_width;
}
} else {
// Left edge aligned, right edge not aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
for( _addr = addr3, _addr < addr4, _addr++) {
*((uchar_t*) _addr ) = (uchar_t) color;
}
addr2 += m_width;
addr3 += m_width;
addr4 += m_width;
}
}
} else {
if(addr3 == addr4) {
// Left edge not aligned, right edge aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr1, _addr < addr2, _addr++) {
*((uchar_t*) _addr ) = (uchar_t) color;
}
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
addr1 += m_width;
addr2 += m_width;
addr3 += m_width;
}
} else {
// Both edges not aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr1, _addr < addr2, _addr++) {
*((uchar_t*) _addr ) = (uchar_t) color;
}
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
for( _addr = addr3, _addr < addr4, _addr++) {
*((uchar_t*) _addr ) = (uchar_t) color;
}
addr1 += m_width;
addr2 += m_width;
addr3 += m_width;
addr4 += m_width;
}
}
}
}
break;
case 16: /** 16-bit **/
if(w < 3) {
// Small width
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 2); _addr += 2 )
{
*((ushort_t*) _addr ) = (ushort_t) color;
}
addr += m_width;
}
} else {
addr1 = addr; // Address of first byte at start of line
addr2 = (addr + 3) & ~3; // Address of first aligned dword at start of line
addr3 = (addr + w * 2) & 3; // Address of first aligned dword after end of line
addr4 = addr + w * 2; // Address of byte after end of line
temp1 = color & 0xFFFF;
temp1 |= temp1 << 16;
if(addr1 == addr2) {
if(addr3 == addr4) {
// Both edges aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
addr2 += m_width;
addr3 += m_width;
}
} else {
// Left edge aligned, right edge not aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
for( _addr = addr3, _addr < addr4, _addr += 2) {
*((ushort_t*) _addr ) = (ushort_t) color;
}
addr2 += m_width;
addr3 += m_width;
addr4 += m_width;
}
}
} else {
if(addr3 == addr4) {
// Left edge not aligned, right edge aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr1, _addr < addr2, _addr += 2) {
*((ushort_t*) _addr ) = (ushort_t) color;
}
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
addr1 += m_width;
addr2 += m_width;
addr3 += m_width;
}
} else {
// Both edges not aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr1, _addr < addr2, _addr += 2) {
*((ushort_t*) _addr ) = (ushort_t) color;
}
for( _addr = addr2, _addr < addr3, _addr += 4) {
*((uint_t*) _addr ) = temp1;
}
for( _addr = addr3, _addr < addr4, _addr += 2) {
*((ushort_t*) _addr ) = (ushort_t) color;
}
addr1 += m_width;
addr2 += m_width;
addr3 += m_width;
addr4 += m_width;
}
}
}
}
break;
case 24: /** 24-bit (32-bit with NO aplha) **/
blue = (uchar_t) color;
green = (uchar_t) (color >> 8);
red = (uchar_t) (color >> 16);
if(w < 11) {
// Small width
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 3); _addr += 3 )
{
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
}
addr += m_width;
}
} else {
addr1 = addr; // Address of first byte at start of line
addr2 = (addr + 3) & ~3; // Address of first aligned dword at start of line
addr3 = (addr + w * 3) & 3; // Address of first aligned dword after end of line
addr4 = addr + w * 3; // Address of byte after end of line
temp1 = blue | (green << 8) | (red << 16) | (blue << 24);
temp2 = green | (red << 8) | (blue << 16) | (green << 24);
temp3 = red | (blue << 8) | (green << 16) | (red << 24);
if(addr1 == addr2) {
if(addr3 == addr4) {
// Both edges aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr2, _addr < addr3, _addr += 12) {
*((uint_t*) _addr ) = temp1;
*((uint_t*) (_addr + 4) ) = temp2;
*((uint_t*) (_addr + 8) ) = temp3;
}
addr2 += m_width;
addr3 += m_width;
}
} else {
// Left edge aligned, right edge not aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr2, _addr < addr3, _addr += 12) {
*((uint_t*) _addr ) = temp1;
*((uint_t*) (_addr + 4) ) = temp2;
*((uint_t*) (_addr + 8) ) = temp3;
}
for( _addr = addr3, _addr < addr4, _addr += 3) {
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
}
addr2 += m_width;
addr3 += m_width;
addr4 += m_width;
}
}
} else {
if(addr3 == addr4) {
// Left edge not aligned, right edge aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr1, _addr < addr2, _addr += 3) {
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
}
for( _addr = addr2, _addr < addr3, _addr += 12) {
*((uint_t*) _addr ) = temp1;
*((uint_t*) (_addr + 4) ) = temp2;
*((uint_t*) (_addr + 8) ) = temp3;
}
addr1 += m_width;
addr2 += m_width;
addr3 += m_width;
}
} else {
// Both edges not aligned
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr1, _addr < addr2, _addr += 3) {
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
}
for( _addr = addr2, _addr < addr3, _addr += 12) {
*((uint_t*) _addr ) = temp1;
*((uint_t*) (_addr + 4) ) = temp2;
*((uint_t*) (_addr + 8) ) = temp3;
}
for( _addr = addr3, _addr < addr4, _addr += 3) {
*((uchar_t*) _addr ) = blue
*((uchar_t*) (_addr + 1) ) = green;
*((uchar_t*) (_addr + 2) ) = red;
}
addr1 += m_width;
addr2 += m_width;
addr3 += m_width;
addr4 += m_width;
}
}
}
}
break;
case 32: /** 32-bit (1 channel for ALPHA) **/
for( int _y = y; _y < (y+h); _y++ )
{
for( _addr = addr; _addr < addr + (w * 4); _addr += 4 )
{
*((uint_t*) _addr ) = color;
}
addr += m_width;
}
break;
}
}
AFAIK to get any more performance out of this code you'd need to use:
- a) non-temporal prefetches, or prefetches, or preloading (select the first one that's supported by the CPU).
b) SSE, or 64-bit code and MMX, or MMX (select the first one that's supported by the CPU, if any).
This means that for absolute maximum performance, you'd need to detect what the CPU supports and have 12 different "Rectangle()" functions.
It's still not finished though - you should protect yourself from bad input parameters. For e.g. at the start of the function add the following:
Code: Select all
if(x >= the_horizontal_resolution_of_the_buffer) return;
if(y >= the_vertical_resolution_of_the_buffer) return;
if(x + w >= the_horizontal_resolution_of_the_buffer) {
w = the_horizontal_resolution_of_the_buffer - x;
}
if(y + h >= the_vertical_resolution_of_the_buffer) {
h = the_vertical_resolution_of_the_buffer - x;
}
I'll leave the rest of the code for you to optimize. If you give all of your functions the same treatment as I did for "Rectangle()", then you'll probably be able to reduce the time it takes to redraw the screen on Bochs from 1 minute to a fraction of a second...
One last thing: none of the above is tested - I might be missing a few type casts, etc.
Cheers,
Brendan