Packing binary data and compressing at the bit level, etc...
Posted: Sat Jan 30, 2016 10:14 am
>> You can see the sample code for this topic running here <<
I have been doing some effort to use binary data for low level tasks in the HTML5/JavaScript environment for things like attempting to implement a CPU emulator to run nice and compact code in the web browser.
While trying to use a lot of binary data and code I have seen that it would be better to store such data compressed and then uncompress it from JavaScript, and then make the application save a copy of itself with the state of its data to save any changes of our work.
I have seen that I can use Base64 with the raw binary ASCII character values 0-63 ( !"#$%&'()*+,-./0123456789:;<=>?) so I can effectively take strictly 6 bits, unlike using the regulas Base64 alphabet (ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=).
The trick is that later, if there are 8-bit characters in the original data intermixed between the regular 7-bit ASCII characters (e.g., for UTF-8 Unicode characters and texts), we can force all bytes to contain only 6 used bits after encoding in binary-mode Base64, and then we can use the 8 bits by packing 6 bits of the current byte and 2 bits of the next byte and so on (packing 6 bits into 8 bits).
In this way we force all bytes to contain the same number of bits temporarily and then safely take back the space used by 7 and 8-bit characters by packing.
Here is the code of Base64 functions to encode and decode packed into an extremely simple JavaScript class:
I have been doing some effort to use binary data for low level tasks in the HTML5/JavaScript environment for things like attempting to implement a CPU emulator to run nice and compact code in the web browser.
While trying to use a lot of binary data and code I have seen that it would be better to store such data compressed and then uncompress it from JavaScript, and then make the application save a copy of itself with the state of its data to save any changes of our work.
I have seen that I can use Base64 with the raw binary ASCII character values 0-63 ( !"#$%&'()*+,-./0123456789:;<=>?) so I can effectively take strictly 6 bits, unlike using the regulas Base64 alphabet (ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=).
The trick is that later, if there are 8-bit characters in the original data intermixed between the regular 7-bit ASCII characters (e.g., for UTF-8 Unicode characters and texts), we can force all bytes to contain only 6 used bits after encoding in binary-mode Base64, and then we can use the 8 bits by packing 6 bits of the current byte and 2 bits of the next byte and so on (packing 6 bits into 8 bits).
In this way we force all bytes to contain the same number of bits temporarily and then safely take back the space used by 7 and 8-bit characters by packing.
Here is the code of Base64 functions to encode and decode packed into an extremely simple JavaScript class:
Code: Select all
<title>Binary Base64 Encoding/Decoding (with 6-bit ASCII values 0-63)</title>
<body bgcolor="#bcbcbc" style="font-size:19px"><script>
function Base64(alphabet_to_use=0)
{
this.nalpha=alphabet_to_use;
this.alphabet64=[
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=", //0
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_=" //1
];
//Generate a purely binary string (used to effectively pack 6 bits into 8-bit bytes
//and normalize the 8 bits):
///
this.alphabet64[2]="";
for(var x=0; x<64; x++)
{
this.alphabet64[2]+=String.fromCharCode(x);
}
this.skip64=[
"= \t\r\n", //0
"= \t\r\n", //1
"" //2
];
//Converts a raw "binary" data string into Base64 data:
///
this.btoa_DataView=function(binary_data)
{
var Base64_TBL=this.alphabet64[this.nalpha].split("");
var pad64=this.alphabet64[this.nalpha][64];
if(pad64==undefined)pad64="";
//This will contain our resulting converted data
///
var base64="";
//Check that the data is a string and that its length is not 0:
///
if(!(binary_data.byteLength>0))return "";
//Temporary 32-bit DWORD:
///
var tmp="";
//4-byte temporary Base64 chunk for each 3 bytes of data, and/or plus padding if necessary:
///
var tm2="";
//Number of '=' padding characters, to avoid generating Base64 charactes that should be padding to begin with:
//Number of '=' characters if there is no further data in size divisible exactly by 3:
///
var padcount="";
//This loop advances in groups of 3 because 3 "binary" letters
//produce 4 Base64 letters:
///
for(var x=0;x<binary_data.byteLength;x+=3)
{
//INIT: Read a DWORD safely byte by byte, with "memory boundary checking"
//INIT: Read a DWORD safely byte by byte, with "memory boundary checking"
//INIT: Read a DWORD safely byte by byte, with "memory boundary checking"
//INIT: Read a DWORD safely byte by byte, with "memory boundary checking"
///
tmp=binary_data.getUint8(x)<<24; //bits 31-24
if(x+1<binary_data.byteLength)
tmp|=binary_data.getUint8(x+1)<<16; //bits 23-16
else padcount++; //If there's no data, increase padding and this bit group is 0 automatically
if(x+2<binary_data.byteLength)
tmp|=binary_data.getUint8(x+2)<<8; //bits 15-8
else padcount++; //If there's no data, increase padding and this bit group is 0 automatically
//bits 7-0 are 0 always
//END: Read a DWORD safely byte by byte, with "memory boundary checking"
//END: Read a DWORD safely byte by byte, with "memory boundary checking"
//END: Read a DWORD safely byte by byte, with "memory boundary checking"
//END: Read a DWORD safely byte by byte, with "memory boundary checking"
//Shift 8 bits left to re-align all bit values in an order
//proper for the 6 bits of Base64.
//
//This will NOT discard the first 2 bits of the DWORD, but anyway
//the bits of the next byte of data, if present (byte 4),
//belong to the next group of 3 bytes and are useless for the
//current 32-bit run:
///
tmp>>=8; //tmp is a 32-bit DWORD here
//"Flush" past, now useless data, before using the buffer again (might not be necessary
//in C or assembly since the data in those languages will always be overwritten either by
//data or padding; but it is required in JavaScript because the string cannot be handled
//so freely in such language).
///
tm2="";
var sshl=6; //I thought that this bit shifting was going to be dynamic, but after the adjusting above
//and the bit masking below inside the loop, it isn't necessary at all.
for(var y=0;y<4;y++)
{
//Get bits 31-24, then 23-16 and 15-8 and use them as index for the third, second
//and first Base64 characters, respectively:
///
//Save the corresponding Base64 character, backwards, or if we are in a range in which
//we previously detected that there was no data available for bytes 2 of 3 and/or 3 of 3,
//just add padding.
//
//In other words, if the count of required padding characters is 0 (3 original bytes were
//present for this loop run), or if the count of padding characters is not 0 and y is
//in a range above/outside that of the padding characters to generate, then save a Base64
//indexed character.
//
//Otherwise, we are in a byte range for padding, and we must generate and save a padding character
//(it could and should only happen at the very end of the whole data buffer):
///
if(padcount==0 || (padcount!=0 && y>padcount-1))
{
tm2=Base64_TBL[tmp&63]+tm2;
}
else tm2=tm2+pad64;
//Keep shifting bits. We have saved backwards because in this way we
//reduce the amount of bit shifting and bit masking required to get
//the 6 bits required for each Base64 character, and still we can get
//each Base64 character as soon as possible, as soon as its offset
//is available to us.
///
tmp>>=6;
}
//Save this chunk of Base64 characters:
///
base64+=tm2;
}
return base64;
};
this.StrPad=function(str,padchar,paddedstrlen,direction)
{
//If no direction was specified, the default action is
//to pad leftmost:
///
if(!direction)direction="l";
//Don't allow empty padding character variable or a bad final padded length
//because it would cause an infinite loop:
///
if(typeof(paddedstrlen)!="number" || typeof(padchar)!="string")return str;
if(!(padchar.length>0) || paddedstrlen<=0)return str;
if(direction.toLowerCase()=="r")
{
while(str.length<paddedstrlen)
{
str=str+padchar.charAt(0);
}
}
else
{
while(str.length<paddedstrlen)
{
str=padchar.charAt(0)+str;
}
}
return str;
};
//Convert a raw "binary" data in Base64 data:
///
this.btoa=function(binary_data)
{
var pad64=this.alphabet64[this.nalpha][64];
if(pad64==undefined)pad64="";
binary_data=unescape(encodeURI(binary_data));
const Base64_TBL=this.alphabet64[this.nalpha];
var base64="";
if(typeof(binary_data)!="string")return "";
if(!(binary_data.length>0))return "";
var tmp="";
var tm2="";
for(var x=0;x<binary_data.length;x+=3)
{
tmp=this.str2bin(binary_data.substring(x,x+3));
for(var y=0;y<tmp.length;y+=6)
{
if(tmp.substring(y,y+6).length<6)
{
tm2=this.StrPad(str=tmp.substring(y,y+6),padchar="0",paddedstrlen=6,direction="r");
base64+=Base64_TBL[parseInt(tm2,2)];
}
else
{
base64+=Base64_TBL[parseInt(tmp.substring(y,y+6),2)];
}
if(tmp.length==8)
{
if(y==6)
{
base64+=pad64+pad64;
}
}
else if(tmp.length==16)
{
if(y==12)
{
base64+=pad64;
}
}
}
}
return base64;
};
this.atob=function(base64_data)
{
var binary64="";
var binary="";
var pad64=this.alphabet64[this.nalpha][64];
if(pad64==undefined)pad64="";
if(typeof(base64_data)!="string")return "";
if(!(base64_data.length>0))return "";
var tmp="";
var c="";
for(var x=0;x<base64_data.length;x++)
{
c=base64_data.charAt(x);
if(this.skip64[this.nalpha].indexOf(c)>=0)continue;
else c=this.alphabet64[this.nalpha].indexOf(c);
binary64+=this.StrPad(str=c.toString(2),padchar="0",paddedstrlen=6,direction="l");
if(binary64.length>=8)
{
binary+=String.fromCharCode(parseInt(binary64.substring(0,8),2));
binary64=binary64.substring(8,binary64.length);
}
}
return binary;
};
//This function takes any string, including special characters, treats them
//as a binary 8-bit data string and returns a string with the binary representation
//of those bits.
///
this.str2bin=function(ASCII_str)
{
if(typeof(ASCII_str)!="string")return "";
var rret="";
for(var x=0;x<ASCII_str.length;x++)
{
rret+=this.StrPad(str=ASCII_str.charCodeAt(x).toString(2),
padchar="0",
paddedstrlen=8,
direction="l"
);
}
return rret;
};
};
</script>
<script>
//Use the binary Base64 alphabet at index 2 of our class so that we can use 6-bit characters:
///
var base64=new Base64(2);
//This is the string to encode/decode:
///
var s=" This article's lead section may not adequately summarize key points of its contents. Please consider expanding the lead to provide an accessible overview of all important aspects of the article. Please discuss this issue on the article's talk page. (February 2013)";
//Then let's use base64.btoa(original_data) and base64.atob(base64_code) to encode and decode, respectively:
///
document.write(
"<pre><b>Binary Base64 alphabet:</b><br />"+base64.alphabet64[base64.nalpha]+"<br />-------<br /><br /><br /><br />"+
"<b>Coded with binary-mode Base64:</b><br />"+base64.btoa(s)+"<br /><br /><b>Coded with regular Base64:</b><br />"+btoa(s)+"<br /><br /><br />"+
"<b>Decoded string from binary Base64:</b><br />"+base64.atob(base64.btoa(s))+"<br /><b>Decoded string from text Base64:</b><br />"+atob(btoa(s))+"<br /><br />"+
"<b>Are decoded strings identical?</b> "+(base64.atob(base64.btoa(s))==atob(btoa(s)))
);
</script>