Explain how Base64 works
Example: ‘鹅’ (“goose”) –> Base64 encoded –> 6bmF
So how is it encoded?
Take the character ‘鹅’ (“goose”) as the example.
Base64 encoding table:
Rules for converting a string to Base64: First, take the bytes three at a time, giving a group of 24 binary bits. Second, split the 24 bits into four groups of six bits each. Third, prefix each six-bit group with two 0 bits, expanding it to a full byte (32 bits, i.e. four bytes, in total). Fourth, look up the value of each expanded byte (0–63) in the table above to get its symbol; the four symbols are the Base64 encoding of the three input bytes.
Handling the special case (fewer than three bytes left over)
- Two bytes left: the 16 bits are split into three groups by the rules above. The last group holds only four bits, so besides the two leading zeros it also gets two zeros appended at the end. This yields three Base64 characters, followed by one “=” sign as padding.
- One byte left: the 8 bits are split into two groups by the rules above. The last group holds only two bits, so it gets two zeros in front and four zeros appended at the end. This yields two Base64 characters, followed by two “=” signs as padding.
Use node’s buffer for processing:
demo1.js
// demo1.js — Base64 done literally "by the book": bytes -> bit string -> 6-bit groups.

/** The 64-symbol Base64 alphabet; a symbol's position in the string is its 6-bit value. */
const CHARTS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';

/**
 * Base64-encode a string via its UTF-8 bytes.
 *
 * @param {string} str - Text to encode (Buffer.from turns it into UTF-8 bytes).
 * @returns {string} Padded Base64 text; '' for empty input.
 */
function encode(str) {
  const buf = Buffer.from(str);
  let result = '';
  let suffix = '';
  // Spell every byte out as exactly 8 binary digits.
  for (const b of buf) {
    result += b.toString(2).padStart(8, '0');
  }
  const leftover = result.length % 6;
  if (leftover === 4) {
    // Two trailing bytes (16 bits): pad with 2 zero bits -> 3 symbols + '='.
    suffix = '=';
    result += '00';
  } else if (leftover === 2) {
    // One trailing byte (8 bits): pad with 4 zero bits -> 2 symbols + '=='.
    suffix = '==';
    result += '0000';
  }
  // match() returns null for an empty bit string, hence the fallback.
  const groups = result.match(/\d{6}/g) || [];
  return groups.map((bits) => CHARTS[Number.parseInt(bits, 2)]).join('') + suffix;
}

/**
 * Decode Base64 text produced by encode() back into a string.
 *
 * @param {string} str - Base64 text; reading stops at the first '='.
 * @returns {string} The decoded bytes interpreted as UTF-8.
 */
function decode(str) {
  let result = '';
  for (const b of str) {
    if (b === '=') break;
    const value = CHARTS.indexOf(b);
    // Skip characters outside the alphabet (e.g. line breaks); otherwise
    // indexOf's -1 would put a '-' into the bit string.
    if (value === -1) continue;
    result += value.toString(2).padStart(6, '0');
  }
  // Regroup into whole bytes; leftover padding bits (< 8) are dropped by the regex.
  const list = (result.match(/\d{8}/g) || []).map((bits) => Number.parseInt(bits, 2));
  return Buffer.from(list).toString('utf8');
}

const ss = encode('jukun m');
console.log(ss); // anVrdW4gbQ==  (for comparison, the multi-byte text '朱昆m' gives 5pyx5piGbQ==)
console.log(decode(ss)); // jukun m
Copy the code
The code above illustrates the encoding and decoding process. But if you read the source of a third-party library, you may find it hard to follow.
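Why is it hard to follow?
The reasons:
- To be compatible with the browser, whose strings hold Unicode code units rather than UTF-8 bytes, the text has to be converted from Unicode to UTF-8 before encoding and back again after decoding.
- Bitwise operations are used instead of strings of binary digits, which is more efficient.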
For example:
‘鹅’ (goose) -> Unicode code point U+9E45 (40517 decimal) -> 0b1001 1110 0100 0101 (binary); ‘鹅’ (goose) -> UTF-8 encoding 0xE9 0xB9 0x85 (15317381 decimal) -> 0b1110 1001 1011 1001 1000 0101 (binary)
Converting Unicode to UTF-8 (for a 16-bit code point that needs three UTF-8 bytes, such as the one above):
- Prefix the high four bits of the high byte with 1110 to form the first new byte.
- Prefix 10 to the low four bits of the high byte followed by the high two bits of the low byte to form the second new byte.
- Prefix the low six bits of the low byte with 10 to form the third new byte.
- The three new bytes, in that order, form the new encoding: the UTF-8 encoding.
UTF-8 to Unicode: apply the steps above in reverse (strip the 1110/10 prefixes and concatenate the remaining bits) to recover the original Unicode code point.
// Base64 alphabet with '=' appended at index 64: encode() emits index 64 as
// padding and decode() checks for index 64 to skip padded positions.
const _keyStr = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
// Round-trip demo; encode/decode are function declarations, so they are
// hoisted and callable before their definitions below.
console.log(decode(encode('goose')));
/**
 * Base64-encode a JavaScript string using bitwise operations.
 *
 * The text is first converted to UTF-8 by _utf8_encode (one char code per
 * byte), then every 3 bytes are regrouped into four 6-bit indexes into _keyStr.
 *
 * @param {string} input - Text to encode.
 * @returns {string} Base64 text, padded with '=' (index 64 of _keyStr).
 */
function encode(input) {
  // Change Unicode to UTF-8
  const bytes = _utf8_encode(input);
  let output = "";
  let i = 0;
  // Convert UTF-8 to Base64, three bytes (24 bits) per iteration
  while (i < bytes.length) {
    const chr1 = bytes.charCodeAt(i++);
    const chr2 = bytes.charCodeAt(i++); // NaN when past the end of the data
    const chr3 = bytes.charCodeAt(i++); // NaN when past the end of the data
    // NaN behaves as 0 under the bitwise operators, which supplies the zero
    // padding bits for a short final group.
    const enc1 = chr1 >> 2;
    const enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
    let enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
    let enc4 = chr3 & 63;
    if (Number.isNaN(chr2)) {
      // Only one byte in this group: two '=' signs.
      enc3 = 64;
      enc4 = 64;
    } else if (Number.isNaN(chr3)) {
      // Only two bytes in this group: one '=' sign.
      enc4 = 64;
    }
    output +=
      _keyStr.charAt(enc1) + _keyStr.charAt(enc2) +
      _keyStr.charAt(enc3) + _keyStr.charAt(enc4);
  }
  return output;
}
/**
 * Decode Base64 text back into a JavaScript string using bitwise operations.
 *
 * Every four symbols are looked up in _keyStr and regrouped into up to three
 * bytes; the resulting UTF-8 byte string is then converted by _utf8_decode.
 *
 * @param {string} input - Base64 text; characters outside the alphabet are
 *   ignored and missing trailing '=' padding is tolerated.
 * @returns {string} The decoded text.
 */
function decode(input) {
  let cleaned = input.replace(/[^A-Za-z0-9\+\/\=]/g, "");
  // Restore missing padding. Without it charAt() returns '' past the end and
  // indexOf('') is 0, which would fabricate NUL bytes.
  cleaned = cleaned.padEnd(Math.ceil(cleaned.length / 4) * 4, "=");
  let output = "";
  let i = 0;
  // Convert Base64 to UTF-8, four symbols (24 bits) per iteration
  while (i < cleaned.length) {
    const enc1 = _keyStr.indexOf(cleaned.charAt(i++));
    const enc2 = _keyStr.indexOf(cleaned.charAt(i++));
    const enc3 = _keyStr.indexOf(cleaned.charAt(i++));
    const enc4 = _keyStr.indexOf(cleaned.charAt(i++));
    // Fewer than two data symbols carry less than 8 bits: no byte to emit.
    if (enc1 === 64 || enc2 === 64) {
      continue;
    }
    const chr1 = (enc1 << 2) | (enc2 >> 4);
    const chr2 = ((enc2 & 15) << 4) | (enc3 >> 2);
    const chr3 = ((enc3 & 3) << 6) | enc4;
    output += String.fromCharCode(chr1);
    if (enc3 !== 64) {
      output += String.fromCharCode(chr2);
    }
    if (enc4 !== 64) {
      output += String.fromCharCode(chr3);
    }
  }
  // Convert UTF-8 to browser-recognized Unicode
  return _utf8_decode(output);
}
/**
 * Convert a JavaScript string to a UTF-8 "byte string": a string in which
 * every char code is one UTF-8 byte (0-255).
 *
 * Iterates by code point, so characters outside the Basic Multilingual Plane
 * (stored as surrogate pairs) become one 4-byte sequence instead of two
 * 3-byte surrogate encodings.
 *
 * @param {string} string - Text to convert. "\r\n" is normalised to "\n"
 *   first, so CRLF line endings are not preserved.
 * @returns {string} The UTF-8 bytes, one per character.
 */
function _utf8_encode(string) { // Change Unicode to UTF-8
  const text = string.replace(/\r\n/g, "\n");
  let utftext = "";
  for (const ch of text) {
    const c = ch.codePointAt(0);
    if (c < 0x80) {
      // 1 byte: 0xxxxxxx
      utftext += String.fromCharCode(c);
    } else if (c < 0x800) {
      // 2 bytes: 110xxxxx 10xxxxxx
      utftext += String.fromCharCode(
        (c >> 6) | 0b11000000,
        (c & 0b111111) | 0b10000000
      );
    } else if (c < 0x10000) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      utftext += String.fromCharCode(
        (c >> 12) | 0b11100000,
        ((c >> 6) & 0b111111) | 0b10000000,
        (c & 0b111111) | 0b10000000
      );
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      utftext += String.fromCharCode(
        (c >> 18) | 0b11110000,
        ((c >> 12) & 0b111111) | 0b10000000,
        ((c >> 6) & 0b111111) | 0b10000000,
        (c & 0b111111) | 0b10000000
      );
    }
  }
  return utftext;
}
/**
 * Convert a UTF-8 "byte string" (one byte per char code) back into a normal
 * JavaScript string.
 *
 * @param {string} utftext - UTF-8 bytes, one per character.
 * @returns {string} The decoded text. A 4-byte sequence whose value exceeds
 *   U+10FFFF is replaced by U+FFFD instead of throwing.
 */
function _utf8_decode(utftext) { // Convert UTF-8 to Unicode
  let string = "";
  let i = 0;
  while (i < utftext.length) {
    const c = utftext.charCodeAt(i); // The first (lead) byte
    if (c < 128) {
      // 1 byte: 0xxxxxxx
      string += String.fromCharCode(c);
      i++;
    } else if ((c > 191) && (c < 224)) {
      // 2 bytes: 110xxxxx 10xxxxxx
      const c2 = utftext.charCodeAt(i + 1); // The second byte
      string += String.fromCharCode(((c & 31) << 6) | (c2 & 63));
      i += 2;
    } else if (c < 240) {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      const c2 = utftext.charCodeAt(i + 1); // The second byte
      const c3 = utftext.charCodeAt(i + 2); // The third byte
      string += String.fromCharCode(((c & 0b1111) << 12) | ((c2 & 0b111111) << 6) | (c3 & 0b111111));
      i += 3;
    } else {
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      const c2 = utftext.charCodeAt(i + 1); // The second byte
      const c3 = utftext.charCodeAt(i + 2); // The third byte
      const c4 = utftext.charCodeAt(i + 3); // The fourth byte
      const codePoint =
        ((c & 0b111) << 18) | ((c2 & 0b111111) << 12) | ((c3 & 0b111111) << 6) | (c4 & 0b111111);
      // String.fromCodePoint throws a RangeError above U+10FFFF.
      string += codePoint > 0x10FFFF ? "\uFFFD" : String.fromCodePoint(codePoint);
      i += 4;
    }
  }
  return string;
}
Copy the code
Important: to follow the code above you need to understand the bitwise shift operations and the rules for converting between Unicode and UTF-8. Once you do, it is not that hard to understand.
Explanation:
- The Unicode character set is the collection of all characters and symbols; it corresponds to the concept of language as a whole.
- UTF-8 is one encoding format for that set; it corresponds to a particular language such as Chinese or English.
- A goose is called ‘鹅’ in Chinese and ‘goose’ in English. In computing terms, Chinese and English play the role of encodings, like UTF-8 or GBK.
Reference documents:
Base64 principle
Programmers must: thoroughly understand the common 7 Chinese character encoding
What is the difference between Unicode and UTF-8
Base64 notes