Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit bd73d9a

Browse files
committed
Support 4-byte UTF-8 encoding, fixing emoji-related bugs
1 parent c45b6f9 commit bd73d9a

File tree

1 file changed

+36
-6
lines changed

1 file changed

+36
-6
lines changed

sources/net.sf.j2s.java.core/src/java/lang/Encoding.js

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,27 @@ Encoding.readUTF8 = function (str) {
3333
var c2 = str.charCodeAt(i) & 0x3f;
3434
var c = (c1 << 6) + c2;
3535
arrs[arrs.length] = String.fromCharCode(c);
36-
} else if (charCode >= 0xe0) {
36+
} else if (charCode >= 0xe0 && charCode < 0xf0) {
3737
var c1 = charCode & 0x0f;
3838
i++;
3939
var c2 = str.charCodeAt(i) & 0x3f;
4040
i++;
4141
var c3 = str.charCodeAt(i) & 0x3f;
4242
var c = (c1 << 12) + (c2 << 6) + c3;
4343
arrs[arrs.length] = String.fromCharCode(c);
44+
} else if (charCode >= 0xf0) {
45+
var c1 = charCode & 0x07;
46+
i++;
47+
var c2 = str.charCodeAt(i);
48+
i++;
49+
var c3 = str.charCodeAt(i);
50+
i++;
51+
var c4 = str.charCodeAt(i);
52+
//var c = ((c1 & 0x06) << 18) + ((c2 & 0x3f)<< 12) + ((c3 & 0x3f) << 6) + (c4 & 0x3f) - 0x10000;
53+
var highCode = (c1 & 0x07 << 8) + ((c2 & 0x3f) << 2) + ((c3 & 0x30) >> 4) + 0xd800 - 0x0040;
54+
var lowCode = ((c3 & 0x0f) << 6) + (c4 & 0x3f) + 0xdc00;
55+
arrs[arrs.length] = String.fromCharCode(highCode);
56+
arrs[arrs.length] = String.fromCharCode(lowCode);
4457
}
4558
}
4659
return arrs.join ('');
@@ -66,16 +79,33 @@ Encoding.convert2UTF8 = function (str) {
6679
var charCode = str.charCodeAt(i);
6780
if (charCode < 0x80) {
6881
arrs[offset + i - startIdx] = str.charAt(i);
82+
continue;
6983
} else if (charCode <= 0x07ff) { //(charCode > 0xc0 && charCode < 0xe0) {
7084
var c1 = 0xc0 + ((charCode & 0x07c0) >> 6);
7185
var c2 = 0x80 + (charCode & 0x003f);
7286
arrs[offset + i - startIdx] = String.fromCharCode(c1) + String.fromCharCode(c2);
73-
} else {
74-
var c1 = 0xe0 + ((charCode & 0xf000) >> 12);
75-
var c2 = 0x80 + ((charCode & 0x0fc0) >> 6);
76-
var c3 = 0x80 + (charCode & 0x003f);
77-
arrs[offset + i - startIdx] = String.fromCharCode(c1) + String.fromCharCode(c2) + String.fromCharCode(c3);
87+
continue;
88+
} else if (charCode >= 0xd800 && charCode <= 0xdbff) { // high-surrogate code point
89+
if (i < str.length - 1) {
90+
var lowCode = str.charCodeAt(i+1);
91+
if (lowCode >= 0xdc00 && lowCode <= 0xdfff) { // low-surrogate code point
92+
i++;
93+
// charCode = ((charCode & 0x03ff) << 10) + (lowCode & 0x03ff) + 0x10000;
94+
// utf8mb4: 0x10000 <= charCode <= 0x1fffff
95+
var highCode = charCode + 0x0040; // + 0x10000
96+
var c1 = 0xf0 + ((highCode & 0x0700) >> 16);
97+
var c2 = 0x80 + ((highCode & 0x00fc) >> 2);
98+
var c3 = 0x80 + ((highCode & 0x0003) << 4) + ((lowCode & 0x03c0) >> 6);
99+
var c4 = 0x80 + (lowCode & 0x003f);
100+
arrs[offset + i - startIdx] = String.fromCharCode(c1) + String.fromCharCode(c2) + String.fromCharCode(c3) + String.fromCharCode(c4);
101+
continue;
102+
}
103+
}
78104
}
105+
var c1 = 0xe0 + ((charCode & 0xf000) >> 12);
106+
var c2 = 0x80 + ((charCode & 0x0fc0) >> 6);
107+
var c3 = 0x80 + (charCode & 0x003f);
108+
arrs[offset + i - startIdx] = String.fromCharCode(c1) + String.fromCharCode(c2) + String.fromCharCode(c3);
79109
}
80110
return arrs.join ('');
81111
};

0 commit comments

Comments
 (0)