Skip to content

Commit bd3cd35

Browse files
committed Jun 8, 2020
Update gb18030 encoding to :2005 edition
1 parent 5ceabd4 commit bd3cd35

File tree

5 files changed

+83
-33
lines changed

5 files changed

+83
-33
lines changed
 

‎encodings/dbcs-codec.js

+66-30
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,48 @@ function DBCSCodec(codecOptions, iconv) {
4949
for (var i = 0; i < mappingTable.length; i++)
5050
this._addDecodeChunk(mappingTable[i]);
5151

52+
// Load & create GB18030 tables when needed.
53+
if (typeof codecOptions.gb18030 === 'function') {
54+
this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.
55+
56+
// Add GB18030 common decode nodes.
57+
var commonThirdByteNodeIdx = this.decodeTables.length;
58+
this.decodeTables.push(UNASSIGNED_NODE.slice(0));
59+
60+
var commonFourthByteNodeIdx = this.decodeTables.length;
61+
this.decodeTables.push(UNASSIGNED_NODE.slice(0));
62+
63+
// Fill out the tree
64+
var firstByteNode = this.decodeTables[0];
65+
for (var i = 0x81; i <= 0xFE; i++) {
66+
var secondByteNode = this.decodeTables[NODE_START - firstByteNode[i]];
67+
for (var j = 0x30; j <= 0x39; j++) {
68+
if (secondByteNode[j] === UNASSIGNED) {
69+
secondByteNode[j] = NODE_START - commonThirdByteNodeIdx;
70+
} else if (secondByteNode[j] > NODE_START) {
71+
throw new Error("gb18030 decode tables conflict at byte 2");
72+
}
73+
74+
var thirdByteNode = this.decodeTables[NODE_START - secondByteNode[j]];
75+
for (var k = 0x81; k <= 0xFE; k++) {
76+
if (thirdByteNode[k] === UNASSIGNED) {
77+
thirdByteNode[k] = NODE_START - commonFourthByteNodeIdx;
78+
} else if (thirdByteNode[k] === NODE_START - commonFourthByteNodeIdx) {
79+
continue;
80+
} else if (thirdByteNode[k] > NODE_START) {
81+
throw new Error("gb18030 decode tables conflict at byte 3");
82+
}
83+
84+
var fourthByteNode = this.decodeTables[NODE_START - thirdByteNode[k]];
85+
for (var l = 0x30; l <= 0x39; l++) {
86+
if (fourthByteNode[l] === UNASSIGNED)
87+
fourthByteNode[l] = GB18030_CODE;
88+
}
89+
}
90+
}
91+
}
92+
}
93+
5294
this.defaultCharUnicode = iconv.defaultCharUnicode;
5395

5496

@@ -92,30 +134,6 @@ function DBCSCodec(codecOptions, iconv) {
92134
this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)];
93135
if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]['?'];
94136
if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0);
95-
96-
97-
// Load & create GB18030 tables when needed.
98-
if (typeof codecOptions.gb18030 === 'function') {
99-
this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges.
100-
101-
// Add GB18030 decode tables.
102-
var thirdByteNodeIdx = this.decodeTables.length;
103-
var thirdByteNode = this.decodeTables[thirdByteNodeIdx] = UNASSIGNED_NODE.slice(0);
104-
105-
var fourthByteNodeIdx = this.decodeTables.length;
106-
var fourthByteNode = this.decodeTables[fourthByteNodeIdx] = UNASSIGNED_NODE.slice(0);
107-
108-
for (var i = 0x81; i <= 0xFE; i++) {
109-
var secondByteNodeIdx = NODE_START - this.decodeTables[0][i];
110-
var secondByteNode = this.decodeTables[secondByteNodeIdx];
111-
for (var j = 0x30; j <= 0x39; j++)
112-
secondByteNode[j] = NODE_START - thirdByteNodeIdx;
113-
}
114-
for (var i = 0x81; i <= 0xFE; i++)
115-
thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
116-
for (var i = 0x30; i <= 0x39; i++)
117-
fourthByteNode[i] = GB18030_CODE
118-
}
119137
}
120138

121139
DBCSCodec.prototype.encoder = DBCSEncoder;
@@ -124,7 +142,7 @@ DBCSCodec.prototype.decoder = DBCSDecoder;
124142
// Decoder helpers
125143
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
126144
var bytes = [];
127-
for (; addr > 0; addr >>= 8)
145+
for (; addr > 0; addr >>>= 8)
128146
bytes.push(addr & 0xFF);
129147
if (bytes.length == 0)
130148
bytes.push(0);
@@ -249,19 +267,32 @@ DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {
249267

250268
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
251269
var node = this.decodeTables[nodeIdx];
270+
var hasValues = false;
271+
var subNodeEmpty = {};
252272
for (var i = 0; i < 0x100; i++) {
253273
var uCode = node[i];
254274
var mbCode = prefix + i;
255275
if (skipEncodeChars[mbCode])
256276
continue;
257277

258-
if (uCode >= 0)
278+
if (uCode >= 0) {
259279
this._setEncodeChar(uCode, mbCode);
260-
else if (uCode <= NODE_START)
261-
this._fillEncodeTable(NODE_START - uCode, mbCode << 8, skipEncodeChars);
262-
else if (uCode <= SEQ_START)
280+
hasValues = true;
281+
} else if (uCode <= NODE_START) {
282+
var subNodeIdx = NODE_START - uCode;
283+
if (!subNodeEmpty[subNodeIdx]) { // Skip empty subtrees (they are too large in gb18030).
284+
var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive.
285+
if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars))
286+
hasValues = true;
287+
else
288+
subNodeEmpty[subNodeIdx] = true;
289+
}
290+
} else if (uCode <= SEQ_START) {
263291
this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode);
292+
hasValues = true;
293+
}
264294
}
295+
return hasValues;
265296
}
266297

267298

@@ -388,10 +419,15 @@ DBCSEncoder.prototype.write = function(str) {
388419
newBuf[j++] = dbcsCode >> 8; // high byte
389420
newBuf[j++] = dbcsCode & 0xFF; // low byte
390421
}
391-
else {
422+
else if (dbcsCode < 0x1000000) {
392423
newBuf[j++] = dbcsCode >> 16;
393424
newBuf[j++] = (dbcsCode >> 8) & 0xFF;
394425
newBuf[j++] = dbcsCode & 0xFF;
426+
} else {
427+
newBuf[j++] = dbcsCode >>> 24;
428+
newBuf[j++] = (dbcsCode >>> 16) & 0xFF;
429+
newBuf[j++] = (dbcsCode >>> 8) & 0xFF;
430+
newBuf[j++] = dbcsCode & 0xFF;
395431
}
396432
}
397433

‎encodings/tables/gbk-added.json

+3-2
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
["a7c2","",14],
2828
["a7f2","",12],
2929
["a896","",10],
30-
["a8bc",""],
30+
["a8bc","ḿ"],
3131
["a8bf","ǹ"],
3232
["a8c1",""],
3333
["a8ea","",20],
@@ -51,5 +51,6 @@
5151
["fca1","",93],
5252
["fda1","",93],
5353
["fe50","⺁⺄㑳㑇⺈⺋㖞㘚㘎⺌⺗㥮㤘㧏㧟㩳㧐㭎㱮㳠⺧⺪䁖䅟⺮䌷⺳⺶⺷䎱䎬⺻䏝䓖䙡䙌"],
54-
["fe80","䜣䜩䝼䞍⻊䥇䥺䥽䦂䦃䦅䦆䦟䦛䦷䦶䲣䲟䲠䲡䱷䲢䴓",6,"䶮",93]
54+
["fe80","䜣䜩䝼䞍⻊䥇䥺䥽䦂䦃䦅䦆䦟䦛䦷䦶䲣䲟䲠䲡䱷䲢䴓",6,"䶮",93],
55+
["8135f437",""]
5556
]

‎generation/gen-dbcs.js

+4-1
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,10 @@ async.parallel({
6868
gbkadd[i] = gbChar;
6969
}
7070

71-
utils.writeTable("gbk-added", utils.generateTable(gbkadd));
71+
// GB18030:2005 addition
72+
// Single mapping introduced by GB18030:2005: the four-byte sequence 81 35 F4 37
// maps to the private-use code point U+E7C7. It is written as an escape so the
// value stays visible in editors, and declared with `var` so that no implicit
// global is created (which would throw a ReferenceError in strict mode).
var gbk2005add = [['8135f437', '\uE7C7']];
73+
74+
utils.writeTable("gbk-added", utils.generateTable(gbkadd).concat(gbk2005add));
7275

7376
// Write GB18030 ranges
7477
var ranges = { uChars: [], gbChars: [] };

‎test/dbcs-test.js

+3
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,9 @@ var iconvCannotDecode = { // Characters that we can decode, but iconv cannot. En
137137
// Encoding Standard stands on the side of WebKit, so we are too.
138138
// See discussion in https://www.w3.org/Bugs/Public/show_bug.cgi?id=25396 and http://goo.gl/ocjnDR
139139
"a3a0": "\u3000",
140+
141+
// Align with GB18030-2005 standard (see https://github.com/whatwg/encoding/issues/22)
142+
"a8bc": "ḿ",
140143
},
141144
gb18030: {
142145
"80": "€", "a3a0": "\u3000",

‎test/gbk-test.js

+7
Original file line number | Diff line number | Diff line change
@@ -110,4 +110,11 @@ describe("GBK tests", function() {
110110
}
111111
});
112112

113+
it("GB18030:2005 changes are applied", function() {
114+
// See https://github.com/whatwg/encoding/issues/22
115+
var chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator
116+
var gbkChars = Buffer.from([0xA8, 0xBC, 0x00, 0x81, 0x35, 0xF4, 0x37]);
117+
assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars);
118+
assert.strictEqual(iconv.encode(chars, "GB18030").toString('hex'), gbkChars.toString('hex'));
119+
});
113120
});

0 commit comments

Comments
 (0)
Please sign in to comment.