@@ -100,65 +100,86 @@ Utf32Encoder.prototype.end = function() {
100
100
/**
 * Streaming UTF-32 decoder for a fixed byte order.
 *
 * @param {Object} options - Not read by this decoder.
 * @param {Object} codec - Supplies `isLE` (byte order) and `iconv.defaultCharUnicode`
 *   (the first UTF-16 code unit of which replaces invalid code points).
 */
function Utf32Decoder(options, codec) {
    this.isLE = codec.isLE;
    this.badChar = codec.iconv.defaultCharUnicode.charCodeAt(0);

    // Bytes of an incomplete 4-byte unit carried over between write() calls (0..3 entries).
    this.overflow = [];
}

/**
 * Decodes the next chunk of UTF-32 bytes.
 *
 * Only indexed access and `.length` are used on `src`, so any byte-indexable
 * input (Buffer, Uint8Array) works without copying.
 *
 * @param {Buffer|Uint8Array} src - Next chunk of encoded bytes.
 * @returns {string} Text for every complete 4-byte unit available so far.
 */
Utf32Decoder.prototype.write = function(src) {
    if (src.length === 0)
        return '';

    var i = 0;
    var codepoint = 0;
    // Each 4-byte unit yields at most 4 output bytes (a surrogate pair); the extra
    // 4 bytes leave room for the unit completed from the carried-over bytes.
    var dst = Buffer.alloc(src.length + 4);
    var offset = 0;
    var isLE = this.isLE;
    var overflow = this.overflow;
    var badChar = this.badChar;

    if (overflow.length > 0) {
        // Top up the carried-over bytes from the start of this chunk.
        for (; i < src.length && overflow.length < 4; i++)
            overflow.push(src[i]);

        if (overflow.length === 4) {
            // NOTE: codepoint is a signed int32 and can be negative.
            // NOTE: This duplicates the main-loop expression on purpose so the hot loop only
            //       ever sees `src`; here we read from the plain `overflow` array.
            // The unit lives at overflow[0..3]. `i` is a position in `src` and must not be
            // used to index `overflow`.
            if (isLE) {
                codepoint = overflow[0] | (overflow[1] << 8) | (overflow[2] << 16) | (overflow[3] << 24);
            } else {
                codepoint = overflow[3] | (overflow[2] << 8) | (overflow[1] << 16) | (overflow[0] << 24);
            }
            overflow.length = 0;

            offset = _writeCodepoint(dst, offset, codepoint, badChar);
        }
    }

    // Main loop. Should be as optimized as possible.
    for (; i < src.length - 3; i += 4) {
        // NOTE: codepoint is a signed int32 and can be negative.
        if (isLE) {
            codepoint = src[i] | (src[i+1] << 8) | (src[i+2] << 16) | (src[i+3] << 24);
        } else {
            codepoint = src[i+3] | (src[i+2] << 8) | (src[i+1] << 16) | (src[i] << 24);
        }
        offset = _writeCodepoint(dst, offset, codepoint, badChar);
    }

    // Keep the trailing bytes of an incomplete unit for the next call.
    for (; i < src.length; i++) {
        overflow.push(src[i]);
    }

    return dst.toString('ucs2', 0, offset);
};

/**
 * Appends one code point to `dst` as UTF-16LE code units.
 *
 * @param {Buffer} dst - Output buffer.
 * @param {number} offset - Byte position in `dst` to write at.
 * @param {number} codepoint - Signed int32 value read from the input.
 * @param {number} badChar - Replacement used when `codepoint` is outside 0..0x10FFFF.
 * @returns {number} Byte position just past the written code unit(s).
 */
function _writeCodepoint(dst, offset, codepoint, badChar) {
    // NOTE: codepoint is signed int32 and can be negative. We keep it that way to help V8 with optimizations.
    if (codepoint < 0 || codepoint > 0x10FFFF) {
        // Not a valid Unicode codepoint
        codepoint = badChar;
    }

    // Supplementary planes: write the high surrogate here.
    if (codepoint >= 0x10000) {
        codepoint -= 0x10000;

        var high = 0xD800 | (codepoint >> 10);
        dst[offset++] = high & 0xff;
        dst[offset++] = high >> 8;

        // Low surrogate is written below.
        codepoint = 0xDC00 | (codepoint & 0x3FF);
    }

    // Write BMP char or low surrogate.
    dst[offset++] = codepoint & 0xff;
    dst[offset++] = codepoint >> 8;

    return offset;
}

/**
 * Finishes the stream. Bytes of a trailing incomplete unit are discarded and
 * nothing is returned.
 */
Utf32Decoder.prototype.end = function() {
    this.overflow.length = 0;
};
163
184
164
185
// == UTF-32 Auto codec =============================================================
@@ -201,91 +222,98 @@ Utf32AutoEncoder.prototype.end = function() {
201
222
202
223
/**
 * UTF-32 decoder that picks the byte order from the first bytes of the stream.
 *
 * Input chunks are buffered until at least 32 bytes have arrived (or the stream
 * ends); the encoding is then detected, a concrete decoder is obtained from
 * `codec.iconv`, and the buffered chunks are replayed through it.
 *
 * @param {Object} [options] - Passed through to the concrete decoder;
 *   `options.defaultEncoding` is handed to detectEncoding as the fallback.
 * @param {Object} codec - Supplies `iconv`, whose `getDecoder` creates the concrete decoder.
 */
function Utf32AutoDecoder(options, codec) {
    this.decoder = null;
    this.initialBufs = [];
    this.initialBufsLen = 0;
    this.options = options || {};
    this.iconv = codec.iconv;
}

/**
 * Feeds the next chunk.
 *
 * @param {Buffer|Uint8Array} buf - Next chunk of encoded bytes.
 * @returns {string} Decoded text; '' while still collecting bytes for detection.
 */
Utf32AutoDecoder.prototype.write = function(buf) {
    if (!this.decoder) {
        // Codec is not chosen yet. Accumulate initial bytes.
        this.initialBufs.push(buf);
        this.initialBufsLen += buf.length;

        if (this.initialBufsLen < 32) // We need more bytes to use space heuristic (see detectEncoding)
            return '';

        // We have enough bytes -> detect endianness and decode what we have so far.
        return _startAutoDecoder(this);
    }

    return this.decoder.write(buf);
};

/**
 * Finishes the stream. If the encoding was never chosen (fewer than 32 bytes
 * seen), it is detected from whatever was buffered.
 *
 * @returns {string|undefined} Remaining decoded text, or the concrete decoder's
 *   own `end()` result when decoding had already started.
 */
Utf32AutoDecoder.prototype.end = function() {
    if (!this.decoder) {
        var resStr = _startAutoDecoder(this);

        var trail = this.decoder.end();
        if (trail)
            resStr += trail;

        return resStr;
    }

    return this.decoder.end();
};

// Chooses the encoding from the buffered chunks, creates the concrete decoder,
// replays the buffered chunks through it and clears the buffer.
function _startAutoDecoder(autoDecoder) {
    var encoding = detectEncoding(autoDecoder.initialBufs, autoDecoder.options.defaultEncoding);
    autoDecoder.decoder = autoDecoder.iconv.getDecoder(encoding, autoDecoder.options);

    var resStr = '';
    for (var i = 0; i < autoDecoder.initialBufs.length; i++)
        resStr += autoDecoder.decoder.write(autoDecoder.initialBufs[i]);

    autoDecoder.initialBufs.length = autoDecoder.initialBufsLen = 0;
    return resStr;
}
248
274
249
/**
 * Guesses the byte order of UTF-32 data from its first bytes.
 *
 * A BOM in the first 4 bytes decides immediately. Otherwise up to 100 4-byte
 * units are scored under both byte orders: a unit counts as invalid for an
 * order when it would exceed 0x10FFFF, and as a BMP character when its two
 * high bytes are zero and the unit is not all zeros. The order with the higher
 * (BMP - invalid) score wins.
 *
 * @param {Array<Buffer|Uint8Array>|Buffer|Uint8Array} bufs - Chunks in stream order;
 *   units may straddle chunk boundaries. A single buffer is also accepted.
 * @param {string} [defaultEncoding] - Returned when the scores tie.
 * @returns {string} 'utf-32le', 'utf-32be', or `defaultEncoding` on a tie
 *   ('utf-32le' if none was given).
 */
function detectEncoding(bufs, defaultEncoding) {
    var chunks = Array.isArray(bufs) ? bufs : [bufs];
    var maxChars = 100; // Upper bound on the number of units inspected.

    var b = []; // Bytes of the unit currently being assembled.
    var charsProcessed = 0;
    var invalidLE = 0; // Number of invalid chars when decoded as LE.
    var invalidBE = 0; // Number of invalid chars when decoded as BE.
    var bmpCharsLE = 0; // Number of BMP chars when decoded as LE.
    var bmpCharsBE = 0; // Number of BMP chars when decoded as BE.

    for (var i = 0; i < chunks.length && charsProcessed < maxChars; i++) {
        var buf = chunks[i];
        for (var j = 0; j < buf.length && charsProcessed < maxChars; j++) {
            b.push(buf[j]);
            if (b.length < 4)
                continue;

            if (charsProcessed === 0) {
                // Check BOM first.
                if (b[0] === 0xFF && b[1] === 0xFE && b[2] === 0 && b[3] === 0) {
                    return 'utf-32le';
                }
                if (b[0] === 0 && b[1] === 0 && b[2] === 0xFE && b[3] === 0xFF) {
                    return 'utf-32be';
                }
            }

            if (b[0] !== 0 || b[1] > 0x10) invalidBE++;
            if (b[3] !== 0 || b[2] > 0x10) invalidLE++;

            if (b[0] === 0 && b[1] === 0 && (b[2] !== 0 || b[3] !== 0)) bmpCharsBE++;
            if ((b[0] !== 0 || b[1] !== 0) && b[2] === 0 && b[3] === 0) bmpCharsLE++;

            b.length = 0;
            charsProcessed++;
        }
    }

    // Make decisions.
    var scoreBE = bmpCharsBE - invalidBE;
    var scoreLE = bmpCharsLE - invalidLE;
    if (scoreBE > scoreLE) return 'utf-32be';
    if (scoreBE < scoreLE) return 'utf-32le';

    // Couldn't decide (likely all zeros or not enough data).
    return defaultEncoding || 'utf-32le';
}
0 commit comments