mirror of
https://github.com/nodejs/node.git
synced 2025-05-05 22:50:18 +00:00

The copyright and license notice is already in the LICENSE file. There is no justifiable reason to also require that it be included in every file, since the individual files are not individually distributed except as part of the entire package.
192 lines
6.1 KiB
JavaScript
192 lines
6.1 KiB
JavaScript
'use strict';
|
|
|
|
function assertEncoding(encoding) {
|
|
if (encoding && !Buffer.isEncoding(encoding)) {
|
|
throw new Error('Unknown encoding: ' + encoding);
|
|
}
|
|
}
|
|
|
|
// StringDecoder provides an interface for efficiently splitting a series of
|
|
// buffers into a series of JS strings without breaking apart multi-byte
|
|
// characters. CESU-8 is handled as part of the UTF-8 encoding.
|
|
//
|
|
// @TODO Handling all encodings inside a single object makes it very difficult
|
|
// to reason about this code, so it should be split up in the future.
|
|
// @TODO There should be a utf8-strict encoding that rejects invalid UTF-8 code
|
|
// points as used by CESU-8.
|
|
var StringDecoder = exports.StringDecoder = function(encoding) {
|
|
this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, '');
|
|
assertEncoding(encoding);
|
|
switch (this.encoding) {
|
|
case 'utf8':
|
|
// CESU-8 represents each of Surrogate Pair by 3-bytes
|
|
this.surrogateSize = 3;
|
|
break;
|
|
case 'ucs2':
|
|
case 'utf16le':
|
|
// UTF-16 represents each of Surrogate Pair by 2-bytes
|
|
this.surrogateSize = 2;
|
|
this.detectIncompleteChar = utf16DetectIncompleteChar;
|
|
break;
|
|
case 'base64':
|
|
// Base-64 stores 3 bytes in 4 chars, and pads the remainder.
|
|
this.surrogateSize = 3;
|
|
this.detectIncompleteChar = base64DetectIncompleteChar;
|
|
break;
|
|
default:
|
|
this.write = passThroughWrite;
|
|
return;
|
|
}
|
|
|
|
// Enough space to store all bytes of a single character. UTF-8 needs 4
|
|
// bytes, but CESU-8 may require up to 6 (3 bytes per surrogate).
|
|
this.charBuffer = new Buffer(6);
|
|
// Number of bytes received for the current incomplete multi-byte character.
|
|
this.charReceived = 0;
|
|
// Number of bytes expected for the current incomplete multi-byte character.
|
|
this.charLength = 0;
|
|
};
|
|
|
|
|
|
// write decodes the given buffer and returns it as JS string that is
|
|
// guaranteed to not contain any partial multi-byte characters. Any partial
|
|
// character found at the end of the buffer is buffered up, and will be
|
|
// returned when calling write again with the remaining bytes.
|
|
//
|
|
// Note: Converting a Buffer containing an orphan surrogate to a String
|
|
// currently works, but converting a String to a Buffer (via `new Buffer`, or
|
|
// Buffer#write) will replace incomplete surrogates with the unicode
|
|
// replacement character. See https://codereview.chromium.org/121173009/ .
|
|
StringDecoder.prototype.write = function(buffer) {
|
|
var charStr = '';
|
|
// if our last write ended with an incomplete multibyte character
|
|
while (this.charLength) {
|
|
// determine how many remaining bytes this buffer has to offer for this char
|
|
var available = (buffer.length >= this.charLength - this.charReceived) ?
|
|
this.charLength - this.charReceived :
|
|
buffer.length;
|
|
|
|
// add the new bytes to the char buffer
|
|
buffer.copy(this.charBuffer, this.charReceived, 0, available);
|
|
this.charReceived += available;
|
|
|
|
if (this.charReceived < this.charLength) {
|
|
// still not enough chars in this buffer? wait for more ...
|
|
return '';
|
|
}
|
|
|
|
// remove bytes belonging to the current character from the buffer
|
|
buffer = buffer.slice(available, buffer.length);
|
|
|
|
// get the character that was split
|
|
charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);
|
|
|
|
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
|
|
var charCode = charStr.charCodeAt(charStr.length - 1);
|
|
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
|
|
this.charLength += this.surrogateSize;
|
|
charStr = '';
|
|
continue;
|
|
}
|
|
this.charReceived = this.charLength = 0;
|
|
|
|
// if there are no more bytes in this buffer, just emit our char
|
|
if (buffer.length === 0) {
|
|
return charStr;
|
|
}
|
|
break;
|
|
}
|
|
|
|
// determine and set charLength / charReceived
|
|
this.detectIncompleteChar(buffer);
|
|
|
|
var end = buffer.length;
|
|
if (this.charLength) {
|
|
// buffer the incomplete character bytes we got
|
|
buffer.copy(this.charBuffer, 0, buffer.length - this.charReceived, end);
|
|
end -= this.charReceived;
|
|
}
|
|
|
|
charStr += buffer.toString(this.encoding, 0, end);
|
|
|
|
var end = charStr.length - 1;
|
|
var charCode = charStr.charCodeAt(end);
|
|
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
|
|
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
|
|
var size = this.surrogateSize;
|
|
this.charLength += size;
|
|
this.charReceived += size;
|
|
this.charBuffer.copy(this.charBuffer, size, 0, size);
|
|
buffer.copy(this.charBuffer, 0, 0, size);
|
|
return charStr.substring(0, end);
|
|
}
|
|
|
|
// or just emit the charStr
|
|
return charStr;
|
|
};
|
|
|
|
// detectIncompleteChar determines if there is an incomplete UTF-8 character at
|
|
// the end of the given buffer. If so, it sets this.charLength to the byte
|
|
// length that character, and sets this.charReceived to the number of bytes
|
|
// that are available for this character.
|
|
StringDecoder.prototype.detectIncompleteChar = function(buffer) {
|
|
// determine how many bytes we have to check at the end of this buffer
|
|
var i = (buffer.length >= 3) ? 3 : buffer.length;
|
|
|
|
// Figure out if one of the last i bytes of our buffer announces an
|
|
// incomplete char.
|
|
for (; i > 0; i--) {
|
|
var c = buffer[buffer.length - i];
|
|
|
|
// See http://en.wikipedia.org/wiki/UTF-8#Description
|
|
|
|
// 110XXXXX
|
|
if (i == 1 && c >> 5 == 0x06) {
|
|
this.charLength = 2;
|
|
break;
|
|
}
|
|
|
|
// 1110XXXX
|
|
if (i <= 2 && c >> 4 == 0x0E) {
|
|
this.charLength = 3;
|
|
break;
|
|
}
|
|
|
|
// 11110XXX
|
|
if (i <= 3 && c >> 3 == 0x1E) {
|
|
this.charLength = 4;
|
|
break;
|
|
}
|
|
}
|
|
this.charReceived = i;
|
|
};
|
|
|
|
StringDecoder.prototype.end = function(buffer) {
|
|
var res = '';
|
|
if (buffer && buffer.length)
|
|
res = this.write(buffer);
|
|
|
|
if (this.charReceived) {
|
|
var cr = this.charReceived;
|
|
var buf = this.charBuffer;
|
|
var enc = this.encoding;
|
|
res += buf.slice(0, cr).toString(enc);
|
|
}
|
|
|
|
return res;
|
|
};
|
|
|
|
function passThroughWrite(buffer) {
|
|
return buffer.toString(this.encoding);
|
|
}
|
|
|
|
function utf16DetectIncompleteChar(buffer) {
|
|
this.charReceived = buffer.length % 2;
|
|
this.charLength = this.charReceived ? 2 : 0;
|
|
}
|
|
|
|
function base64DetectIncompleteChar(buffer) {
|
|
this.charReceived = buffer.length % 3;
|
|
this.charLength = this.charReceived ? 3 : 0;
|
|
}
|