123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302 |
- "use strict";
- const whatwgEncoding = require("whatwg-encoding");
- // https://html.spec.whatwg.org/#encoding-sniffing-algorithm
- module.exports = function sniffHTMLEncoding(buffer, options) {
- let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
- if (options === undefined) {
- options = {};
- }
- if (encoding === null && options.transportLayerEncodingLabel !== undefined) {
- encoding = whatwgEncoding.labelToName(options.transportLayerEncodingLabel);
- }
- if (encoding === null) {
- encoding = prescanMetaCharset(buffer);
- }
- if (encoding === null && options.defaultEncoding !== undefined) {
- encoding = options.defaultEncoding;
- }
- if (encoding === null) {
- encoding = "windows-1252";
- }
- return encoding;
- };
// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
/**
 * Scans the start of `buffer` (at most the first 1024 bytes) for a `<meta>`
 * element that declares a character encoding.
 *
 * Comments, other tags (including their attributes) and `<!…>`, `</…>`,
 * `<?…>` constructs are skipped so that text inside them is not mistaken for
 * a `<meta>` element.
 *
 * @param {ArrayLike<number>} buffer - Bytes of the document.
 * @returns {string|null} The declared encoding, or null when no usable
 *   declaration is found. "UTF-16LE"/"UTF-16BE" are reported as "UTF-8" and
 *   "x-user-defined" as "windows-1252".
 */
function prescanMetaCharset(buffer) {
  const l = Math.min(buffer.length, 1024);
  for (let i = 0; i < l; i++) {
    // Only "<" can start something interesting.
    if (buffer[i] !== 0x3C) {
      continue;
    }

    const c1 = buffer[i + 1];
    const c2 = buffer[i + 2];
    const c3 = buffer[i + 3];
    const c4 = buffer[i + 4];
    const c5 = buffer[i + 5];

    if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
      // "<!--" (comment start). Only "<!" is skipped before looking for
      // "-->": the spec allows the two dashes of the terminator to be the
      // same ones that opened the comment, so "<!-->" is a complete comment.
      for (i += 2; i < l; i++) {
        if (buffer[i] === 0x2D && buffer[i + 1] === 0x2D && buffer[i + 2] === 0x3E) {
          // Park on the ">" so the outer loop's i++ moves past it.
          i += 2;
          break;
        }
      }
    } else if ((c1 === 0x4D || c1 === 0x6D) &&
               (c2 === 0x45 || c2 === 0x65) &&
               (c3 === 0x54 || c3 === 0x74) &&
               (c4 === 0x41 || c4 === 0x61) &&
               (isSpaceCharacter(c5) || c5 === 0x2F)) {
      // "<meta" (any case) followed by a space or "/"
      i += 6;

      // Names already processed on this element; the spec ignores any
      // attribute whose name was seen before.
      const seenNames = new Set();
      let gotPragma = false;
      let needPragma = null;
      let charset = null;

      let attrRes;
      do {
        attrRes = getAttribute(buffer, i, l);
        const attr = attrRes.attr;
        if (attr && !seenNames.has(attr.name)) {
          seenNames.add(attr.name);
          if (attr.name === "http-equiv") {
            gotPragma = attr.value === "content-type";
          } else if (attr.name === "content" && !charset) {
            charset = extractCharacterEncodingFromMeta(attr.value);
            if (charset !== null) {
              needPragma = true;
            }
          } else if (attr.name === "charset") {
            charset = whatwgEncoding.labelToName(attr.value);
            needPragma = false;
          }
        }
        i = attrRes.i;
      } while (attrRes.attr);

      // Reject this <meta> and keep scanning when it declared nothing, when
      // a content= declaration lacks http-equiv=content-type, or when the
      // label did not resolve to an encoding.
      if (needPragma === null ||
          (needPragma === true && gotPragma === false) ||
          charset === null) {
        continue;
      }

      if (charset === "UTF-16LE" || charset === "UTF-16BE") {
        charset = "UTF-8";
      }
      if (charset === "x-user-defined") {
        charset = "windows-1252";
      }
      return charset;
    } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) {
      // "<" followed by an ASCII letter: some other start tag.
      // Skip the tag name (up to a space or ">") ...
      for (i += 2; i < l; i++) {
        const c = buffer[i];
        if (isSpaceCharacter(c) || c === 0x3E) {
          break;
        }
      }
      // ... then consume its attributes so their values are not re-scanned.
      let attrRes;
      do {
        attrRes = getAttribute(buffer, i, l);
        i = attrRes.i;
      } while (attrRes.attr);
    } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
      // "<!", "</" or "<?": skip to the next ">"
      for (i += 2; i < l; i++) {
        if (buffer[i] === 0x3E) {
          break;
        }
      }
    }
  }
  return null;
}
// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
/**
 * Reads a single attribute from `buffer`, starting at index `i`; all scanning
 * loops stop once the index reaches `l`.
 *
 * ASCII upper-case letters in both the name and the value are lower-cased.
 *
 * @param {ArrayLike<number>} buffer - Bytes being scanned.
 * @param {number} i - Index at which to start looking for an attribute.
 * @param {number} l - Exclusive upper bound for the scanning loops.
 * @returns {{attr?: {name: string, value: string}, i: number}} `attr` is
 *   present when an attribute was read (`value` is "" when it has none).
 *   `i` is the index at which the caller should resume. When the tag's
 *   closing ">" is reached, `i` is left pointing AT that ">" and `attr` is
 *   absent, so a caller that advances by one afterwards lands on the first
 *   byte after the tag.
 */
function getAttribute(buffer, i, l) {
  // Appends byte `c` to `text` as a character, mapping A-Z to a-z.
  const appendLowercased = (text, c) =>
    text + String.fromCharCode(c >= 0x41 && c <= 0x5A ? c + 0x20 : c);

  for (; i < l; i++) {
    let c = buffer[i];
    // space or /
    if (isSpaceCharacter(c) || c === 0x2F) {
      continue;
    }
    // ">": no attribute. Do not step past it; the caller's own increment
    // does that, and stepping here as well would skip the byte after the tag.
    if (c === 0x3E) {
      break;
    }
    let name = "";
    let value = "";
    nameLoop:for (; i < l; i++) {
      c = buffer[i];
      // "=" ends the name (a leading "=" is treated as part of the name)
      if (c === 0x3D && name !== "") {
        i++;
        break;
      }
      // space: the name is over; look for an "=" after the run of spaces
      if (isSpaceCharacter(c)) {
        for (i++; i < l; i++) {
          c = buffer[i];
          // space
          if (isSpaceCharacter(c)) {
            continue;
          }
          // not "=": the attribute has no value
          if (c !== 0x3D) {
            return { attr: { name, value }, i };
          }
          i++;
          break nameLoop;
        }
        break;
      }
      // / or >: the attribute has no value
      if (c === 0x2F || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      name = appendLowercased(name, c);
    }
    c = buffer[i];
    // skip spaces between "=" and the value
    if (isSpaceCharacter(c)) {
      for (i++; i < l; i++) {
        c = buffer[i];
        if (!isSpaceCharacter(c)) {
          break;
        }
      }
    }
    // " or ': quoted value, runs up to the matching quote
    if (c === 0x22 || c === 0x27) {
      const quote = c;
      for (i++; i < l; i++) {
        c = buffer[i];
        if (c === quote) {
          i++;
          return { attr: { name, value }, i };
        }
        value = appendLowercased(value, c);
      }
    }
    // ">": empty value
    if (c === 0x3E) {
      return { attr: { name, value }, i };
    }
    // unquoted value: runs up to a space or ">"
    value = appendLowercased(value, c);
    for (i++; i < l; i++) {
      c = buffer[i];
      if (isSpaceCharacter(c) || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      value = appendLowercased(value, c);
    }
  }
  return { i };
}
// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
/**
 * Extracts the encoding named by a `charset=…` parameter inside the value of
 * a `<meta content="…">` attribute.
 *
 * The first occurrence of "charset" (any case) that is followed, after
 * optional spaces, by "=" is used. The value may be wrapped in matching
 * single or double quotes; otherwise it runs up to the next space character
 * or ";".
 *
 * @param {string} string - The content attribute's value.
 * @returns {string|null} The result of `whatwgEncoding.labelToName` for the
 *   extracted label, or null when no `charset=` parameter is present, the
 *   value has an unmatched opening quote, or the value is a single trailing
 *   character.
 */
function extractCharacterEncodingFromMeta(string) {
  let position = 0;
  while (true) {
    // search() reports an index relative to the substring, so it has to be
    // rebased onto `string` before it is used as an index.
    const relativeIndex = string.substring(position).search(/charset/i);
    if (relativeIndex === -1) {
      return null;
    }
    let subPosition = position + relativeIndex + "charset".length;

    // charCodeAt() yields NaN past the end of the string, which is not a
    // space, so these loops stop instead of throwing when the input ends.
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }
    if (string[subPosition] !== "=") {
      // Not a parameter assignment; look for a later "charset".
      position = subPosition - 1;
      continue;
    }
    ++subPosition;
    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
      ++subPosition;
    }
    position = subPosition;
    break;
  }

  const first = string[position];
  if (first === "\"" || first === "'") {
    const closingIndex = string.indexOf(first, position + 1);
    if (closingIndex !== -1) {
      return whatwgEncoding.labelToName(string.substring(position + 1, closingIndex));
    }
    // It is an unmatched quotation mark
    return null;
  }

  // A value consisting of a single trailing character is treated as no encoding.
  if (string.length === position + 1) {
    return null;
  }

  // Unquoted value: ends at the first space character or ";" after its first
  // character. The match index is relative to position + 1 and is rebased.
  const relativeEnd = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
  const end = relativeEnd === -1 ? string.length : position + 1 + relativeEnd;
  return whatwgEncoding.labelToName(string.substring(position, end));
}
/**
 * Tells whether a byte value is one of the five space characters this file
 * treats as whitespace: tab (0x09), line feed (0x0A), form feed (0x0C),
 * carriage return (0x0D) or space (0x20).
 *
 * @param {number} c - Byte / character code to test.
 * @returns {boolean} true for the five codes above, false for anything else.
 */
function isSpaceCharacter(c) {
  switch (c) {
    case 0x09:
    case 0x0A:
    case 0x0C:
    case 0x0D:
    case 0x20:
      return true;
    default:
      return false;
  }
}
|