html-encoding-sniffer.js 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. "use strict";
  2. const whatwgEncoding = require("whatwg-encoding");
  3. // https://html.spec.whatwg.org/#encoding-sniffing-algorithm
  4. module.exports = function sniffHTMLEncoding(buffer, options) {
  5. let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910
  6. if (options === undefined) {
  7. options = {};
  8. }
  9. if (encoding === null && options.transportLayerEncodingLabel !== undefined) {
  10. encoding = whatwgEncoding.labelToName(options.transportLayerEncodingLabel);
  11. }
  12. if (encoding === null) {
  13. encoding = prescanMetaCharset(buffer);
  14. }
  15. if (encoding === null && options.defaultEncoding !== undefined) {
  16. encoding = options.defaultEncoding;
  17. }
  18. if (encoding === null) {
  19. encoding = "windows-1252";
  20. }
  21. return encoding;
  22. };
  // https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
  /**
   * Scans at most the first 1024 bytes of `buffer` for a `<meta>` element that
   * declares a character encoding, following the HTML Standard's prescan
   * algorithm.
   *
   * @param {Uint8Array} buffer - Raw document bytes.
   * @returns {string|null} The declared encoding name (with UTF-16LE/BE mapped
   *   to "UTF-8" and x-user-defined mapped to "windows-1252"), or `null` when
   *   no usable declaration is found within the scanned prefix.
   */
  function prescanMetaCharset(buffer) {
    const l = Math.min(buffer.length, 1024);
    const isAsciiLetter = (b) => (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A);

    for (let i = 0; i < l; i++) {
      // Only "<" can start something interesting.
      if (buffer[i] !== 0x3C) {
        continue;
      }

      const c1 = buffer[i + 1];
      const c2 = buffer[i + 2];
      const c3 = buffer[i + 3];
      const c4 = buffer[i + 4];
      const c5 = buffer[i + 5];

      if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
        // "<!--": advance to the first ">" that is preceded by "--". The two
        // hyphens may be the ones from "<!--" itself, so "<!-->" and "<!--->"
        // are complete comments.
        for (i += 4; i < l; i++) {
          if (buffer[i] === 0x3E && buffer[i - 1] === 0x2D && buffer[i - 2] === 0x2D) {
            break;
          }
        }
      } else if ((c1 === 0x4D || c1 === 0x6D) &&
                 (c2 === 0x45 || c2 === 0x65) &&
                 (c3 === 0x54 || c3 === 0x74) &&
                 (c4 === 0x41 || c4 === 0x61) &&
                 (isSpaceCharacter(c5) || c5 === 0x2F)) {
        // "<meta" followed by whitespace or "/" (ASCII case-insensitive).
        i += 6;

        // Only the first occurrence of each attribute name counts; later
        // duplicates are ignored.
        const seenNames = new Set();
        let gotPragma = false;
        let needPragma = null;
        let charset = null;

        let attrRes;
        do {
          attrRes = getAttribute(buffer, i, l);
          if (attrRes.attr && !seenNames.has(attrRes.attr.name)) {
            seenNames.add(attrRes.attr.name);
            if (attrRes.attr.name === "http-equiv") {
              gotPragma = attrRes.attr.value === "content-type";
            } else if (attrRes.attr.name === "content" && !charset) {
              charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
              if (charset !== null) {
                // A charset found in `content` only counts together with
                // http-equiv="content-type".
                needPragma = true;
              }
            } else if (attrRes.attr.name === "charset") {
              charset = whatwgEncoding.labelToName(attrRes.attr.value);
              needPragma = false;
            }
          }
          i = attrRes.i;
        } while (attrRes.attr);

        // No charset information on this <meta>: keep scanning.
        if (needPragma === null) {
          continue;
        }
        if (needPragma === true && gotPragma === false) {
          continue;
        }
        if (charset === null) {
          continue;
        }

        // A <meta> that was read as ASCII-compatible bytes cannot really be
        // UTF-16, so it is treated as UTF-8.
        if (charset === "UTF-16LE" || charset === "UTF-16BE") {
          charset = "UTF-8";
        }
        if (charset === "x-user-defined") {
          charset = "windows-1252";
        }
        return charset;
      } else if (isAsciiLetter(c1) || (c1 === 0x2F && isAsciiLetter(c2))) {
        // Any other start tag ("<x") or end tag ("</x"): skip the tag name,
        // then consume its attributes so that a ">" inside a quoted attribute
        // value does not end the tag early.
        for (i += 2; i < l; i++) {
          const c = buffer[i];
          // whitespace or ">"
          if (isSpaceCharacter(c) || c === 0x3E) {
            break;
          }
        }
        let attrRes;
        do {
          attrRes = getAttribute(buffer, i, l);
          i = attrRes.i;
        } while (attrRes.attr);
      } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
        // "<!", "</" or "<?": skip ahead to the next ">".
        for (i += 2; i < l; i++) {
          if (buffer[i] === 0x3E) {
            break;
          }
        }
      }
    }

    return null;
  }
  // https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
  /**
   * Converts one byte to a one-character string, folding ASCII A-Z to a-z.
   * @param {number} byte - Byte value.
   * @returns {string} The (lowercased) character.
   */
  function asciiLowerChar(byte) {
    return String.fromCharCode(byte >= 0x41 && byte <= 0x5A ? byte + 0x20 : byte);
  }

  /**
   * Reads a single attribute from `buffer`, starting at index `i` and never
   * looking at or beyond index `l`. Names and values are ASCII-lowercased.
   *
   * @param {Uint8Array} buffer - Raw document bytes.
   * @param {number} i - Index at which to start reading.
   * @param {number} l - Exclusive upper bound for reading.
   * @returns {{attr?: {name: string, value: string}, i: number}} When an
   *   attribute was read, `attr` holds it and `i` is the index where the next
   *   read should start. When there is no attribute, `attr` is absent and `i`
   *   is either the index of the closing ">" itself (the caller's own loop
   *   increment steps over it) or an index >= `l` if the input ran out.
   */
  function getAttribute(buffer, i, l) {
    // Skip whitespace and "/" in front of the attribute name.
    while (i < l && (isSpaceCharacter(buffer[i]) || buffer[i] === 0x2F)) {
      i++;
    }

    // End of input, or ">" closing the tag: no attribute here. `i` is left ON
    // the ">" on purpose; advancing past it would make the caller skip the
    // byte that follows the tag.
    if (i >= l || buffer[i] === 0x3E) {
      return { i };
    }

    let name = "";
    let value = "";
    let c;

    // Attribute name.
    for (; i < l; i++) {
      c = buffer[i];

      // "=" ends the name, unless the name is still empty (then "=" is part
      // of the name).
      if (c === 0x3D && name !== "") {
        i++;
        break;
      }

      if (isSpaceCharacter(c)) {
        // Whitespace after the name: an "=" may still follow.
        do {
          i++;
        } while (i < l && isSpaceCharacter(buffer[i]));
        if (i >= l) {
          return { i };
        }
        if (buffer[i] !== 0x3D) {
          // Attribute without a value.
          return { attr: { name, value }, i };
        }
        i++;
        break;
      }

      // "/" or ">" ends an attribute that has no value.
      if (c === 0x2F || c === 0x3E) {
        return { attr: { name, value }, i };
      }

      name += asciiLowerChar(c);
    }

    // Whitespace between "=" and the value.
    while (i < l && isSpaceCharacter(buffer[i])) {
      i++;
    }
    if (i >= l) {
      // Input ended before a value was seen; do not read past the limit.
      return { i };
    }

    c = buffer[i];

    // Quoted value: " or '
    if (c === 0x22 || c === 0x27) {
      const quote = c;
      for (i++; i < l; i++) {
        c = buffer[i];
        if (c === quote) {
          return { attr: { name, value }, i: i + 1 };
        }
        value += asciiLowerChar(c);
      }
      // Closing quote not found within the limit.
      return { i };
    }

    // ">" directly after "=": empty value.
    if (c === 0x3E) {
      return { attr: { name, value }, i };
    }

    // Unquoted value: runs until whitespace or ">".
    value += asciiLowerChar(c);
    for (i++; i < l; i++) {
      c = buffer[i];
      if (isSpaceCharacter(c) || c === 0x3E) {
        return { attr: { name, value }, i };
      }
      value += asciiLowerChar(c);
    }

    // Input ended in the middle of an unquoted value.
    return { i };
  }
  /**
   * Extracts the encoding declared by a `charset=...` parameter inside the
   * value of a `<meta content="...">` attribute (for example
   * "text/html; charset=utf-8").
   *
   * @param {string} string - The attribute value to inspect.
   * @returns {string|null} The result of `whatwgEncoding.labelToName` for the
   *   declared label, or `null` when there is no well-formed `charset=`
   *   parameter (no match, missing "=", missing value, unmatched quote).
   */
  function extractCharacterEncodingFromMeta(string) {
    let position = 0;

    while (true) {
      // search() reports an offset relative to the substring, so it has to be
      // rebased onto `string` before it is used as an index.
      const relativeIndex = string.substring(position).search(/charset/i);
      if (relativeIndex === -1) {
        return null;
      }
      let cursor = position + relativeIndex + "charset".length;

      // charCodeAt() yields NaN past the end of the string, which is not
      // whitespace, so these loops stop safely at the end of input.
      while (isSpaceCharacter(string.charCodeAt(cursor))) {
        ++cursor;
      }

      if (string[cursor] !== "=") {
        // Not a parameter assignment; resume the search just before this
        // character. `cursor - 1` is always greater than the old `position`,
        // so the loop makes progress.
        position = cursor - 1;
        continue;
      }

      ++cursor;
      while (isSpaceCharacter(string.charCodeAt(cursor))) {
        ++cursor;
      }

      position = cursor;
      break;
    }

    // "charset=" with nothing after it.
    if (position >= string.length) {
      return null;
    }

    const first = string[position];
    if (first === "\"" || first === "'") {
      const closingIndex = string.indexOf(first, position + 1);
      if (closingIndex !== -1) {
        return whatwgEncoding.labelToName(string.substring(position + 1, closingIndex));
      }
      // It is an unmatched quotation mark
      return null;
    }

    // Unquoted value: runs up to the first ASCII whitespace or ";", or to the
    // end of the string.
    const relativeEnd = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/);
    const end = relativeEnd === -1 ? string.length : position + 1 + relativeEnd;

    return whatwgEncoding.labelToName(string.substring(position, end));
  }
  /**
   * Tells whether a byte value / char code is one of the five whitespace
   * characters recognised while sniffing: TAB (0x09), LF (0x0A), FF (0x0C),
   * CR (0x0D) or SPACE (0x20).
   *
   * @param {number} c - Byte value or char code to classify.
   * @returns {boolean} `true` for one of the five codes above, otherwise
   *   `false` (including for `undefined` and `NaN`).
   */
  function isSpaceCharacter(c) {
    return c === 0x09 || c === 0x0A || c === 0x0C || c === 0x0D || c === 0x20;
  }