utf8.js 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. /*! https://mths.be/utf8js v2.1.2 by @mathias */
  // Module-local shorthand for String.fromCharCode, used by the encoding
  // helpers in this file to turn numeric code units into strings.
  var stringFromCharCode = String.fromCharCode;
  3. // Taken from https://mths.be/punycode
  /**
   * Splits a JavaScript string into an array of numeric code points.
   *
   * A high surrogate immediately followed by a low surrogate is combined into
   * a single supplementary code point (> 0xFFFF). Any other code unit,
   * including an unmatched surrogate, is emitted unchanged.
   *
   * @param {string} string - The string to split.
   * @returns {number[]} The code points, in order.
   */
  function ucs2decode(string) {
    var codePoints = [];
    var total = string.length;
    var pos = 0;
    while (pos < total) {
      var lead = string.charCodeAt(pos);
      pos += 1;
      var isHighSurrogate = lead >= 0xD800 && lead <= 0xDBFF;
      if (isHighSurrogate && pos < total) {
        var trail = string.charCodeAt(pos);
        if ((trail & 0xFC00) === 0xDC00) {
          // Valid pair: consume the low surrogate as well.
          codePoints.push(((lead & 0x3FF) << 10) + (trail & 0x3FF) + 0x10000);
          pos += 1;
          continue;
        }
        // Not a low surrogate: leave `pos` on it so the next iteration
        // examines it on its own (it may start a surrogate pair itself).
      }
      codePoints.push(lead);
    }
    return codePoints;
  }
  29. // Taken from https://mths.be/punycode
  /**
   * Builds a string from an array of numeric code points.
   *
   * Values above 0xFFFF are written as a high/low surrogate pair; every other
   * value is written as a single code unit.
   *
   * @param {number[]} array - The code points to join.
   * @returns {string} The resulting string.
   */
  function ucs2encode(array) {
    var text = '';
    var count = array.length;
    for (var i = 0; i < count; i++) {
      var codePoint = array[i];
      if (codePoint > 0xFFFF) {
        // Supplementary plane: split the 20-bit offset into two 10-bit halves.
        var offset = codePoint - 0x10000;
        var high = ((offset >>> 10) & 0x3FF) | 0xD800;
        var low = (offset & 0x3FF) | 0xDC00;
        text += String.fromCharCode(high) + String.fromCharCode(low);
      } else {
        text += String.fromCharCode(codePoint);
      }
    }
    return text;
  }
  /**
   * Tells whether a code point lies outside the surrogate range
   * U+D800..U+DFFF.
   *
   * @param {number} codePoint - The code point to test.
   * @param {boolean} strict - When truthy, a surrogate causes an Error to be
   *   thrown instead of `false` being returned.
   * @returns {boolean} `true` for a non-surrogate; `false` for a surrogate
   *   when `strict` is falsy.
   * @throws {Error} If `codePoint` is a surrogate and `strict` is truthy.
   */
  function checkScalarValue(codePoint, strict) {
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (!isSurrogate) {
      return true;
    }
    if (!strict) {
      return false;
    }
    var hex = codePoint.toString(16).toUpperCase();
    throw new Error('Lone surrogate U+' + hex + ' is not a scalar value');
  }
  58. /*--------------------------------------------------------------------------*/
  /**
   * Produces one UTF-8 continuation byte (bit pattern 10xxxxxx) as a
   * one-character string.
   *
   * @param {number} codePoint - The code point being encoded.
   * @param {number} shift - How far to shift `codePoint` right before taking
   *   its low six bits.
   * @returns {string} A single character whose code is 0x80 | the six bits.
   */
  function createByte(codePoint, shift) {
    var sixBits = (codePoint >> shift) & 0x3F;
    return String.fromCharCode(0x80 | sixBits);
  }
  /**
   * Encodes a single code point as a UTF-8 byte string (one character per
   * byte).
   *
   * @param {number} codePoint - The code point to encode.
   * @param {boolean} strict - Passed to checkScalarValue for code points in
   *   the 3-byte range: when truthy a surrogate makes it throw, otherwise the
   *   surrogate is encoded as U+FFFD.
   * @returns {string} The encoded bytes.
   */
  function encodeCodePoint(codePoint, strict) {
    // 1-byte sequence: the code point is its own byte.
    if ((codePoint & 0xFFFFFF80) === 0) {
      return stringFromCharCode(codePoint);
    }
    // 2-byte sequence.
    if ((codePoint & 0xFFFFF800) === 0) {
      return stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0) +
        createByte(codePoint, 0);
    }
    // 3-byte sequence; the only range that can contain surrogates.
    if ((codePoint & 0xFFFF0000) === 0) {
      var scalar = checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
      return stringFromCharCode(((scalar >> 12) & 0x0F) | 0xE0) +
        createByte(scalar, 6) +
        createByte(scalar, 0);
    }
    // 4-byte sequence.
    if ((codePoint & 0xFFE00000) === 0) {
      return stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0) +
        createByte(codePoint, 12) +
        createByte(codePoint, 6) +
        createByte(codePoint, 0);
    }
    // Wider than 21 bits: no lead byte is produced, only the trailing byte.
    return createByte(codePoint, 0);
  }
  /**
   * Encodes a JavaScript string as a UTF-8 byte string (one character per
   * byte).
   *
   * @param {string} string - The text to encode.
   * @param {Object} [opts] - Options.
   * @param {boolean} [opts.strict] - Strict mode is on unless this is exactly
   *   `false`; the flag is forwarded to encodeCodePoint.
   * @returns {string} The concatenated encoded bytes.
   */
  function utf8encode(string, opts) {
    var options = opts || {};
    var strict = options.strict !== false;
    return ucs2decode(string)
      .map(function (codePoint) {
        return encodeCodePoint(codePoint, strict);
      })
      .join('');
  }
  99. /*--------------------------------------------------------------------------*/
  /**
   * Reads the element at the current decoder position (module-level
   * `byteArray` / `byteIndex`), advances the position, and returns the low
   * six bits if the element is a continuation byte (10xxxxxx).
   *
   * @returns {number} The six payload bits of the continuation byte.
   * @throws {Error} 'Invalid byte index' when the position is at or past
   *   `byteCount`; 'Invalid continuation byte' when the byte does not match
   *   the 10xxxxxx pattern.
   */
  function readContinuationByte() {
    if (byteIndex >= byteCount) {
      throw new Error('Invalid byte index');
    }
    // Only the low eight bits of the element are considered.
    var octet = byteArray[byteIndex++] & 0xFF;
    if ((octet & 0xC0) !== 0x80) {
      // The position has already moved past the offending byte.
      throw new Error('Invalid continuation byte');
    }
    return octet & 0x3F;
  }
  /**
   * Decodes the next UTF-8 sequence from the module-level decoder state
   * (`byteArray`, `byteIndex`, `byteCount`) and advances the position.
   *
   * @param {boolean} strict - Passed to checkScalarValue for 3-byte
   *   sequences: when truthy a decoded surrogate makes it throw, otherwise
   *   U+FFFD is returned in its place.
   * @returns {number|boolean} The decoded code point, or `false` once the
   *   position equals `byteCount`.
   * @throws {Error} When the position is past `byteCount`, a continuation
   *   byte is missing or malformed, a 2- or 3-byte sequence is overlong, or
   *   the lead byte / 4-byte value is not valid.
   */
  function decodeSymbol(strict) {
    if (byteIndex > byteCount) {
      throw new Error('Invalid byte index');
    }
    if (byteIndex === byteCount) {
      // End of input.
      return false;
    }

    // Lead byte; only the low eight bits of the element are considered.
    var lead = byteArray[byteIndex++] & 0xFF;

    // 0xxxxxxx: single-byte sequence.
    if ((lead & 0x80) === 0) {
      return lead;
    }

    var second;
    var third;
    var fourth;
    var decoded;

    // 110xxxxx: two-byte sequence.
    if ((lead & 0xE0) === 0xC0) {
      second = readContinuationByte();
      decoded = ((lead & 0x1F) << 6) | second;
      if (decoded < 0x80) {
        // Value would have fit in one byte (overlong form).
        throw new Error('Invalid continuation byte');
      }
      return decoded;
    }

    // 1110xxxx: three-byte sequence (may decode to a surrogate).
    if ((lead & 0xF0) === 0xE0) {
      second = readContinuationByte();
      third = readContinuationByte();
      decoded = ((lead & 0x0F) << 12) | (second << 6) | third;
      if (decoded < 0x0800) {
        // Value would have fit in two bytes (overlong form).
        throw new Error('Invalid continuation byte');
      }
      return checkScalarValue(decoded, strict) ? decoded : 0xFFFD;
    }

    // 11110xxx: four-byte sequence.
    if ((lead & 0xF8) === 0xF0) {
      second = readContinuationByte();
      third = readContinuationByte();
      fourth = readContinuationByte();
      decoded = ((lead & 0x07) << 0x12) | (second << 0x0C) |
        (third << 0x06) | fourth;
      if (decoded >= 0x010000 && decoded <= 0x10FFFF) {
        return decoded;
      }
    }

    // Unrecognised lead byte, or a four-byte value outside 0x10000..0x10FFFF.
    throw new Error('Invalid UTF-8 detected');
  }
  // Decoder state shared by utf8decode, decodeSymbol and readContinuationByte.
  // utf8decode initialises all three before decoding starts.
  var byteArray; // elements of the input being decoded
  var byteCount; // number of elements in byteArray
  var byteIndex; // position of the next element to read

  /**
   * Decodes a UTF-8 byte string (one character per byte) into a JavaScript
   * string.
   *
   * The input is split with ucs2decode; the readers then use only the low
   * eight bits of each element.
   *
   * @param {string} byteString - The encoded bytes.
   * @param {Object} [opts] - Options.
   * @param {boolean} [opts.strict] - Strict mode is on unless this is exactly
   *   `false`; the flag is forwarded to decodeSymbol.
   * @returns {string} The decoded text.
   * @throws {Error} Whatever decodeSymbol throws for malformed input.
   */
  function utf8decode(byteString, opts) {
    opts = opts || {};
    var strict = false !== opts.strict;
    byteArray = ucs2decode(byteString);
    byteCount = byteArray.length;
    byteIndex = 0;
    var codePoints = [];
    var tmp;
    try {
      // decodeSymbol returns false once the whole input has been consumed.
      while ((tmp = decodeSymbol(strict)) !== false) {
        codePoints.push(tmp);
      }
    } finally {
      // Drop the module-level reference so the input buffer is not kept
      // alive after this call returns or throws.
      byteArray = null;
    }
    return ucs2encode(codePoints);
  }
  181. module.exports = {
  182. version: '2.1.2',
  183. encode: utf8encode,
  184. decode: utf8decode
  185. };