123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591 |
- var TokenStream = require('../common/TokenStream');
- var adoptBuffer = require('../common/adopt-buffer');
- var constants = require('./const');
- var TYPE = constants.TYPE;
- var charCodeDefinitions = require('./char-code-definitions');
- var isNewline = charCodeDefinitions.isNewline;
- var isName = charCodeDefinitions.isName;
- var isValidEscape = charCodeDefinitions.isValidEscape;
- var isNumberStart = charCodeDefinitions.isNumberStart;
- var isIdentifierStart = charCodeDefinitions.isIdentifierStart;
- var charCodeCategory = charCodeDefinitions.charCodeCategory;
- var isBOM = charCodeDefinitions.isBOM;
- var utils = require('./utils');
- var cmpStr = utils.cmpStr;
- var getNewlineLength = utils.getNewlineLength;
- var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
- var consumeEscaped = utils.consumeEscaped;
- var consumeName = utils.consumeName;
- var consumeNumber = utils.consumeNumber;
- var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;
// Token records are packed into one 32-bit integer: the token type is stored in
// the bits from TYPE_SHIFT upwards and the offset (or token index) in the bits
// below it. A value of 2^24 or more would therefore spill into the type bits.
var OFFSET_MASK = 0x00FFFFFF; // selects the low 24 bits of a packed value
var TYPE_SHIFT = 24;          // shift that moves a type above those 24 bits
/**
 * Split CSS source text into tokens, following the "Consume a token" algorithm
 * of CSS Syntax Module Level 3 (https://drafts.csswg.org/css-syntax-3/#consume-token).
 *
 * Results are written into `stream`:
 *  - `offsetAndType[i]` packs token `i` as `(type << TYPE_SHIFT) | endOffset`,
 *    where `endOffset` is the source offset just past the token; the entry at
 *    index `tokenCount` is the EOF token;
 *  - `balance[i]` is, for an opening bracket/function token, the index of its
 *    closing token; for a closing token that matched, the index of its opener;
 *    for any other token inside a closed pair, the index of the closing token
 *    of the innermost pair; and `source.length` when there is no match;
 *  - `source`, `firstCharOffset` and `tokenCount` are updated, then
 *    `stream.reset()` and `stream.next()` are called.
 *
 * @param {*} source Text to tokenize. Any falsy value (including `0`) is treated
 *     as an empty string; everything else is converted with `String()`.
 * @param {Object} [stream] Token stream to fill; a new `TokenStream` is created
 *     when omitted.
 * @returns {Object} The filled stream (the same object that was passed in, if any).
 */
function tokenize(source, stream) {
    if (!stream) {
        stream = new TokenStream();
    }

    // ensure source is a string
    source = String(source || '');

    // State shared (by closure) with the consume* helpers below.
    var sourceLength = source.length;
    var offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 because of eof-token
    var balance = adoptBuffer(stream.balance, sourceLength + 1);
    var tokenCount = 0;
    // isBOM's result is used directly as the first offset to read
    // (presumably 0 or 1 - confirm against char-code-definitions).
    var start = isBOM(getCharCode(0));
    var offset = start;         // current position in source
    var type = 0;               // type of the token being consumed
    var code = 0;               // char code at the start of the current token
    var balanceCloseType = 0;   // token type that would close the innermost open pair
    var balanceStart = 0;       // packed (closeType << TYPE_SHIFT) | openerIndex of the innermost open pair
    var balancePrev = 0;

    // Char code at `offset`, or 0 when `offset` is at or past the end of the source.
    function getCharCode(offset) {
        return offset < sourceLength ? source.charCodeAt(offset) : 0;
    }

    // § 4.3.3. Consume a numeric token
    // Sets `type` to Dimension, Percentage or Number and moves `offset` past the token.
    function consumeNumericToken() {
        // Consume a number and let number be the result.
        offset = consumeNumber(source, offset);

        // If the next 3 input code points would start an identifier, then:
        if (isIdentifierStart(getCharCode(offset), getCharCode(offset + 1), getCharCode(offset + 2))) {
            // Create a <dimension-token> with the same value and type flag as number, and a unit set initially to the empty string.
            // Consume a name. Set the <dimension-token>’s unit to the returned value.
            // Return the <dimension-token>.
            type = TYPE.Dimension;
            offset = consumeName(source, offset);
            return;
        }

        // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it.
        if (getCharCode(offset) === 0x0025) {
            // Create a <percentage-token> with the same value as number, and return it.
            type = TYPE.Percentage;
            offset++;
            return;
        }

        // Otherwise, create a <number-token> with the same value and type flag as number, and return it.
        type = TYPE.Number;
    }

    // § 4.3.4. Consume an ident-like token
    // Sets `type` to Function, Url/BadUrl (via consumeUrlToken) or Ident.
    function consumeIdentLikeToken() {
        var nameStartOffset = offset;

        // Consume a name, and let string be the result.
        offset = consumeName(source, offset);

        // If string’s value is an ASCII case-insensitive match for "url",
        // and the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        if (cmpStr(source, nameStartOffset, offset, 'url') && getCharCode(offset) === 0x0028) {
            // Skip all whitespace that follows "url(" to look at the first
            // significant code point.
            offset = findWhiteSpaceEnd(source, offset + 1);

            // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
            // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('),
            // then create a <function-token> with its value set to string and return it.
            if (getCharCode(offset) === 0x0022 ||
                getCharCode(offset) === 0x0027) {
                type = TYPE.Function;
                // Rewind so the token ends right after "url(" (3 name characters
                // plus the parenthesis); the skipped whitespace is tokenized separately.
                offset = nameStartOffset + 4;
                return;
            }

            // Otherwise, consume a url token, and return it.
            consumeUrlToken();
            return;
        }

        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        // Create a <function-token> with its value set to string and return it.
        if (getCharCode(offset) === 0x0028) {
            type = TYPE.Function;
            offset++;
            return;
        }

        // Otherwise, create an <ident-token> with its value set to string and return it.
        type = TYPE.Ident;
    }

    // § 4.3.5. Consume a string token
    // Sets `type` to String or BadString and moves `offset` past the token.
    function consumeStringToken(endingCodePoint) {
        // This algorithm may be called with an ending code point, which denotes the code point
        // that ends the string. If an ending code point is not specified,
        // the current input code point is used.
        if (!endingCodePoint) {
            endingCodePoint = getCharCode(offset++);
        }

        // Initially create a <string-token> with its value set to the empty string.
        type = TYPE.String;

        // Repeatedly consume the next input code point from the stream:
        for (; offset < sourceLength; offset++) {
            var code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // ending code point
                case endingCodePoint:
                    // Return the <string-token>.
                    offset++;
                    return;

                // EOF
                case charCodeCategory.Eof:
                    // This is a parse error. Return the <string-token>.
                    return;

                // newline
                case charCodeCategory.WhiteSpace:
                    if (isNewline(code)) {
                        // This is a parse error; create a <bad-string-token> and return it.
                        // Note: the newline is made part of the bad string token here
                        // (offset is moved past it).
                        offset += getNewlineLength(source, offset, code);
                        type = TYPE.BadString;
                        return;
                    }
                    break;

                // U+005C REVERSE SOLIDUS (\)
                case 0x005C:
                    // If the next input code point is EOF, do nothing.
                    if (offset === sourceLength - 1) {
                        break;
                    }

                    var nextCode = getCharCode(offset + 1);

                    // Otherwise, if the next input code point is a newline, consume it.
                    if (isNewline(nextCode)) {
                        offset += getNewlineLength(source, offset + 1, nextCode);
                    } else if (isValidEscape(code, nextCode)) {
                        // Otherwise, (the stream starts with a valid escape) consume
                        // an escaped code point and append the returned code point to
                        // the <string-token>’s value.
                        // -1 compensates for the loop's own offset++.
                        offset = consumeEscaped(source, offset) - 1;
                    }
                    break;

                // anything else
                // Append the current input code point to the <string-token>’s value.
            }
        }
    }

    // § 4.3.6. Consume a url token
    // Note: This algorithm assumes that the initial "url(" has already been consumed.
    // This algorithm also assumes that it’s being called to consume an "unquoted" value, like url(foo).
    // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token
    // automatically handles this distinction; this algorithm shouldn’t be called directly otherwise.
    function consumeUrlToken() {
        // Initially create a <url-token> with its value set to the empty string.
        type = TYPE.Url;

        // Consume as much whitespace as possible.
        offset = findWhiteSpaceEnd(source, offset);

        // Repeatedly consume the next input code point from the stream:
        for (; offset < sourceLength; offset++) {
            var code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // U+0029 RIGHT PARENTHESIS ())
                case 0x0029:
                    // Return the <url-token>.
                    offset++;
                    return;

                // EOF
                case charCodeCategory.Eof:
                    // This is a parse error. Return the <url-token>.
                    return;

                // whitespace
                case charCodeCategory.WhiteSpace:
                    // Consume as much whitespace as possible.
                    offset = findWhiteSpaceEnd(source, offset);

                    // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
                    // consume it and return the <url-token>
                    // (if EOF was encountered, this is a parse error);
                    if (getCharCode(offset) === 0x0029 || offset >= sourceLength) {
                        if (offset < sourceLength) {
                            offset++;
                        }
                        return;
                    }

                    // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
                    // and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // U+0022 QUOTATION MARK (")
                // U+0027 APOSTROPHE (')
                // U+0028 LEFT PARENTHESIS (()
                // non-printable code point
                case 0x0022:
                case 0x0027:
                case 0x0028:
                case charCodeCategory.NonPrintable:
                    // This is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // U+005C REVERSE SOLIDUS (\)
                case 0x005C:
                    // If the stream starts with a valid escape, consume an escaped code point and
                    // append the returned code point to the <url-token>’s value.
                    if (isValidEscape(code, getCharCode(offset + 1))) {
                        // -1 compensates for the loop's own offset++.
                        offset = consumeEscaped(source, offset) - 1;
                        break;
                    }

                    // Otherwise, this is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // anything else
                // Append the current input code point to the <url-token>’s value.
            }
        }
    }

    // https://drafts.csswg.org/css-syntax-3/#consume-token
    // § 4.3.1. Consume a token
    while (offset < sourceLength) {
        code = source.charCodeAt(offset);
        type = 0;

        // "no match" until a closing token proves otherwise
        balance[tokenCount] = sourceLength;

        switch (charCodeCategory(code)) {
            // whitespace
            case charCodeCategory.WhiteSpace:
                // Consume as much whitespace as possible. Return a <whitespace-token>.
                type = TYPE.WhiteSpace;
                offset = findWhiteSpaceEnd(source, offset + 1);
                break;

            // U+0022 QUOTATION MARK (")
            case 0x0022:
                // Consume a string token and return it.
                consumeStringToken();
                break;

            // U+0023 NUMBER SIGN (#)
            case 0x0023:
                // If the next input code point is a name code point or the next two input code points are a valid escape, then:
                if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // Create a <hash-token>.
                    type = TYPE.Hash;

                    // TODO: the spec's "id" type flag (set when the next 3 input code points
                    // would start an identifier) is not tracked.

                    // Consume a name, and set the <hash-token>’s value to the returned string.
                    offset = consumeName(source, offset + 1);

                    // Return the <hash-token>.
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+0027 APOSTROPHE (')
            case 0x0027:
                // Consume a string token and return it.
                consumeStringToken();
                break;

            // U+0028 LEFT PARENTHESIS (()
            case 0x0028:
                // Return a <(-token>.
                type = TYPE.LeftParenthesis;
                offset++;
                break;

            // U+0029 RIGHT PARENTHESIS ())
            case 0x0029:
                // Return a <)-token>.
                type = TYPE.RightParenthesis;
                offset++;
                break;

            // U+002B PLUS SIGN (+)
            case 0x002B:
                // If the input stream starts with a number, ...
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // ... reconsume the current input code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002C COMMA (,)
            case 0x002C:
                // Return a <comma-token>.
                type = TYPE.Comma;
                offset++;
                break;

            // U+002D HYPHEN-MINUS (-)
            case 0x002D:
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // If the input stream starts with a number, reconsume the current input code point,
                    // consume a numeric token, and return it.
                    consumeNumericToken();
                } else if (getCharCode(offset + 1) === 0x002D &&
                           getCharCode(offset + 2) === 0x003E) {
                    // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS
                    // U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
                    type = TYPE.CDC;
                    offset = offset + 3;
                } else if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // Otherwise, if the input stream starts with an identifier,
                    // reconsume the current input code point, consume an ident-like token, and return it.
                    consumeIdentLikeToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002E FULL STOP (.)
            case 0x002E:
                // If the input stream starts with a number, ...
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // ... reconsume the current input code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002F SOLIDUS (/)
            case 0x002F:
                // If the next input code point is U+002A ASTERISK (*), i.e. the input starts with "/*", ...
                if (getCharCode(offset + 1) === 0x002A) {
                    // ... consume it and all following code points up to and including the first U+002A ASTERISK (*)
                    // followed by a U+002F SOLIDUS (/), or up to an EOF code point.
                    type = TYPE.Comment;
                    offset = source.indexOf('*/', offset + 2) + 2;
                    // indexOf() returned -1: the comment is unterminated and runs to the end
                    if (offset === 1) {
                        offset = sourceLength;
                    }
                } else {
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+003A COLON (:)
            case 0x003A:
                // Return a <colon-token>.
                type = TYPE.Colon;
                offset++;
                break;

            // U+003B SEMICOLON (;)
            case 0x003B:
                // Return a <semicolon-token>.
                type = TYPE.Semicolon;
                offset++;
                break;

            // U+003C LESS-THAN SIGN (<)
            case 0x003C:
                // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...
                if (getCharCode(offset + 1) === 0x0021 &&
                    getCharCode(offset + 2) === 0x002D &&
                    getCharCode(offset + 3) === 0x002D) {
                    // ... consume them and return a <CDO-token>.
                    type = TYPE.CDO;
                    offset = offset + 4;
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+0040 COMMERCIAL AT (@)
            case 0x0040:
                // If the next 3 input code points would start an identifier, ...
                if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
                    // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it.
                    type = TYPE.AtKeyword;
                    offset = consumeName(source, offset + 1);
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+005B LEFT SQUARE BRACKET ([)
            case 0x005B:
                // Return a <[-token>.
                type = TYPE.LeftSquareBracket;
                offset++;
                break;

            // U+005C REVERSE SOLIDUS (\)
            case 0x005C:
                // If the input stream starts with a valid escape, ...
                if (isValidEscape(code, getCharCode(offset + 1))) {
                    // ... reconsume the current input code point, consume an ident-like token, and return it.
                    consumeIdentLikeToken();
                } else {
                    // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+005D RIGHT SQUARE BRACKET (])
            case 0x005D:
                // Return a <]-token>.
                type = TYPE.RightSquareBracket;
                offset++;
                break;

            // U+007B LEFT CURLY BRACKET ({)
            case 0x007B:
                // Return a <{-token>.
                type = TYPE.LeftCurlyBracket;
                offset++;
                break;

            // U+007D RIGHT CURLY BRACKET (})
            case 0x007D:
                // Return a <}-token>.
                type = TYPE.RightCurlyBracket;
                offset++;
                break;

            // digit
            case charCodeCategory.Digit:
                // Reconsume the current input code point, consume a numeric token, and return it.
                consumeNumericToken();
                break;

            // name-start code point
            case charCodeCategory.NameStart:
                // Reconsume the current input code point, consume an ident-like token, and return it.
                consumeIdentLikeToken();
                break;

            // EOF
            case charCodeCategory.Eof:
                // Return an <EOF-token>.
                // NOTE(review): this branch neither sets a type nor advances offset, so the
                // loop would not make progress if charCodeCategory() ever reported Eof for
                // a code inside the source - confirm that this cannot happen.
                break;

            // anything else
            default:
                // Return a <delim-token> with its value set to the current input code point.
                type = TYPE.Delim;
                offset++;
        }

        // Bracket balance tracking. Open pairs form a stack threaded through `balance`:
        // while a pair is open, balance[openerIndex] temporarily keeps the previous
        // value of balanceStart so that it can be restored when the pair is closed.
        switch (type) {
            case balanceCloseType:
                // The innermost open pair is closed by this token: pop it.
                balancePrev = balanceStart & OFFSET_MASK;
                balanceStart = balance[balancePrev];
                balanceCloseType = balanceStart >> TYPE_SHIFT;
                // closer -> opener, opener -> closer
                balance[tokenCount] = balancePrev;
                balance[balancePrev++] = tokenCount;
                // tokens in between that have no match yet point to the closer
                for (; balancePrev < tokenCount; balancePrev++) {
                    if (balance[balancePrev] === sourceLength) {
                        balance[balancePrev] = tokenCount;
                    }
                }
                break;

            case TYPE.LeftParenthesis:
            case TYPE.Function:
                balance[tokenCount] = balanceStart;
                balanceCloseType = TYPE.RightParenthesis;
                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
                break;

            case TYPE.LeftSquareBracket:
                balance[tokenCount] = balanceStart;
                balanceCloseType = TYPE.RightSquareBracket;
                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
                break;

            case TYPE.LeftCurlyBracket:
                balance[tokenCount] = balanceStart;
                balanceCloseType = TYPE.RightCurlyBracket;
                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
                break;
        }

        offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
    }

    // finalize buffers
    offsetAndType[tokenCount] = (TYPE.EOF << TYPE_SHIFT) | offset; // <EOF-token>
    balance[tokenCount] = sourceLength;
    balance[sourceLength] = sourceLength; // prevents false positive balance match with any token

    // Pairs that are still open have no closing token: unwind the stack and
    // mark every remaining opener as unmatched.
    while (balanceStart !== 0) {
        balancePrev = balanceStart & OFFSET_MASK;
        balanceStart = balance[balancePrev];
        balance[balancePrev] = sourceLength;
    }

    // update stream
    stream.source = source;
    stream.firstCharOffset = start;
    stream.offsetAndType = offsetAndType;
    stream.tokenCount = tokenCount;
    stream.balance = balance;
    stream.reset();
    stream.next();

    return stream;
}
// Expose the constants, the char code helpers and the utils as static members
// of the tokenizer function. The modules are copied in this order, so a key
// defined by a later module overrides the same key from an earlier one.
[constants, charCodeDefinitions, utils].forEach(function(members) {
    Object.keys(members).forEach(function(key) {
        tokenize[key] = members[key];
    });
});

module.exports = tokenize;
|