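// NOTE: the text originally on these lines was repository web-viewer residue, not source code:
//   "You can not select more than 25 topics"
//   "Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long."
//   "591 lines / 23 KiB" (file size banner, shown twice)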
// Token stream container and typed-buffer helper used to store the result.
var TokenStream = require('../common/TokenStream');
var adoptBuffer = require('../common/adopt-buffer');

// Token type constants (TYPE.Ident, TYPE.Function, ...).
var constants = require('./const');
var TYPE = constants.TYPE;

// Code point classification helpers.
var charCodeDefinitions = require('./char-code-definitions');
var isNewline = charCodeDefinitions.isNewline;
var isName = charCodeDefinitions.isName;
var isValidEscape = charCodeDefinitions.isValidEscape;
var isNumberStart = charCodeDefinitions.isNumberStart;
var isIdentifierStart = charCodeDefinitions.isIdentifierStart;
var charCodeCategory = charCodeDefinitions.charCodeCategory;
var isBOM = charCodeDefinitions.isBOM;

// Low-level scanning helpers; each takes the source and an offset.
var utils = require('./utils');
var cmpStr = utils.cmpStr;
var getNewlineLength = utils.getNewlineLength;
var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
var consumeEscaped = utils.consumeEscaped;
var consumeName = utils.consumeName;
var consumeNumber = utils.consumeNumber;
var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;

// A token is packed into one integer as (type << TYPE_SHIFT) | endOffset,
// so the low 24 bits (OFFSET_MASK) carry an offset/index and the bits above
// carry a type.
var OFFSET_MASK = 0x00FFFFFF;
var TYPE_SHIFT = 24;

/**
 * Split a CSS source string into tokens, following the algorithms of
 * CSS Syntax Module Level 3, § 4.3 (https://drafts.csswg.org/css-syntax-3/#consume-token).
 *
 * Tokens are not materialized as objects. For every token one integer is
 * written to `offsetAndType`: `(type << TYPE_SHIFT) | endOffset`, where
 * `endOffset` is the offset just past the token. Because the offset shares
 * the integer with the type, offsets that do not fit below bit TYPE_SHIFT
 * would spill into the type bits.
 *
 * A parallel `balance` array links brackets: an opening `(`, `[`, `{` or
 * function token points to the index of its matching closing token and vice
 * versa; any other token inside a closed pair points to the index of the
 * closing token of that pair; everything else holds `source.length`.
 *
 * @param {*} source Text to tokenize. Coerced with `String(source || '')`,
 *     so any falsy value is treated as an empty string.
 * @param {Object} [stream] Stream to (re)fill; a new `TokenStream` is
 *     created when omitted. Its `offsetAndType` and `balance` members are
 *     handed to `adoptBuffer` (presumably so existing buffers can be reused
 *     - confirm against adopt-buffer).
 * @returns {Object} The stream, with `source`, `firstCharOffset`,
 *     `offsetAndType`, `tokenCount` and `balance` set, after `reset()` and
 *     `next()` have been called on it.
 */
function tokenize(source, stream) {
    // Code unit at `offset`, or 0 when `offset` is at or past the end of input.
    function getCharCode(offset) {
        return offset < sourceLength ? source.charCodeAt(offset) : 0;
    }

    // § 4.3.3. Consume a numeric token
    // Sets `type` to Dimension, Percentage or Number and advances `offset`.
    function consumeNumericToken() {
        // Consume a number and let number be the result.
        offset = consumeNumber(source, offset);

        // If the next 3 input code points would start an identifier, then:
        if (isIdentifierStart(getCharCode(offset), getCharCode(offset + 1), getCharCode(offset + 2))) {
            // Create a <dimension-token> with the same value and type flag as number,
            // and a unit set initially to the empty string.
            // Consume a name. Set the <dimension-token>'s unit to the returned value.
            // Return the <dimension-token>.
            type = TYPE.Dimension;
            offset = consumeName(source, offset);
            return;
        }

        // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it.
        if (getCharCode(offset) === 0x0025) {
            // Create a <percentage-token> with the same value as number, and return it.
            type = TYPE.Percentage;
            offset++;
            return;
        }

        // Otherwise, create a <number-token> with the same value and type flag as number, and return it.
        type = TYPE.Number;
    }

    // § 4.3.4. Consume an ident-like token
    // Sets `type` to Function, Ident, or whatever consumeUrlToken() decides.
    function consumeIdentLikeToken() {
        const nameStartOffset = offset;

        // Consume a name, and let string be the result.
        offset = consumeName(source, offset);

        // If string's value is an ASCII case-insensitive match for "url",
        // and the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        if (cmpStr(source, nameStartOffset, offset, 'url') && getCharCode(offset) === 0x0028) {
            // While the next two input code points are whitespace, consume the next input code point.
            offset = findWhiteSpaceEnd(source, offset + 1);

            // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
            // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('),
            // then create a <function-token> with its value set to string and return it.
            if (getCharCode(offset) === 0x0022 ||
                getCharCode(offset) === 0x0027) {
                type = TYPE.Function;
                // Rewind to just past the opening parenthesis: the function
                // token ends 4 code units after the name start, and the
                // whitespace skipped above is left for the next token.
                offset = nameStartOffset + 4;
                return;
            }

            // Otherwise, consume a url token, and return it.
            consumeUrlToken();
            return;
        }

        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        // Create a <function-token> with its value set to string and return it.
        if (getCharCode(offset) === 0x0028) {
            type = TYPE.Function;
            offset++;
            return;
        }

        // Otherwise, create an <ident-token> with its value set to string and return it.
        type = TYPE.Ident;
    }

    // § 4.3.5. Consume a string token
    // Sets `type` to String or BadString and advances `offset` past the token.
    function consumeStringToken(endingCodePoint) {
        // This algorithm may be called with an ending code point, which denotes the code point
        // that ends the string. If an ending code point is not specified,
        // the current input code point is used.
        if (!endingCodePoint) {
            endingCodePoint = getCharCode(offset++);
        }

        // Initially create a <string-token> with its value set to the empty string.
        type = TYPE.String;

        // Repeatedly consume the next input code point from the stream:
        for (; offset < source.length; offset++) {
            var code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // ending code point
                case endingCodePoint:
                    // Return the <string-token>.
                    offset++;
                    return;

                // EOF
                case charCodeCategory.Eof:
                    // This is a parse error. Return the <string-token>.
                    return;

                // newline
                case charCodeCategory.WhiteSpace:
                    if (isNewline(code)) {
                        // This is a parse error: create a <bad-string-token> and return it.
                        // Note that the newline itself is included in the token here.
                        offset += getNewlineLength(source, offset, code);
                        type = TYPE.BadString;
                        return;
                    }
                    break;

                // U+005C REVERSE SOLIDUS (\)
                case 0x005C:
                    // If the next input code point is EOF, do nothing.
                    if (offset === source.length - 1) {
                        break;
                    }

                    var nextCode = getCharCode(offset + 1);

                    // Otherwise, if the next input code point is a newline, consume it.
                    if (isNewline(nextCode)) {
                        offset += getNewlineLength(source, offset + 1, nextCode);
                    } else if (isValidEscape(code, nextCode)) {
                        // Otherwise, (the stream starts with a valid escape) consume
                        // an escaped code point and append the returned code point to
                        // the <string-token>'s value.
                        // "- 1" compensates for the loop's own offset++.
                        offset = consumeEscaped(source, offset) - 1;
                    }
                    break;

                // anything else
                // Append the current input code point to the <string-token>'s value.
            }
        }
    }

    // § 4.3.6. Consume a url token
    // Note: This algorithm assumes that the initial "url(" has already been consumed.
    // This algorithm also assumes that it's being called to consume an "unquoted" value, like url(foo).
    // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token
    // automatically handles this distinction; this algorithm shouldn't be called directly otherwise.
    // Sets `type` to Url or BadUrl and advances `offset` past the token.
    function consumeUrlToken() {
        // Initially create a <url-token> with its value set to the empty string.
        type = TYPE.Url;

        // Consume as much whitespace as possible.
        offset = findWhiteSpaceEnd(source, offset);

        // Repeatedly consume the next input code point from the stream:
        for (; offset < source.length; offset++) {
            var code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // U+0029 RIGHT PARENTHESIS ())
                case 0x0029:
                    // Return the <url-token>.
                    offset++;
                    return;

                // EOF
                case charCodeCategory.Eof:
                    // This is a parse error. Return the <url-token>.
                    return;

                // whitespace
                case charCodeCategory.WhiteSpace:
                    // Consume as much whitespace as possible.
                    offset = findWhiteSpaceEnd(source, offset);

                    // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
                    // consume it and return the <url-token>
                    // (if EOF was encountered, this is a parse error);
                    if (getCharCode(offset) === 0x0029 || offset >= source.length) {
                        if (offset < source.length) {
                            offset++;
                        }
                        return;
                    }

                    // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
                    // and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // U+0022 QUOTATION MARK (")
                // U+0027 APOSTROPHE (')
                // U+0028 LEFT PARENTHESIS (()
                // non-printable code point
                case 0x0022:
                case 0x0027:
                case 0x0028:
                case charCodeCategory.NonPrintable:
                    // This is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // U+005C REVERSE SOLIDUS (\)
                case 0x005C:
                    // If the stream starts with a valid escape, consume an escaped code point and
                    // append the returned code point to the <url-token>'s value.
                    if (isValidEscape(code, getCharCode(offset + 1))) {
                        // "- 1" compensates for the loop's own offset++.
                        offset = consumeEscaped(source, offset) - 1;
                        break;
                    }

                    // Otherwise, this is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // anything else
                // Append the current input code point to the <url-token>'s value.
            }
        }
    }

    if (!stream) {
        stream = new TokenStream();
    }

    // ensure source is a string
    source = String(source || '');

    var sourceLength = source.length;
    var offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 because of eof-token
    var balance = adoptBuffer(stream.balance, sourceLength + 1);
    var tokenCount = 0;
    var start = isBOM(getCharCode(0)); // used as the first offset, so a leading BOM is skipped
    var offset = start;

    // Bracket-matching state. `balanceStart` packs the innermost still-open
    // bracket as (expected closing type << TYPE_SHIFT) | index of the opening
    // token; while a bracket is open, balance[its index] temporarily stores
    // the enclosing bracket's packed value, forming a stack.
    var balanceCloseType = 0;
    var balanceStart = 0;
    var balancePrev = 0;

    // https://drafts.csswg.org/css-syntax-3/#consume-token
    // § 4.3.1. Consume a token
    while (offset < sourceLength) {
        var code = source.charCodeAt(offset);
        // Function-scoped on purpose: the consume*Token() closures assign it.
        var type = 0;

        balance[tokenCount] = sourceLength;

        switch (charCodeCategory(code)) {
            // whitespace
            case charCodeCategory.WhiteSpace:
                // Consume as much whitespace as possible. Return a <whitespace-token>.
                type = TYPE.WhiteSpace;
                offset = findWhiteSpaceEnd(source, offset + 1);
                break;

            // U+0022 QUOTATION MARK (")
            case 0x0022:
                // Consume a string token and return it.
                consumeStringToken();
                break;

            // U+0023 NUMBER SIGN (#)
            case 0x0023:
                // If the next input code point is a name code point or the next two input code points are a valid escape, then:
                if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // Create a <hash-token>.
                    type = TYPE.Hash;

                    // If the next 3 input code points would start an identifier, set the <hash-token>'s type flag to "id".
                    // if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
                    //     // TODO: set id flag
                    // }

                    // Consume a name, and set the <hash-token>'s value to the returned string.
                    offset = consumeName(source, offset + 1);

                    // Return the <hash-token>.
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }

                break;

            // U+0027 APOSTROPHE (')
            case 0x0027:
                // Consume a string token and return it.
                consumeStringToken();
                break;

            // U+0028 LEFT PARENTHESIS (()
            case 0x0028:
                // Return a <(-token>.
                type = TYPE.LeftParenthesis;
                offset++;
                break;

            // U+0029 RIGHT PARENTHESIS ())
            case 0x0029:
                // Return a <)-token>.
                type = TYPE.RightParenthesis;
                offset++;
                break;

            // U+002B PLUS SIGN (+)
            case 0x002B:
                // If the input stream starts with a number, ...
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // ... reconsume the current input code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002C COMMA (,)
            case 0x002C:
                // Return a <comma-token>.
                type = TYPE.Comma;
                offset++;
                break;

            // U+002D HYPHEN-MINUS (-)
            case 0x002D:
                // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    consumeNumericToken();
                } else if (getCharCode(offset + 1) === 0x002D &&
                           getCharCode(offset + 2) === 0x003E) {
                    // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
                    type = TYPE.CDC;
                    offset = offset + 3;
                } else if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // Otherwise, if the input stream starts with an identifier,
                    // reconsume the current input code point, consume an ident-like token, and return it.
                    consumeIdentLikeToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002E FULL STOP (.)
            case 0x002E:
                // If the input stream starts with a number, ...
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // ... reconsume the current input code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }

                break;

            // U+002F SOLIDUS (/)
            case 0x002F:
                // If the next input code point is U+002A ASTERISK (*), i.e. the input starts with "/*", ...
                if (getCharCode(offset + 1) === 0x002A) {
                    // ... consume them and all following code points up to and including the first U+002A ASTERISK (*)
                    // followed by a U+002F SOLIDUS (/), or up to an EOF code point.
                    type = TYPE.Comment;
                    offset = source.indexOf('*/', offset + 2) + 2;
                    // indexOf() returned -1 (so -1 + 2 === 1): the comment is
                    // unterminated and runs to the end of input.
                    if (offset === 1) {
                        offset = source.length;
                    }
                } else {
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+003A COLON (:)
            case 0x003A:
                // Return a <colon-token>.
                type = TYPE.Colon;
                offset++;
                break;

            // U+003B SEMICOLON (;)
            case 0x003B:
                // Return a <semicolon-token>.
                type = TYPE.Semicolon;
                offset++;
                break;

            // U+003C LESS-THAN SIGN (<)
            case 0x003C:
                // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...
                if (getCharCode(offset + 1) === 0x0021 &&
                    getCharCode(offset + 2) === 0x002D &&
                    getCharCode(offset + 3) === 0x002D) {
                    // ... consume them and return a <CDO-token>.
                    type = TYPE.CDO;
                    offset = offset + 4;
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }

                break;

            // U+0040 COMMERCIAL AT (@)
            case 0x0040:
                // If the next 3 input code points would start an identifier, ...
                if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
                    // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it.
                    type = TYPE.AtKeyword;
                    offset = consumeName(source, offset + 1);
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }

                break;

            // U+005B LEFT SQUARE BRACKET ([)
            case 0x005B:
                // Return a <[-token>.
                type = TYPE.LeftSquareBracket;
                offset++;
                break;

            // U+005C REVERSE SOLIDUS (\)
            case 0x005C:
                // If the input stream starts with a valid escape, ...
                if (isValidEscape(code, getCharCode(offset + 1))) {
                    // ... reconsume the current input code point, consume an ident-like token, and return it.
                    consumeIdentLikeToken();
                } else {
                    // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+005D RIGHT SQUARE BRACKET (])
            case 0x005D:
                // Return a <]-token>.
                type = TYPE.RightSquareBracket;
                offset++;
                break;

            // U+007B LEFT CURLY BRACKET ({)
            case 0x007B:
                // Return a <{-token>.
                type = TYPE.LeftCurlyBracket;
                offset++;
                break;

            // U+007D RIGHT CURLY BRACKET (})
            case 0x007D:
                // Return a <}-token>.
                type = TYPE.RightCurlyBracket;
                offset++;
                break;

            // digit
            case charCodeCategory.Digit:
                // Reconsume the current input code point, consume a numeric token, and return it.
                consumeNumericToken();
                break;

            // name-start code point
            case charCodeCategory.NameStart:
                // Reconsume the current input code point, consume an ident-like token, and return it.
                consumeIdentLikeToken();
                break;

            // EOF
            case charCodeCategory.Eof:
                // Return an <EOF-token>.
                // NOTE(review): `offset` is not advanced on this path, so it
                // relies on charCodeCategory() never yielding Eof for a code
                // unit inside the source - confirm in char-code-definitions.
                break;

            // anything else
            default:
                // Return a <delim-token> with its value set to the current input code point.
                type = TYPE.Delim;
                offset++;
        }

        switch (type) {
            // The closing bracket expected by the innermost open bracket.
            // A closing bracket of any other kind matches no case here and
            // is recorded like an ordinary token.
            case balanceCloseType:
                // Pop the innermost open bracket ...
                balancePrev = balanceStart & OFFSET_MASK;
                balanceStart = balance[balancePrev];
                balanceCloseType = balanceStart >> TYPE_SHIFT;
                // ... and link the pair to each other.
                balance[tokenCount] = balancePrev;
                balance[balancePrev++] = tokenCount;
                // Tokens in between that are not linked yet point to this closing token.
                for (; balancePrev < tokenCount; balancePrev++) {
                    if (balance[balancePrev] === sourceLength) {
                        balance[balancePrev] = tokenCount;
                    }
                }
                break;

            case TYPE.LeftParenthesis:
            case TYPE.Function:
                // Push: remember the enclosing bracket in this token's slot.
                balance[tokenCount] = balanceStart;
                balanceCloseType = TYPE.RightParenthesis;
                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
                break;

            case TYPE.LeftSquareBracket:
                balance[tokenCount] = balanceStart;
                balanceCloseType = TYPE.RightSquareBracket;
                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
                break;

            case TYPE.LeftCurlyBracket:
                balance[tokenCount] = balanceStart;
                balanceCloseType = TYPE.RightCurlyBracket;
                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
                break;
        }

        offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
    }

    // finalize buffers
    offsetAndType[tokenCount] = (TYPE.EOF << TYPE_SHIFT) | offset; // <EOF-token>
    balance[tokenCount] = sourceLength;
    balance[sourceLength] = sourceLength; // prevents false positive balance match with any token
    // Brackets that were never closed: unwind the stack and mark each of them as unmatched.
    while (balanceStart !== 0) {
        balancePrev = balanceStart & OFFSET_MASK;
        balanceStart = balance[balancePrev];
        balance[balancePrev] = sourceLength;
    }

    // update stream
    stream.source = source;
    stream.firstCharOffset = start;
    stream.offsetAndType = offsetAndType;
    stream.tokenCount = tokenCount;
    stream.balance = balance;
    stream.reset();
    stream.next();

    return stream;
}

// Expose the constants and the static helper functions as members of
// tokenize. Sources are copied in this order - constants, then
// charCodeDefinitions, then utils - so a same-named key from a later source
// overwrites the one from an earlier source.
[constants, charCodeDefinitions, utils].forEach(function(members) {
    Object.keys(members).forEach(function(key) {
        tokenize[key] = members[key];
    });
});

module.exports = tokenize;