import * as TYPE from './types.js';
import {
    isNewline,
    isName,
    isValidEscape,
    isNumberStart,
    isIdentifierStart,
    isBOM,
    charCodeCategory,
    WhiteSpaceCategory,
    DigitCategory,
    NameStartCategory,
    NonPrintableCategory
} from './char-code-definitions.js';
import {
    cmpStr,
    getNewlineLength,
    findWhiteSpaceEnd,
    consumeEscaped,
    consumeName,
    consumeNumber,
    consumeBadUrlRemnants
} from './utils.js';

/**
 * Split a CSS source string into tokens, following the structure of the
 * "Consume a token" algorithms of CSS Syntax Module Level 3 (§ 4.3).
 *
 * No token objects are built and nothing is returned: each token is reported
 * through the callback as a type plus a pair of offsets into the source.
 *
 * @param {*} source - Input to tokenize. It is coerced with
 *     `String(source || '')`, so every falsy value is treated as an empty string.
 * @param {Function} onToken - Called once per token, in source order, as
 *     `onToken(type, start, end)`. `type` is one of the constants exported by
 *     './types.js', `start` is the offset where the token begins and `end` is
 *     the offset where scanning of that token stopped (the next token's start).
 * @returns {void}
 */
export function tokenize(source, onToken) {
    // ensure source is a string
    source = String(source || '');

    // Scanner state shared by all the helpers below (they are closures).
    const sourceLength = source.length;
    // The result of isBOM() is used directly as the initial offset
    // (presumably 1 when the source starts with a BOM and 0 otherwise - confirm in isBOM).
    let start = isBOM(getCharCode(0));
    let offset = start;
    let type;

    // Char code at `offset`, or 0 when `offset` is at or beyond the end of the source.
    function getCharCode(offset) {
        return offset < sourceLength ? source.charCodeAt(offset) : 0;
    }

    // § 4.3.3. Consume a numeric token
    function consumeNumericToken() {
        // Consume a number and let number be the result.
        offset = consumeNumber(source, offset);

        // If the next 3 input code points would start an identifier, then:
        if (isIdentifierStart(getCharCode(offset), getCharCode(offset + 1), getCharCode(offset + 2))) {
            // Create a <dimension-token> with the same value and type flag as number,
            // and a unit set initially to the empty string.
            // Consume a name. Set the <dimension-token>'s unit to the returned value.
            // Return the <dimension-token>.
            type = TYPE.Dimension;
            offset = consumeName(source, offset);
            return;
        }

        // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it.
        if (getCharCode(offset) === 0x0025) {
            // Create a <percentage-token> with the same value as number, and return it.
            type = TYPE.Percentage;
            offset++;
            return;
        }

        // Otherwise, create a <number-token> with the same value and type flag as number, and return it.
        type = TYPE.Number;
    }

    // § 4.3.4. Consume an ident-like token
    function consumeIdentLikeToken() {
        const nameStartOffset = offset;

        // Consume a name, and let string be the result.
        offset = consumeName(source, offset);

        // If string's value is an ASCII case-insensitive match for "url",
        // and the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        if (cmpStr(source, nameStartOffset, offset, 'url') && getCharCode(offset) === 0x0028) {
            // While the next two input code points are whitespace, consume the next input code point.
            offset = findWhiteSpaceEnd(source, offset + 1);

            // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
            // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('),
            // then create a <function-token> with its value set to string and return it.
            if (getCharCode(offset) === 0x0022 || getCharCode(offset) === 0x0027) {
                type = TYPE.Function;
                // Rewind to just past the opening parenthesis (4 characters after the
                // start of the name), so the skipped whitespace and the quoted string
                // are tokenized separately.
                offset = nameStartOffset + 4;
                return;
            }

            // Otherwise, consume a url token, and return it.
            consumeUrlToken();
            return;
        }

        // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
        // Create a <function-token> with its value set to string and return it.
        if (getCharCode(offset) === 0x0028) {
            type = TYPE.Function;
            offset++;
            return;
        }

        // Otherwise, create an <ident-token> with its value set to string and return it.
        type = TYPE.Ident;
    }

    // § 4.3.5. Consume a string token
    function consumeStringToken(endingCodePoint) {
        // This algorithm may be called with an ending code point, which denotes the code point
        // that ends the string. If an ending code point is not specified,
        // the current input code point is used.
        if (!endingCodePoint) {
            endingCodePoint = getCharCode(offset++);
        }

        // Initially create a <string-token> with its value set to the empty string.
        type = TYPE.String;

        // Repeatedly consume the next input code point from the stream.
        // EOF has no case of its own: when the end of the source is reached the loop
        // simply ends and the token keeps the String type.
        for (; offset < sourceLength; offset++) {
            const code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // ending code point
                case endingCodePoint:
                    // Return the <string-token>.
                    offset++;
                    return;

                // newline
                case WhiteSpaceCategory:
                    if (isNewline(code)) {
                        // This is a parse error; create a <bad-string-token> and return it.
                        // NOTE: the offset is advanced by getNewlineLength() before returning,
                        // so the newline appears to be included in the bad-string token
                        // rather than reconsumed.
                        offset += getNewlineLength(source, offset, code);
                        type = TYPE.BadString;
                        return;
                    }
                    break;

                // U+005C REVERSE SOLIDUS (\)
                case 0x005C: {
                    // If the next input code point is EOF, do nothing.
                    if (offset === sourceLength - 1) {
                        break;
                    }

                    const nextCode = getCharCode(offset + 1);

                    // Otherwise, if the next input code point is a newline, consume it.
                    if (isNewline(nextCode)) {
                        offset += getNewlineLength(source, offset + 1, nextCode);
                    } else if (isValidEscape(code, nextCode)) {
                        // Otherwise, (the stream starts with a valid escape) consume
                        // an escaped code point and append the returned code point to
                        // the <string-token>'s value.
                        // The -1 compensates for the offset++ of the enclosing for loop.
                        offset = consumeEscaped(source, offset) - 1;
                    }
                    break;
                }

                // anything else
                // Append the current input code point to the <string-token>'s value.
            }
        }
    }

    // § 4.3.6. Consume a url token
    // Note: This algorithm assumes that the initial "url(" has already been consumed.
    // This algorithm also assumes that it's being called to consume an "unquoted" value, like url(foo).
    // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token
    // automatically handles this distinction; this algorithm shouldn't be called directly otherwise.
    function consumeUrlToken() {
        // Initially create a <url-token> with its value set to the empty string.
        type = TYPE.Url;

        // Consume as much whitespace as possible.
        offset = findWhiteSpaceEnd(source, offset);

        // Repeatedly consume the next input code point from the stream.
        // EOF has no case of its own: when the end of the source is reached the loop
        // simply ends and the token keeps the Url type.
        for (; offset < sourceLength; offset++) {
            const code = source.charCodeAt(offset);

            switch (charCodeCategory(code)) {
                // U+0029 RIGHT PARENTHESIS ())
                case 0x0029:
                    // Return the <url-token>.
                    offset++;
                    return;

                // whitespace
                case WhiteSpaceCategory:
                    // Consume as much whitespace as possible.
                    offset = findWhiteSpaceEnd(source, offset);

                    // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF,
                    // consume it and return the <url-token>
                    // (if EOF was encountered, this is a parse error);
                    if (getCharCode(offset) === 0x0029 || offset >= sourceLength) {
                        if (offset < sourceLength) {
                            offset++;
                        }
                        return;
                    }

                    // otherwise, consume the remnants of a bad url, create a <bad-url-token>,
                    // and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // U+0022 QUOTATION MARK (")
                // U+0027 APOSTROPHE (')
                // U+0028 LEFT PARENTHESIS (()
                // non-printable code point
                case 0x0022:
                case 0x0027:
                case 0x0028:
                case NonPrintableCategory:
                    // This is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // U+005C REVERSE SOLIDUS (\)
                case 0x005C:
                    // If the stream starts with a valid escape, consume an escaped code point and
                    // append the returned code point to the <url-token>'s value.
                    if (isValidEscape(code, getCharCode(offset + 1))) {
                        // The -1 compensates for the offset++ of the enclosing for loop.
                        offset = consumeEscaped(source, offset) - 1;
                        break;
                    }

                    // Otherwise, this is a parse error. Consume the remnants of a bad url,
                    // create a <bad-url-token>, and return it.
                    offset = consumeBadUrlRemnants(source, offset);
                    type = TYPE.BadUrl;
                    return;

                // anything else
                // Append the current input code point to the <url-token>'s value.
            }
        }
    }

    // https://drafts.csswg.org/css-syntax-3/#consume-token
    // § 4.3.1. Consume a token
    // EOF has no case of its own: the loop ends when the offset reaches the end
    // of the source, and no EOF token is reported.
    while (offset < sourceLength) {
        const code = source.charCodeAt(offset);

        switch (charCodeCategory(code)) {
            // whitespace
            case WhiteSpaceCategory:
                // Consume as much whitespace as possible. Return a <whitespace-token>.
                type = TYPE.WhiteSpace;
                offset = findWhiteSpaceEnd(source, offset + 1);
                break;

            // U+0022 QUOTATION MARK (")
            case 0x0022:
                // Consume a string token and return it.
                consumeStringToken();
                break;

            // U+0023 NUMBER SIGN (#)
            case 0x0023:
                // If the next input code point is a name code point or the next two input
                // code points are a valid escape, then:
                if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // Create a <hash-token>.
                    type = TYPE.Hash;

                    // If the next 3 input code points would start an identifier,
                    // set the <hash-token>'s type flag to "id".
                    // TODO: the "id" type flag is not tracked here.

                    // Consume a name, and set the <hash-token>'s value to the returned string.
                    offset = consumeName(source, offset + 1);

                    // Return the <hash-token>.
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+0027 APOSTROPHE (')
            case 0x0027:
                // Consume a string token and return it.
                consumeStringToken();
                break;

            // U+0028 LEFT PARENTHESIS (()
            case 0x0028:
                // Return a <(-token>.
                type = TYPE.LeftParenthesis;
                offset++;
                break;

            // U+0029 RIGHT PARENTHESIS ())
            case 0x0029:
                // Return a <)-token>.
                type = TYPE.RightParenthesis;
                offset++;
                break;

            // U+002B PLUS SIGN (+)
            case 0x002B:
                // If the input stream starts with a number, ...
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // ... reconsume the current input code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002C COMMA (,)
            case 0x002C:
                // Return a <comma-token>.
                type = TYPE.Comma;
                offset++;
                break;

            // U+002D HYPHEN-MINUS (-)
            case 0x002D:
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // If the input stream starts with a number, reconsume the current input
                    // code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else if (getCharCode(offset + 1) === 0x002D && getCharCode(offset + 2) === 0x003E) {
                    // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS
                    // U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
                    type = TYPE.CDC;
                    offset = offset + 3;
                } else if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // Otherwise, if the input stream starts with an identifier, reconsume the
                    // current input code point, consume an ident-like token, and return it.
                    consumeIdentLikeToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002E FULL STOP (.)
            case 0x002E:
                // If the input stream starts with a number, ...
                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                    // ... reconsume the current input code point, consume a numeric token, and return it.
                    consumeNumericToken();
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+002F SOLIDUS (/)
            case 0x002F:
                // If the next input code point is U+002A ASTERISK (*), i.e. a comment is opening, ...
                if (getCharCode(offset + 1) === 0x002A) {
                    // ... consume it and all following code points up to and including the first
                    // U+002A ASTERISK (*) followed by a U+002F SOLIDUS (/), or up to an EOF code point.
                    type = TYPE.Comment;
                    offset = source.indexOf('*/', offset + 2);
                    // An unterminated comment runs to the end of the source.
                    offset = offset === -1 ? sourceLength : offset + 2;
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+003A COLON (:)
            case 0x003A:
                // Return a <colon-token>.
                type = TYPE.Colon;
                offset++;
                break;

            // U+003B SEMICOLON (;)
            case 0x003B:
                // Return a <semicolon-token>.
                type = TYPE.Semicolon;
                offset++;
                break;

            // U+003C LESS-THAN SIGN (<)
            case 0x003C:
                // If the next 3 input code points are U+0021 EXCLAMATION MARK
                // U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...
                if (getCharCode(offset + 1) === 0x0021 &&
                    getCharCode(offset + 2) === 0x002D &&
                    getCharCode(offset + 3) === 0x002D) {
                    // ... consume them and return a <CDO-token>.
                    type = TYPE.CDO;
                    offset = offset + 4;
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+0040 COMMERCIAL AT (@)
            case 0x0040:
                // If the next 3 input code points would start an identifier, ...
                if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
                    // ... consume a name, create an <at-keyword-token> with its value set
                    // to the returned value, and return it.
                    type = TYPE.AtKeyword;
                    offset = consumeName(source, offset + 1);
                } else {
                    // Otherwise, return a <delim-token> with its value set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+005B LEFT SQUARE BRACKET ([)
            case 0x005B:
                // Return a <[-token>.
                type = TYPE.LeftSquareBracket;
                offset++;
                break;

            // U+005C REVERSE SOLIDUS (\)
            case 0x005C:
                // If the input stream starts with a valid escape, ...
                if (isValidEscape(code, getCharCode(offset + 1))) {
                    // ... reconsume the current input code point, consume an ident-like token, and return it.
                    consumeIdentLikeToken();
                } else {
                    // Otherwise, this is a parse error. Return a <delim-token> with its value
                    // set to the current input code point.
                    type = TYPE.Delim;
                    offset++;
                }
                break;

            // U+005D RIGHT SQUARE BRACKET (])
            case 0x005D:
                // Return a <]-token>.
                type = TYPE.RightSquareBracket;
                offset++;
                break;

            // U+007B LEFT CURLY BRACKET ({)
            case 0x007B:
                // Return a <{-token>.
                type = TYPE.LeftCurlyBracket;
                offset++;
                break;

            // U+007D RIGHT CURLY BRACKET (})
            case 0x007D:
                // Return a <}-token>.
                type = TYPE.RightCurlyBracket;
                offset++;
                break;

            // digit
            case DigitCategory:
                // Reconsume the current input code point, consume a numeric token, and return it.
                consumeNumericToken();
                break;

            // name-start code point
            case NameStartCategory:
                // Reconsume the current input code point, consume an ident-like token, and return it.
                consumeIdentLikeToken();
                break;

            // anything else
            default:
                // Return a <delim-token> with its value set to the current input code point.
                type = TYPE.Delim;
                offset++;
        }

        // put token to stream: report [start, offset), then begin the next token where this one ended
        onToken(type, start, offset);
        start = offset;
    }
}

export * from './types.js';
export * as tokenTypes from './types.js';
export { default as tokenNames } from './names.js';
export * from './char-code-definitions.js';
export * from './utils.js';
export * from './OffsetToLocation.js';
export * from './TokenStream.js';