// Mirror of https://github.com/c9/core (JavaScript, 653 lines, 27 KiB)
/*
 * Copyright (c) 2013 Adobe Systems Incorporated. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

// A simple HTML tokenizer, originally adapted from https://github.com/fb55/htmlparser2
// (MIT-licensed), but with significant customizations for use in HTML live development.

/*jslint vars: true, plusplus: true, devel: true, nomen: true, indent: 4, maxerr: 50, continue: true */
/*global define */
/*unittests: HTML Tokenizer*/

define(function (require, exports, module) {
|
|
|
|
"use strict";
|
|
// Tokenizer states. Each constant receives the next value of the running
// counter `i`, so the numeric values follow declaration order (TEXT is 0).
var i = 0,

    TEXT = i++,
    BEFORE_TAG_NAME = i++, // after <
    IN_TAG_NAME = i++,
    BEFORE_CLOSING_TAG_NAME = i++,
    IN_CLOSING_TAG_NAME = i++,
    AFTER_CLOSING_TAG_NAME = i++,
    AFTER_SELFCLOSE_SLASH = i++,

    // attributes
    BEFORE_ATTRIBUTE_NAME = i++,
    AFTER_QUOTED_ATTRIBUTE_VALUE = i++,
    IN_ATTRIBUTE_NAME = i++,
    AFTER_ATTRIBUTE_NAME = i++,
    BEFORE_ATTRIBUTE_VALUE = i++,
    IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES = i++, // "
    IN_ATTRIBUTE_VALUE_SINGLE_QUOTES = i++, // '
    IN_ATTRIBUTE_VALUE_NO_QUOTES = i++,

    // declarations
    BEFORE_DECLARATION = i++, // !
    IN_DECLARATION = i++,

    // processing instructions
    IN_PROCESSING_INSTRUCTION = i++, // ?

    // comments
    BEFORE_COMMENT = i++,
    IN_COMMENT = i++,
    AFTER_COMMENT_1 = i++,
    AFTER_COMMENT_2 = i++,

    // cdata (the trailing comment is the character consumed on entering the state)
    BEFORE_CDATA_1 = i++, // [
    BEFORE_CDATA_2 = i++, // C
    BEFORE_CDATA_3 = i++, // D
    BEFORE_CDATA_4 = i++, // A
    BEFORE_CDATA_5 = i++, // T
    BEFORE_CDATA_6 = i++, // A
    IN_CDATA = i++, // [
    AFTER_CDATA_1 = i++, // ]
    AFTER_CDATA_2 = i++, // ]

    // special tags: inside the raw content of <script>, <style> or <textarea>
    SPECIAL = i++;
|
|
|
|
/**
 * @private
 * Tests whether a character is one of the whitespace characters the tokenizer
 * recognizes: space, tab, carriage return or line feed.
 * @param {string} c the character to test
 * @return {boolean} true if c is whitespace
 */
function isWhitespace(c) {
    return c === " " || c === "\t" || c === "\r" || c === "\n";
}
|
|
|
|
/**
 * @private
 * Tests whether a character may appear in a tag name: ASCII letters, digits and "-".
 * @param {string} c the character to test
 * @return {boolean} true if c is legal in an HTML tag name
 */
function isLegalInTagName(c) {
    // We allow "-" in tag names since they're popular in Angular custom tag names
    // and will be legal in the web components spec.
    return (/[A-Za-z0-9\-]/).test(c);
}
|
|
|
|
/**
 * @private
 * Tests whether a character may appear in an attribute name. Everything is
 * accepted except the double quote, single quote, "<" and "=".
 * @param {string} c the character to test
 * @return {boolean} true if c is legal in an HTML attribute name
 */
function isLegalInAttributeName(c) {
    return c !== '"' && c !== "'" && c !== "<" && c !== "=";
}
|
|
|
|
/**
 * @private
 * Tests whether a character may appear in an unquoted attribute value.
 * Everything is accepted except "<" and "=".
 * @param {string} c the character to test
 * @return {boolean} true if c is legal in an unquoted attribute value
 */
function isLegalInUnquotedAttributeValue(c) {
    return c !== "<" && c !== "=";
}
|
|
|
|
/**
 * @private
 * Copies a {row, column} position, optionally shifting the column.
 * The row is never adjusted, so the offset must not cross a line boundary.
 * @param {?{row: number, column: number}} pos The position to copy; may be null/undefined.
 * @param {number=} offset Amount to add to the column; treated as 0 when omitted.
 * @return {?{row: number, column: number}} A new position object, or null if pos is falsy.
 */
function _clonePos(pos, offset) {
    return pos ? { row: pos.row, column: pos.column + (offset || 0) } : null;
}
|
|
|
|
/**
 * @constructor
 * A simple HTML tokenizer. See the description of nextToken() for usage details.
 * @param {string} text The HTML document to tokenize.
 */
function Tokenizer(text) {
    this._state = TEXT;
    this._buffer = text;
    // Start of the section currently being accumulated (index and row/column).
    this._sectionStart = 0;
    this._sectionStartPos = { row: 0, column: 0 };
    // Current read position (index and row/column).
    this._index = 0;
    this._indexPos = { row: 0, column: 0 };
    // 0 when not inside a special element; otherwise the lowercased second letter
    // of the tag name as stored by _checkSpecial() ("c" script, "t" style, "e" textarea).
    this._special = 0;
    // Token to return from the current nextToken() call, and one queued behind it.
    this._token = null;
    this._nextToken = null;
}
|
|
|
|
/**
 * Returns the next token in the HTML document, or null if we're at the end of the document.
 * @return {?{type: string, contents: string, start: number, end: number, startPos: ?Object, endPos: ?Object}}
 *     token The next token, with the following fields:
 *    type: The type of token, one of:
 *          "error" - invalid syntax was found, tokenization aborted. Calling nextToken() again will produce undefined results.
 *          "text" - contents contains the text
 *          "opentagname" - an open tag was started; contents contains the tag name
 *          "attribname" - an attribute was encountered; contents contains the attribute name
 *          "attribvalue" - the value for the previous attribname was encountered; contents contains the (unquoted) value
 *              (Note that attributes like checked and disabled might not have values.)
 *          "opentagend" - the end of an open tag was encountered; contents is unspecified
 *          "selfclosingtag" - a "/>" was encountered indicating that a void element was self-closed; contents is unspecified
 *              (Note that this is optional in HTML; void elements like <img> will end with "opentagend", not "selfclosingtag")
 *          "closetag" - a close tag was encountered; contents contains the tag name
 *          "comment" - a comment was encountered; contents contains the body of the comment
 *          "cdata" - a CDATA block was encountered; contents contains the text inside the block
 *          "declaration" - a "<!...>" declaration was encountered; contents contains the text after the "!"
 *          "processinginstruction" - a "<?...>" instruction was encountered; contents contains the text after the "?"
 *    contents: the contents of the token, as specified above. Note that "opentagend" and "selfclosingtag" really specify positions,
 *          not tokens, and so have no contents.
 *    start: the start index of the token contents within the text, or -1 for "opentagend" and "selfclosingtag"
 *    end: the end index of the token contents within the text, or the position of the boundary for "opentagend" and "selfclosingtag"
 *    startPos / endPos: {row, column} equivalents of start and end (startPos is null when start is -1)
 */
Tokenizer.prototype.nextToken = function () {
    this._token = null;

    // A single pass may produce two tokens; hand out the queued one first.
    if (this._nextToken) {
        var result = this._nextToken;
        this._nextToken = null;
        return result;
    }

    while (this._index < this._buffer.length && !this._token) {
        var c = this._buffer.charAt(this._index);
        if (this._state === TEXT) {
            if (c === "<") {
                this._emitTokenIfNonempty("text");
                this._state = BEFORE_TAG_NAME;
                this._startSection();
            }
        } else if (this._state === BEFORE_TAG_NAME) {
            if (c === "/") {
                this._state = BEFORE_CLOSING_TAG_NAME;
            } else if (c === ">" || this._special > 0) {
                this._state = TEXT;
            } else {
                if (c === "!") {
                    this._state = BEFORE_DECLARATION;
                    this._startSection(1);
                } else if (c === "?") {
                    this._state = IN_PROCESSING_INSTRUCTION;
                    this._startSection(1);
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                } else if (!isWhitespace(c)) {
                    this._state = IN_TAG_NAME;
                    this._startSection();
                }
            }
        } else if (this._state === IN_TAG_NAME) {
            if (c === "/") {
                this._emitToken("opentagname");
                this._checkSpecial();
                this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                this._state = AFTER_SELFCLOSE_SLASH;
            } else if (c === ">") {
                this._emitToken("opentagname");
                this._checkSpecial();
                this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                if (this._special) {
                    this._startSpecial();
                } else {
                    this._state = TEXT;
                    this._startSection(1);
                }
            } else if (isWhitespace(c)) {
                this._emitToken("opentagname");
                this._checkSpecial();
                this._state = BEFORE_ATTRIBUTE_NAME;
            } else if (!isLegalInTagName(c)) {
                this._emitSpecialToken("error");
                break;
            }
        } else if (this._state === BEFORE_CLOSING_TAG_NAME) {
            if (c === ">") {
                this._state = TEXT;
            } else if (!isLegalInTagName(c)) {
                this._emitSpecialToken("error");
                break;
            } else if (!isWhitespace(c)) {
                this._state = IN_CLOSING_TAG_NAME;
                this._startSection();
            }
        } else if (this._state === IN_CLOSING_TAG_NAME) {
            if (c === ">") {
                this._emitToken("closetag");
                this._state = TEXT;
                this._startSection(1);
                this._special = 0;
            } else if (isWhitespace(c)) {
                this._emitToken("closetag");
                this._state = AFTER_CLOSING_TAG_NAME;
                this._special = 0;
            } else if (!isLegalInTagName(c)) {
                this._emitSpecialToken("error");
                break;
            }
        } else if (this._state === AFTER_CLOSING_TAG_NAME) {
            if (c === ">") {
                this._state = TEXT;
                this._startSection(1);
            } else if (!isWhitespace(c)) {
                // There must be only whitespace in the closing tag after the name until the ">".
                this._emitSpecialToken("error");
                break;
            }
        } else if (this._state === AFTER_SELFCLOSE_SLASH) {
            // Nothing (even whitespace) can come between the / and > of a self-close.
            if (c === ">") {
                this._state = TEXT;
                this._startSection(1);
            } else {
                this._emitSpecialToken("error");
                break;
            }

        /*
         * attributes
         */
        } else if (this._state === BEFORE_ATTRIBUTE_NAME) {
            if (c === ">") {
                this._state = TEXT;
                this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                if (this._special) {
                    this._startSpecial();
                } else {
                    this._startSection(1);
                }
            } else if (c === "/") {
                this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                this._state = AFTER_SELFCLOSE_SLASH;
            } else if (!isLegalInAttributeName(c)) {
                this._emitSpecialToken("error");
                break;
            } else if (!isWhitespace(c)) {
                this._state = IN_ATTRIBUTE_NAME;
                this._startSection();
            }
        } else if (this._state === IN_ATTRIBUTE_NAME) {
            if (c === "=") {
                this._emitTokenIfNonempty("attribname");
                this._state = BEFORE_ATTRIBUTE_VALUE;
            } else if (isWhitespace(c)) {
                this._emitTokenIfNonempty("attribname");
                this._state = AFTER_ATTRIBUTE_NAME;
            } else if (c === "/" || c === ">") {
                this._emitTokenIfNonempty("attribname");
                this._state = BEFORE_ATTRIBUTE_NAME;
                // Reprocess the same character in the new state (no index advance).
                continue;
            } else if (!isLegalInAttributeName(c)) {
                this._emitSpecialToken("error");
                break;
            }
        } else if (this._state === AFTER_ATTRIBUTE_NAME) {
            if (c === "=") {
                this._state = BEFORE_ATTRIBUTE_VALUE;
            } else if (c === "/" || c === ">") {
                this._state = BEFORE_ATTRIBUTE_NAME;
                // Reprocess the same character in the new state (no index advance).
                continue;
            } else if (!isLegalInAttributeName(c)) {
                this._emitSpecialToken("error");
                break;
            } else if (!isWhitespace(c)) {
                this._state = IN_ATTRIBUTE_NAME;
                this._startSection();
            }
        } else if (this._state === BEFORE_ATTRIBUTE_VALUE) {
            if (c === "\"") {
                this._state = IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES;
                this._startSection(1);
            } else if (c === "'") {
                this._state = IN_ATTRIBUTE_VALUE_SINGLE_QUOTES;
                this._startSection(1);
            } else if (!isLegalInUnquotedAttributeValue(c)) {
                this._emitSpecialToken("error");
                break;
            } else if (!isWhitespace(c)) {
                this._state = IN_ATTRIBUTE_VALUE_NO_QUOTES;
                this._startSection();
            }
        } else if (this._state === IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES) {
            if (c === "\"") {
                this._emitToken("attribvalue");
                this._state = AFTER_QUOTED_ATTRIBUTE_VALUE;
            }
        } else if (this._state === IN_ATTRIBUTE_VALUE_SINGLE_QUOTES) {
            if (c === "'") {
                this._state = AFTER_QUOTED_ATTRIBUTE_VALUE;
                this._emitToken("attribvalue");
            }
        } else if (this._state === IN_ATTRIBUTE_VALUE_NO_QUOTES) {
            if (c === ">") {
                this._emitToken("attribvalue");
                this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                if (this._special) {
                    this._startSpecial();
                } else {
                    this._state = TEXT;
                    this._startSection(1);
                }
            } else if (isWhitespace(c)) {
                this._emitToken("attribvalue");
                this._state = BEFORE_ATTRIBUTE_NAME;
            } else if (!isLegalInUnquotedAttributeValue(c)) {
                this._emitSpecialToken("error");
                break;
            }
        } else if (this._state === AFTER_QUOTED_ATTRIBUTE_VALUE) {
            // There must be at least one whitespace between the end of a quoted
            // attribute value and the next attribute, if any.
            if (c === ">") {
                this._state = TEXT;
                this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                if (this._special) {
                    this._startSpecial();
                } else {
                    this._startSection(1);
                }
            } else if (c === "/") {
                this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                this._state = AFTER_SELFCLOSE_SLASH;
            } else if (isWhitespace(c)) {
                this._state = BEFORE_ATTRIBUTE_NAME;
            } else {
                this._emitSpecialToken("error");
                break;
            }

        /*
         * declarations
         */
        } else if (this._state === BEFORE_DECLARATION) {
            if (c === "[") {
                this._state = BEFORE_CDATA_1;
            } else if (c === "-") {
                this._state = BEFORE_COMMENT;
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === IN_DECLARATION) {
            if (c === ">") {
                this._emitToken("declaration");
                this._state = TEXT;
                this._startSection(1);
            }

        /*
         * processing instructions
         */
        } else if (this._state === IN_PROCESSING_INSTRUCTION) {
            if (c === ">") {
                this._emitToken("processinginstruction");
                this._state = TEXT;
                this._startSection(1);
            }

        /*
         * comments
         */
        } else if (this._state === BEFORE_COMMENT) {
            if (c === "-") {
                this._state = IN_COMMENT;
                this._startSection(1);
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === IN_COMMENT) {
            if (c === "-") {
                this._state = AFTER_COMMENT_1;
            }
        } else if (this._state === AFTER_COMMENT_1) {
            if (c === "-") {
                this._state = AFTER_COMMENT_2;
            } else {
                this._state = IN_COMMENT;
            }
        } else if (this._state === AFTER_COMMENT_2) {
            if (c === ">") {
                //remove 2 trailing chars
                // It should be okay to just decrement the char position by 2 because we know neither of the previous
                // characters is a newline.
                this._emitToken("comment", this._index - 2, _clonePos(this._indexPos, -2));
                this._state = TEXT;
                this._startSection(1);
            } else if (c !== "-") {
                this._state = IN_COMMENT;
            }
            // else: stay in AFTER_COMMENT_2 (`--->`)

        /*
         * cdata
         */
        } else if (this._state === BEFORE_CDATA_1) {
            if (c === "C") {
                this._state = BEFORE_CDATA_2;
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === BEFORE_CDATA_2) {
            if (c === "D") {
                this._state = BEFORE_CDATA_3;
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === BEFORE_CDATA_3) {
            if (c === "A") {
                this._state = BEFORE_CDATA_4;
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === BEFORE_CDATA_4) {
            if (c === "T") {
                this._state = BEFORE_CDATA_5;
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === BEFORE_CDATA_5) {
            if (c === "A") {
                this._state = BEFORE_CDATA_6;
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === BEFORE_CDATA_6) {
            if (c === "[") {
                this._state = IN_CDATA;
                this._startSection(1);
            } else {
                this._state = IN_DECLARATION;
            }
        } else if (this._state === IN_CDATA) {
            if (c === "]") {
                this._state = AFTER_CDATA_1;
            }
        } else if (this._state === AFTER_CDATA_1) {
            if (c === "]") {
                this._state = AFTER_CDATA_2;
            } else {
                this._state = IN_CDATA;
            }
        } else if (this._state === AFTER_CDATA_2) {
            if (c === ">") {
                //remove 2 trailing chars
                // It should be okay to just decrement the char position by 2 because we know neither of the previous
                // characters is a newline.
                this._emitToken("cdata", this._index - 2, _clonePos(this._indexPos, -2));
                this._state = TEXT;
                this._startSection(1);
            } else if (c !== "]") {
                this._state = IN_CDATA;
            }
            //else: stay in AFTER_CDATA_2 (`]]]>`)

        /*
         * special tags
         */
        } else if (this._state === SPECIAL) {
            if (c === "<") {
                // Only the matching close tag ends the raw content. The name may be
                // followed by ">" or by any whitespace the closing-tag states accept.
                var re;
                switch (this._special) {
                case "c": re = /^\/script[ \t\r\n>]/i; break;
                case "t": re = /^\/style[ \t\r\n>]/i; break;
                case "e": re = /^\/textarea[ \t\r\n>]/i; break;
                default: re = null; // unknown marker: never matches instead of throwing
                }
                if (re && re.test(this._buffer.substring(this._index + 1, this._index + 15))) {
                    // Reprocess this "<" in TEXT so the pending text is emitted
                    // and the close tag is tokenized normally.
                    this._state = TEXT;
                    continue;
                }
            }
        } else {
            console.error("HTMLTokenizer: Encountered unknown state");
            this._emitSpecialToken("error");
            break;
        }

        if (c === "\n") {
            this._indexPos.row++;
            this._indexPos.column = 0;
        } else {
            this._indexPos.column++;
        }
        this._index++;
    }

    // Handle EOF only when this pass produced no token. Otherwise up to two tokens
    // may already be pending and a third would overwrite the queued one; the EOF
    // condition is then picked up by a later call.
    if (!this._token && this._index === this._buffer.length) {
        // We hit EOF in the middle of processing something else.
        if (this._state !== TEXT) {
            this._emitSpecialToken("error");
        } else {
            this._emitTokenIfNonempty("text");
        }
    }
    return this._token;
};
|
|
|
|
|
|
/**
 * @private
 * Checks whether the token just emitted names a raw-content element
 * (script, style or textarea, case-insensitively). If so, records the
 * lowercased second letter of the name in this._special
 * ("c" script, "t" style, "e" textarea).
 * Only the exact names match: names that merely start or end with one of them
 * (e.g. "styled", "x-textarea") are ordinary tags.
 * @return {boolean} true if the current token is one of the special tag names
 */
Tokenizer.prototype._checkSpecial = function () {
    var name = this._token && this._token.contents;
    // Group the alternatives so both anchors apply to every name.
    if (/^(?:script|style|textarea)$/i.test(name)) {
        this._special = name[1].toLowerCase();
        return true;
    }
    return false;
};
|
|
|
|
/**
 * @private
 * Switches the tokenizer into the SPECIAL state and starts a new section
 * one character past the current index.
 * @return {boolean} always true
 */
Tokenizer.prototype._startSpecial = function () {
    this._state = SPECIAL;
    this._startSection(1);
    return true;
};
|
|
|
|
/**
 * @private
 * Marks the start of a new section (the span that the next emitted token will cover)
 * at the current index plus an optional offset.
 * @param {number=} offset Number of characters past the current index; defaults to 0.
 */
Tokenizer.prototype._startSection = function (offset) {
    offset = offset || 0;
    this._sectionStart = this._index + offset;

    // Normally it wouldn't be safe to assume that we can just add the offset to the
    // character position, because there might be a newline, which would require us to
    // move to the next line. However, in all the cases where this is called, we are
    // adjusting for characters that we know are not newlines.
    this._sectionStartPos = _clonePos(this._indexPos, offset);
};
|
|
|
|
/**
 * @private
 * Extract the portion of the buffer since _sectionStart and set it to be the next token we return
 * from `nextToken()`. If there's already a _token, we stuff it in _nextToken instead.
 * @param {string} type The token's type (see documentation for `nextToken()`)
 * @param {number=} index If specified, the index to use as the end of the token; uses this._index if not specified
 * @param {{row: number, column: number}=} indexPos If specified, the row/column to use as the end of the token;
 *      uses this._indexPos if not specified
 */
Tokenizer.prototype._setToken = function (type, index, indexPos) {
    if (index === undefined) {
        index = this._index;
    }
    if (indexPos === undefined) {
        indexPos = this._indexPos;
    }
    var token = {
        type: type,
        // A section start of -1 means "no content" (boundary tokens and errors).
        contents: this._sectionStart === -1 ? "" : this._buffer.substring(this._sectionStart, index),
        start: this._sectionStart,
        end: index,
        // Positions are copied so later cursor updates don't mutate the token.
        startPos: _clonePos(this._sectionStartPos),
        endPos: _clonePos(indexPos)
    };
    if (this._token) {
        // Queue this token to be emitted next. In theory it would be more general to have
        // an arbitrary-length queue, but currently we only ever emit at most two tokens in a
        // single pass through the tokenization loop.
        if (this._nextToken) {
            console.error("HTMLTokenizer: Tried to emit more than two tokens in a single call");
        }
        this._nextToken = token;
    } else {
        this._token = token;
    }
};
|
|
|
|
/**
 * @private
 * Sets the token to be returned from `nextToken()` and resets the section start to an invalid value.
 * this._sectionStart should be set to a valid value before the next call to one of the `_emit` methods.
 * @param {string} type The token's type (see documentation for `nextToken()`)
 * @param {number=} index If specified, the index to use as the end of the token; uses this._index if not specified
 * @param {{row: number, column: number}=} indexPos If specified, the row/column to use as the end of the token;
 *      passed through to `_setToken()`
 */
Tokenizer.prototype._emitToken = function (type, index, indexPos) {
    this._setToken(type, index, indexPos);
    this._sectionStart = -1;
    this._sectionStartPos = null;
};
|
|
|
|
/**
 * @private
 * Like `_emitToken()`, but used for special tokens that don't have real content (like opentagend and selfclosingtag).
 * @param {string} type The token's type (see documentation for `nextToken()`)
 * @param {number=} index If specified, the index to use as the end of the token; uses this._index if not specified
 * @param {{row: number, column: number}=} indexPos If specified, the row/column to use as the end of the token;
 *      passed through to `_emitToken()`
 */
Tokenizer.prototype._emitSpecialToken = function (type, index, indexPos) {
    // Force the section start to be -1, since these tokens don't have meaningful content--they're
    // just marking particular boundaries we care about (end of an open tag or a self-closing tag).
    this._sectionStart = -1;
    this._sectionStartPos = null;
    this._emitToken(type, index, indexPos);
};
|
|
|
|
/**
 * @private
 * Like `_emitToken()`, but only emits a token if there is actually content in it. Note that this still
 * resets this._sectionStart to an invalid value even if there is no content, so a new section must be
 * started before the next `_emit`.
 * @param {string} type The token's type (see documentation for `nextToken()`)
 */
Tokenizer.prototype._emitTokenIfNonempty = function (type) {
    // Emit only for a valid section that spans at least one character.
    if (this._index > this._sectionStart && this._sectionStart !== -1) {
        this._setToken(type);
    }
    this._sectionStart = -1;
    this._sectionStartPos = null;
};
|
|
|
|
exports.Tokenizer = Tokenizer;
|
|
}); |