// c9-core/plugins/c9.ide.language.html.diff/HTMLTokenizer.js

/*
 * Copyright (c) 2013 Adobe Systems Incorporated. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

// A simple HTML tokenizer, originally adapted from https://github.com/fb55/htmlparser2
// (MIT-licensed), but with significant customizations for use in HTML live development.

/*jslint vars: true, plusplus: true, devel: true, nomen: true, indent: 4, maxerr: 50, continue: true */
/*global define */
/*unittests: HTML Tokenizer*/

define(function (require, exports, module) {

    "use strict";
    // Tokenizer states. Each state gets a distinct small integer from the running counter `i`;
    // the values are only ever compared for equality, so just their uniqueness matters.
    var i = 0;

    // Text and tag names
    var TEXT = i++;
    var BEFORE_TAG_NAME = i++; // just consumed "<"
    var IN_TAG_NAME = i++;
    var BEFORE_CLOSING_TAG_NAME = i++;
    var IN_CLOSING_TAG_NAME = i++;
    var AFTER_CLOSING_TAG_NAME = i++;
    var AFTER_SELFCLOSE_SLASH = i++;

    // Attributes
    var BEFORE_ATTRIBUTE_NAME = i++;
    var AFTER_QUOTED_ATTRIBUTE_VALUE = i++;
    var IN_ATTRIBUTE_NAME = i++;
    var AFTER_ATTRIBUTE_NAME = i++;
    var BEFORE_ATTRIBUTE_VALUE = i++;
    var IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES = i++; // value opened with "
    var IN_ATTRIBUTE_VALUE_SINGLE_QUOTES = i++; // value opened with '
    var IN_ATTRIBUTE_VALUE_NO_QUOTES = i++;

    // Declarations
    var BEFORE_DECLARATION = i++; // just consumed "!"
    var IN_DECLARATION = i++;

    // Processing instructions
    var IN_PROCESSING_INSTRUCTION = i++; // just consumed "?"

    // Comments
    var BEFORE_COMMENT = i++;
    var IN_COMMENT = i++;
    var AFTER_COMMENT_1 = i++;
    var AFTER_COMMENT_2 = i++;

    // CDATA; the trailing note is the character consumed on entering the state
    var BEFORE_CDATA_1 = i++; // [
    var BEFORE_CDATA_2 = i++; // C
    var BEFORE_CDATA_3 = i++; // D
    var BEFORE_CDATA_4 = i++; // A
    var BEFORE_CDATA_5 = i++; // T
    var BEFORE_CDATA_6 = i++; // A
    var IN_CDATA = i++; // [
    var AFTER_CDATA_1 = i++; // ]
    var AFTER_CDATA_2 = i++; // ]

    // Raw content of a special tag (script, style or textarea)
    var SPECIAL = i++;

    /**
     * @private
     * Tells whether a character is one of the four whitespace characters the tokenizer
     * recognizes: space, tab, carriage return or line feed.
     * @param {string} c the character to test
     * @return {boolean} true if c is whitespace
     */
    function isWhitespace(c) {
        switch (c) {
        case " ":
        case "\t":
        case "\r":
        case "\n":
            return true;
        default:
            return false;
        }
    }

    /**
     * @private
     * Tells whether a character may appear in an HTML tag name. ASCII letters and digits are
     * accepted, and so is "-", which is common in Angular custom tag names and is legal in
     * the web components spec.
     * @param {string} c the character to test
     * @return {boolean} true if c is legal in an HTML tag name
     */
    function isLegalInTagName(c) {
        var tagNameChar = /[\-0-9A-Za-z]/;
        return tagNameChar.test(c);
    }

    /**
     * @private
     * Tells whether a character may appear in an HTML attribute name. Everything is accepted
     * except the two quote characters, "<" and "=".
     * @param {string} c the character to test
     * @return {boolean} true if c is legal in an HTML attribute name
     */
    function isLegalInAttributeName(c) {
        var isForbidden = (c === '"' || c === "'" || c === "<" || c === "=");
        return !isForbidden;
    }

    /**
     * @private
     * Tells whether a character may appear in an attribute value that is not wrapped in
     * quotes. Only "<" and "=" are rejected.
     * @param {string} c the character to test
     * @return {boolean} true if c is legal in an unquoted attribute value
     */
    function isLegalInUnquotedAttributeValue(c) {
        var isForbidden = (c === "<" || c === "=");
        return !isForbidden;
    }

    /**
     * @private
     * Copies a {row, column} position, optionally shifting the column.
     * @param {?{row: number, column: number}} pos the position to copy
     * @param {number=} offset amount to add to the column; treated as 0 when omitted
     * @return {?{row: number, column: number}} a new position object, or null if pos is falsy
     */
    function _clonePos(pos, offset) {
        if (!pos) {
            return null;
        }
        var shift = offset || 0;
        return { row: pos.row, column: pos.column + shift };
    }

    /**
     * @constructor
     * A simple HTML tokenizer. See the description of nextToken() for usage details.
     * @param {string} text The HTML document to tokenize.
     */
    function Tokenizer(text) {
        // Input and state-machine state
        this._buffer = text;
        this._state = TEXT;
        this._special = 0; // 0, or a one-letter key set by _checkSpecial() for script/style/textarea

        // Current read position, as a flat index and as a row/column pair
        this._index = 0;
        this._indexPos = { row: 0, column: 0 };

        // Start of the section that will become the contents of the next token
        this._sectionStart = 0;
        this._sectionStartPos = { row: 0, column: 0 };

        // Token produced by the current nextToken() call, plus at most one queued follow-up
        this._token = null;
        this._nextToken = null;
    }

    /**
     * Returns the next token in the HTML document, or null if we're at the end of the document.
     *
     * The tokenizer is a character-at-a-time state machine: each call resumes at this._index in
     * this._state and runs until one of the `_emit` helpers has produced a token or the buffer is
     * exhausted. A single pass can produce two tokens (e.g. "opentagname" followed by "opentagend");
     * the second is parked in this._nextToken and handed out by the following call.
     *
     * @return {?{type: string, contents: string, start: number, end: number, startPos: ?{row: number, column: number}, endPos: ?{row: number, column: number}}} token The next token, with the following fields:
     *    type: The type of token, one of:
     *          "error" - invalid syntax was found, tokenization aborted. Calling nextToken() again will produce undefined results.
     *          "text" - contents contains the text
     *          "opentagname" - an open tag was started; contents contains the tag name
     *          "attribname" - an attribute was encountered; contents contains the attribute name
     *          "attribvalue" - the value for the previous attribname was encountered; contents contains the (unquoted) value
     *              (Note that attributes like checked and disabled might not have values.)
     *          "opentagend" - the end of an open tag was encountered; contents is unspecified
     *          "selfclosingtag" - a "/>" was encountered indicating that a void element was self-closed; contents is unspecified
     *              (Note that this is optional in HTML; void elements like <img> will end with "opentagend", not "selfclosingtag")
     *          "closetag" - a close tag was encountered; contents contains the tag name
     *          "comment" - a comment was encountered; contents contains the body of the comment
     *          "cdata" - a CDATA block was encountered; contents contains the text inside the block
     *          "declaration" - a `<!...>` declaration was encountered; contents contains the text between "<!" and ">"
     *          "processinginstruction" - a `<?...>` instruction was encountered; contents contains the text between "<?" and ">"
     *    contents: the contents of the token, as specified above. Note that "opentagend" and "selfclosingtag" really specify positions,
     *          not tokens, and so have no contents.
     *    start: the start index of the token contents within the text, or -1 for "opentagend" and "selfclosingtag"
     *    end: the end index of the token contents within the text, or the position of the boundary for "opentagend" and "selfclosingtag"
     *    startPos: the {row, column} equivalent of start, or null when start is -1
     *    endPos: the {row, column} equivalent of end
     */
    Tokenizer.prototype.nextToken = function () {
        this._token = null;

        // A second token queued by the previous call takes priority over scanning further.
        if (this._nextToken) {
            var result = this._nextToken;
            this._nextToken = null;
            return result;
        }

        // Consume one character per iteration until a token has been emitted or we run out of input.
        while (this._index < this._buffer.length && !this._token) {
            var c = this._buffer.charAt(this._index);
            if (this._state === TEXT) {
                if (c === "<") {
                    // Flush any text collected so far; the new section starts at the "<" itself.
                    this._emitTokenIfNonempty("text");
                    this._state = BEFORE_TAG_NAME;
                    this._startSection();
                }
            } else if (this._state === BEFORE_TAG_NAME) {
                if (c === "/") {
                    this._state = BEFORE_CLOSING_TAG_NAME;
                // NOTE(review): _checkSpecial() stores a one-letter string in _special, and a
                // comparison like `"c" > 0` is false, so only the `c === ">"` half of this test
                // looks reachable -- confirm whether the second half is leftover code.
                } else if (c === ">" || this._special > 0) {
                    this._state = TEXT;
                } else {
                    if (c === "!") {
                        // Declaration, comment or CDATA; the section starts after the "!".
                        this._state = BEFORE_DECLARATION;
                        this._startSection(1);
                    } else if (c === "?") {
                        // Processing instruction; the section starts after the "?".
                        this._state = IN_PROCESSING_INSTRUCTION;
                        this._startSection(1);
                    } else if (!isLegalInTagName(c)) {
                        // This also rejects whitespace directly after "<".
                        this._emitSpecialToken("error");
                        break;
                    } else if (!isWhitespace(c)) {
                        // First character of the tag name; it is part of the section.
                        this._state = IN_TAG_NAME;
                        this._startSection();
                    }
                }
            } else if (this._state === IN_TAG_NAME) {
                if (c === "/") {
                    // Two tokens in one pass: the tag name now, "selfclosingtag" queued for the
                    // next call. Its end is index + 2, i.e. just past the ">" that
                    // AFTER_SELFCLOSE_SLASH requires to follow.
                    this._emitToken("opentagname");
                    this._checkSpecial();
                    this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                    this._state = AFTER_SELFCLOSE_SLASH;
                } else if (c === ">") {
                    // Tag name now, "opentagend" (positioned just past the ">") queued.
                    this._emitToken("opentagname");
                    this._checkSpecial();
                    this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                    if (this._special) {
                        // script/style/textarea: switch to raw-content scanning.
                        this._startSpecial();
                    } else {
                        this._state = TEXT;
                        this._startSection(1);
                    }
                } else if (isWhitespace(c)) {
                    this._emitToken("opentagname");
                    this._checkSpecial();
                    this._state = BEFORE_ATTRIBUTE_NAME;
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === BEFORE_CLOSING_TAG_NAME) {
                if (c === ">") {
                    // "</>": no name, so no token is emitted.
                    this._state = TEXT;
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                } else if (!isWhitespace(c)) {
                    this._state = IN_CLOSING_TAG_NAME;
                    this._startSection();
                }
            } else if (this._state === IN_CLOSING_TAG_NAME) {
                if (c === ">") {
                    this._emitToken("closetag");
                    this._state = TEXT;
                    this._startSection(1);
                    // Any close tag ends special-tag mode.
                    this._special = 0;
                } else if (isWhitespace(c)) {
                    this._emitToken("closetag");
                    this._state = AFTER_CLOSING_TAG_NAME;
                    this._special = 0;
                } else if (!isLegalInTagName(c)) {
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === AFTER_CLOSING_TAG_NAME) {
                if (c === ">") {
                    this._state = TEXT;
                    this._startSection(1);
                } else if (!isWhitespace(c)) {
                    // There must be only whitespace in the closing tag after the name until the ">".
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === AFTER_SELFCLOSE_SLASH) {
                // Nothing (even whitespace) can come between the / and > of a self-close.
                if (c === ">") {
                    this._state = TEXT;
                    this._startSection(1);
                } else {
                    this._emitSpecialToken("error");
                    break;
                }

            /*
            * attributes
            */
            } else if (this._state === BEFORE_ATTRIBUTE_NAME) {
                if (c === ">") {
                    this._state = TEXT;
                    this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                    if (this._special) {
                        // _startSpecial() replaces the TEXT state set above with SPECIAL.
                        this._startSpecial();
                    } else {
                        this._startSection(1);
                    }
                } else if (c === "/") {
                    this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                    this._state = AFTER_SELFCLOSE_SLASH;
                } else if (!isLegalInAttributeName(c)) {
                    this._emitSpecialToken("error");
                    break;
                } else if (!isWhitespace(c)) {
                    this._state = IN_ATTRIBUTE_NAME;
                    this._startSection();
                }
            } else if (this._state === IN_ATTRIBUTE_NAME) {
                if (c === "=") {
                    this._emitTokenIfNonempty("attribname");
                    this._state = BEFORE_ATTRIBUTE_VALUE;
                } else if (isWhitespace(c)) {
                    this._emitTokenIfNonempty("attribname");
                    this._state = AFTER_ATTRIBUTE_NAME;
                } else if (c === "/" || c === ">") {
                    // Valueless attribute ending the tag. `continue` skips the index advance at
                    // the bottom of the loop so BEFORE_ATTRIBUTE_NAME gets to handle this same
                    // character (on the next call, since a token has just been emitted).
                    this._emitTokenIfNonempty("attribname");
                    this._state = BEFORE_ATTRIBUTE_NAME;
                    continue;
                } else if (!isLegalInAttributeName(c)) {
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === AFTER_ATTRIBUTE_NAME) {
                if (c === "=") {
                    this._state = BEFORE_ATTRIBUTE_VALUE;
                } else if (c === "/" || c === ">") {
                    // Re-run this character through BEFORE_ATTRIBUTE_NAME without advancing.
                    this._state = BEFORE_ATTRIBUTE_NAME;
                    continue;
                } else if (!isLegalInAttributeName(c)) {
                    this._emitSpecialToken("error");
                    break;
                } else if (!isWhitespace(c)) {
                    // The previous attribute had no value; a new attribute name begins here.
                    this._state = IN_ATTRIBUTE_NAME;
                    this._startSection();
                }
            } else if (this._state === BEFORE_ATTRIBUTE_VALUE) {
                if (c === "\"") {
                    // The section starts after the opening quote, so the quote is not in the value.
                    this._state = IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES;
                    this._startSection(1);
                } else if (c === "'") {
                    this._state = IN_ATTRIBUTE_VALUE_SINGLE_QUOTES;
                    this._startSection(1);
                } else if (!isLegalInUnquotedAttributeValue(c)) {
                    this._emitSpecialToken("error");
                    break;
                } else if (!isWhitespace(c)) {
                    this._state = IN_ATTRIBUTE_VALUE_NO_QUOTES;
                    this._startSection();
                }
            } else if (this._state === IN_ATTRIBUTE_VALUE_DOUBLE_QUOTES) {
                if (c === "\"") {
                    // The token ends at the closing quote's index, which excludes the quote.
                    this._emitToken("attribvalue");
                    this._state = AFTER_QUOTED_ATTRIBUTE_VALUE;
                }
            } else if (this._state === IN_ATTRIBUTE_VALUE_SINGLE_QUOTES) {
                if (c === "'") {
                    this._state = AFTER_QUOTED_ATTRIBUTE_VALUE;
                    this._emitToken("attribvalue");
                }
            } else if (this._state === IN_ATTRIBUTE_VALUE_NO_QUOTES) {
                if (c === ">") {
                    // Value now, "opentagend" queued for the next call.
                    this._emitToken("attribvalue");
                    this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                    if (this._special) {
                        this._startSpecial();
                    } else {
                        this._state = TEXT;
                        this._startSection(1);
                    }
                } else if (isWhitespace(c)) {
                    this._emitToken("attribvalue");
                    this._state = BEFORE_ATTRIBUTE_NAME;
                } else if (!isLegalInUnquotedAttributeValue(c)) {
                    this._emitSpecialToken("error");
                    break;
                }
            } else if (this._state === AFTER_QUOTED_ATTRIBUTE_VALUE) {
                // There must be at least one whitespace between the end of a quoted
                // attribute value and the next attribute, if any.
                if (c === ">") {
                    this._state = TEXT;
                    this._emitSpecialToken("opentagend", this._index + 1, _clonePos(this._indexPos, 1));
                    if (this._special) {
                        this._startSpecial();
                    } else {
                        this._startSection(1);
                    }
                } else if (c === "/") {
                    this._emitSpecialToken("selfclosingtag", this._index + 2, _clonePos(this._indexPos, 2));
                    this._state = AFTER_SELFCLOSE_SLASH;
                } else if (isWhitespace(c)) {
                    this._state = BEFORE_ATTRIBUTE_NAME;
                } else {
                    this._emitSpecialToken("error");
                    break;
                }

            /*
            * declarations
            */
            } else if (this._state === BEFORE_DECLARATION) {
                // Decide between CDATA ("<![..."), a comment ("<!-...") and a plain declaration.
                if (c === "[") {
                    this._state = BEFORE_CDATA_1;
                } else if (c === "-") {
                    this._state = BEFORE_COMMENT;
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === IN_DECLARATION) {
                if (c === ">") {
                    this._emitToken("declaration");
                    this._state = TEXT;
                    this._startSection(1);
                }


            /*
            * processing instructions
            */
            } else if (this._state === IN_PROCESSING_INSTRUCTION) {
                if (c === ">") {
                    this._emitToken("processinginstruction");
                    this._state = TEXT;
                    this._startSection(1);
                }


            /*
            * comments
            */
            } else if (this._state === BEFORE_COMMENT) {
                if (c === "-") {
                    // Second "-" of "<!--": the comment body starts after it.
                    this._state = IN_COMMENT;
                    this._startSection(1);
                } else {
                    // A single "-" only: treat the construct as a declaration instead.
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === IN_COMMENT) {
                if (c === "-") {
                    this._state = AFTER_COMMENT_1;
                }
            } else if (this._state === AFTER_COMMENT_1) {
                if (c === "-") {
                    this._state = AFTER_COMMENT_2;
                } else {
                    this._state = IN_COMMENT;
                }
            } else if (this._state === AFTER_COMMENT_2) {
                if (c === ">") {
                    //remove 2 trailing chars
                    // It should be okay to just decrement the char position by 2 because we know neither of the previous
                    // characters is a newline.
                    this._emitToken("comment", this._index - 2, _clonePos(this._indexPos, -2));
                    this._state = TEXT;
                    this._startSection(1);
                } else if (c !== "-") {
                    this._state = IN_COMMENT;
                }
                // else: stay in AFTER_COMMENT_2 (`--->`)


            /*
            * cdata
            */
            // The BEFORE_CDATA_n states match the literal "CDATA[" one character at a time;
            // any mismatch falls back to treating the construct as a declaration.
            } else if (this._state === BEFORE_CDATA_1) {
                if (c === "C") {
                    this._state = BEFORE_CDATA_2;
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === BEFORE_CDATA_2) {
                if (c === "D") {
                    this._state = BEFORE_CDATA_3;
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === BEFORE_CDATA_3) {
                if (c === "A") {
                    this._state = BEFORE_CDATA_4;
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === BEFORE_CDATA_4) {
                if (c === "T") {
                    this._state = BEFORE_CDATA_5;
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === BEFORE_CDATA_5) {
                if (c === "A") {
                    this._state = BEFORE_CDATA_6;
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === BEFORE_CDATA_6) {
                if (c === "[") {
                    // The CDATA body starts after the second "[".
                    this._state = IN_CDATA;
                    this._startSection(1);
                } else {
                    this._state = IN_DECLARATION;
                }
            } else if (this._state === IN_CDATA) {
                if (c === "]") {
                    this._state = AFTER_CDATA_1;
                }
            } else if (this._state === AFTER_CDATA_1) {
                if (c === "]") {
                    this._state = AFTER_CDATA_2;
                } else {
                    this._state = IN_CDATA;
                }
            } else if (this._state === AFTER_CDATA_2) {
                if (c === ">") {
                    //remove 2 trailing chars
                    // It should be okay to just decrement the char position by 2 because we know neither of the previous
                    // characters is a newline.
                    this._emitToken("cdata", this._index - 2, _clonePos(this._indexPos, -2));
                    this._state = TEXT;
                    this._startSection(1);
                } else if (c !== "]") {
                    this._state = IN_CDATA;
                }
                //else: stay in AFTER_CDATA_2 (`]]]>`)


            /*
            * special tags
            */
            } else if (this._state === SPECIAL) {
                // Inside script/style/textarea content, only the matching close tag ends the raw
                // content; every other "<" is skipped over as part of the content.
                if (c === "<") {
                    var re;
                    // _special holds the second letter of the tag name, as set by _checkSpecial().
                    // NOTE(review): `re` stays undefined if _special holds any other value, and
                    // `re.exec` below would then throw -- confirm _special can only be "c", "t"
                    // or "e" when this state is active.
                    switch (this._special) {
                        case "c": re = /^\/script[ >]/i; break;
                        case "t": re = /^\/style[ >]/i ; break;
                        case "e": re = /^\/textarea[ >]/i ; break;
                    }
                    // Look ahead at up to 14 characters following the "<".
                    var m = re.exec(this._buffer.substring(this._index + 1, this._index + 15));
                    if (m) {
                        // Hand this same "<" to the TEXT state, which flushes the raw content as
                        // a "text" token and then tokenizes the close tag normally.
                        this._state = TEXT;
                        continue;
                    }
                }
            } else {
                console.error("HTMLTokenizer: Encountered unknown state");
                this._emitSpecialToken("error");
                break;
            }

            // Advance past the character just handled, keeping the row/column position in sync
            // with the flat index.
            if (c === "\n") {
                this._indexPos.row++;
                this._indexPos.column = 0;
            } else {
                this._indexPos.column++;
            }
            this._index++;
        }

        if (this._index === this._buffer.length) {
            // We hit EOF in the middle of processing something else.
            if (this._state !== TEXT) {
                this._emitSpecialToken("error");
            } else {
                // Flush whatever trailing text is left, if any.
                this._emitTokenIfNonempty("text");
            }
        }
        return this._token;
    };


    /**
     * @private
     * Inspects the tag name of the token that was just emitted and records whether that tag is
     * one whose content must be consumed verbatim (script, style or textarea).
     *
     * On a match, this._special is set to the lowercased second letter of the tag name ("c" for
     * script, "t" for style, "e" for textarea), which is the key nextToken() switches on when it
     * looks for the matching close tag. On a mismatch this._special is reset to 0, so a flag left
     * behind by an earlier tag (for example a self-closed `<script/>`, which never produces a
     * "closetag") cannot leak onto an unrelated element.
     *
     * @return {boolean} true if the current token names a special tag
     */
    Tokenizer.prototype._checkSpecial = function () {
        var name = this._token && this._token.contents;
        // The whole name must match. Without the group, "^" and "$" would each bind to only one
        // alternative, so names such as "styled-button" or "my-textarea" would be accepted and
        // leave this._special holding a letter nextToken() has no close-tag pattern for.
        if (typeof name === "string" && /^(?:script|style|textarea)$/i.test(name)) {
            this._special = name.charAt(1).toLowerCase();
            return true;
        }
        this._special = 0;
        return false;
    };

    /**
     * @private
     * Switches the tokenizer into the SPECIAL state and starts a new section one character past
     * the current index.
     * @return {boolean} always true
     */
    Tokenizer.prototype._startSpecial = function () {
        var self = this;
        self._state = SPECIAL;
        self._startSection(1);
        return true;
    };

    /**
     * @private
     * Marks the beginning of the section that the next emitted token will cover.
     * @param {number=} offset distance from the current index at which the section starts;
     *      treated as 0 when omitted
     */
    Tokenizer.prototype._startSection = function (offset) {
        var delta = offset || 0;

        // Adding the offset straight to the column is not safe in general, since crossing a
        // newline would require moving to the next row. Every caller, however, only skips over
        // characters that are known not to be newlines.
        this._sectionStartPos = _clonePos(this._indexPos, delta);
        this._sectionStart = this._index + delta;
    };

    /**
     * @private
     * Builds a token from the portion of the buffer between _sectionStart and the given end, and
     * makes it the token `nextToken()` returns. If a token is already pending for this pass, the
     * new one is parked in _nextToken so that the following `nextToken()` call returns it.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     * @param {number=} index If specified, the index to use as the end of the token; uses this._index if not specified
     * @param {{row: number, column: number}=} indexPos If specified, the row/column position to use as the
     *      end of the token; uses this._indexPos if not specified
     */
    Tokenizer.prototype._setToken = function (type, index, indexPos) {
        var endIndex = (index === undefined) ? this._index : index;
        var endPos = (indexPos === undefined) ? this._indexPos : indexPos;
        var hasSection = this._sectionStart !== -1;

        var token = {
            type: type,
            contents: hasSection ? this._buffer.substring(this._sectionStart, endIndex) : "",
            start: this._sectionStart,
            end: endIndex,
            startPos: _clonePos(this._sectionStartPos),
            endPos: _clonePos(endPos)
        };

        if (!this._token) {
            this._token = token;
            return;
        }

        // A token is already pending, so queue this one behind it. A single slot is enough
        // because one pass through the tokenization loop emits at most two tokens; a general
        // queue would only matter if that ever changed.
        if (this._nextToken) {
            console.error("HTMLTokenizer: Tried to emit more than two tokens in a single call");
        }
        this._nextToken = token;
    };

    /**
     * @private
     * Hands a token to `_setToken()` and then invalidates the current section, so a new section
     * has to be started before the next call to one of the `_emit` methods.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     * @param {number=} index If specified, the index to use as the end of the token; uses this._index if not specified
     * @param {{row: number, column: number}=} indexPos If specified, the row/column position to use as the
     *      end of the token; uses this._indexPos if not specified
     */
    Tokenizer.prototype._emitToken = function (type, index, indexPos) {
        // The token must be built while the section markers are still valid.
        this._setToken(type, index, indexPos);

        this._sectionStartPos = null;
        this._sectionStart = -1;
    };

    /**
     * @private
     * Variant of `_emitToken()` for tokens that carry no real content and only mark a boundary
     * (like opentagend and selfclosingtag).
     * @param {string} type The token's type (see documentation for `nextToken()`)
     * @param {number=} index If specified, the index to use as the end of the token; uses this._index if not specified
     * @param {{row: number, column: number}=} indexPos If specified, the row/column position to use as the
     *      end of the token; uses this._indexPos if not specified
     */
    Tokenizer.prototype._emitSpecialToken = function (type, index, indexPos) {
        // Invalidate the section before emitting, so the token is built with a start of -1 and
        // no contents: it marks a position (end of an open tag, a self-close) rather than a span.
        this._sectionStartPos = null;
        this._sectionStart = -1;

        this._emitToken(type, index, indexPos);
    };

    /**
     * @private
     * Like `_emitToken()`, but only emits a token if there is actually content in it, i.e. a
     * section has been started and the current index lies beyond its start. Note that this still
     * resets this._sectionStart to an invalid value even if there is no content, so a new section
     * must be started before the next `_emit`.
     * @param {string} type The token's type (see documentation for `nextToken()`)
     */
    Tokenizer.prototype._emitTokenIfNonempty = function (type) {
        // -1 is the "no section" sentinel; compare strictly, as everywhere else in this file.
        var hasSection = this._sectionStart !== -1;
        if (hasSection && this._index > this._sectionStart) {
            this._setToken(type);
        }
        this._sectionStart = -1;
        this._sectionStartPos = null;
    };

    exports.Tokenizer = Tokenizer;
});