/* * Copyright (c) 2013 Adobe Systems Incorporated. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * */ /*jslint vars: true, plusplus: true, devel: true, nomen: true, indent: 4, maxerr: 50 */ /*global define */ /*unittests: HTML Instrumentation*/ define(function (require, exports, module) { "use strict"; var extend = function(obj, mixin) { Object.keys(mixin).forEach(function(key) { obj[key] = mixin[key]; }); return obj; }; // var DocumentManager = require("document/DocumentManager"), var Tokenizer = require("./HTMLTokenizer").Tokenizer, MurmurHash3 = require("./murmurhash3_gc"), PerfUtils = { //require("utils/PerfUtils"); markStart: function() {}, finalizeMeasurement: function() {}, addMeasurement: function() {} }; var seed = Math.floor(Math.random() * 65535); /** * A list of tags whose start causes any of a given set of immediate parent * tags to close. This mostly comes from the HTML5 spec section on omitted close tags: * http://www.w3.org/html/wg/drafts/html/master/syntax.html#optional-tags * This doesn't handle general content model violations. */ var openImpliesClose = { li: { li: true }, dt: { dd: true, dt: true }, dd: { dd: true, dt: true }, address: { p: true }, article: { p: true }, aside: { p: true }, blockquote: { p: true }, dir: { p: true }, div: { p: true }, dl: { p: true }, fieldset: { p: true }, footer: { p: true }, form: { p: true }, h1: { p: true }, h2: { p: true }, h3: { p: true }, h4: { p: true }, h5: { p: true }, h6: { p: true }, header: { p: true }, hgroup: { p: true }, hr: { p: true }, main: { p: true }, menu: { p: true }, nav: { p: true }, ol: { p: true }, p: { p: true }, pre: { p: true }, section: { p: true }, table: { p: true }, ul: { p: true }, rt: { rp: true, rt: true }, rp: { rp: true, rt: true }, optgroup: { optgroup: true, option: true }, option: { option: true }, tbody: { thead: true, tbody: true, tfoot: true }, tfoot: { tbody: true }, tr: { tr: true, th: true, td: true }, th: { th: true, td: true }, td: { thead: true, th: true, td: true }, body: { head: true, link: true, script: true } }; /** * A list of tags that are self-closing (do not contain other elements). * Mostly taken from http://www.w3.org/html/wg/drafts/html/master/syntax.html#void-elements */ var voidElements = { area: true, base: true, basefont: true, br: true, col: true, command: true, embed: true, frame: true, hr: true, img: true, input: true, isindex: true, keygen: true, link: true, menuitem: true, meta: true, param: true, source: true, track: true, wbr: true }; /** * @constructor * * A SimpleNode represents one node in a SimpleDOM tree. Each node can have * any set of properties on it, though there are a couple of assumptions made. * Elements will have `children` and `attributes` properties. Text nodes will have a `content` * property. All Elements will have a `tagID` and text nodes *can* have one. * * @param {Object} properties the properties provided will be set on the new object. */ function SimpleNode(properties) { extend(this, properties); } SimpleNode.prototype = { /** * Updates signatures used to optimize the number of comparisons done during * diffing. This is important to call if you change: * * * children * * child node attributes * * text content of a text node * * child node text */ update: function () { if (this.isElement()) { var i, subtreeHashes = "", childHashes = "", child; for (i = 0; i < this.children.length; i++) { child = this.children[i]; if (child.isElement()) { childHashes += String(child.tagID); subtreeHashes += String(child.tagID) + child.attributeSignature + child.subtreeSignature; } else { childHashes += child.textSignature; subtreeHashes += child.textSignature; } } this.childSignature = MurmurHash3.hashString(childHashes, childHashes.length, seed); this.subtreeSignature = MurmurHash3.hashString(subtreeHashes, subtreeHashes.length, seed); } else { this.textSignature = MurmurHash3.hashString(this.content, this.content.length, seed); } }, /** * Updates the signature of this node's attributes. Call this after making attribute changes. */ updateAttributeSignature: function () { var attributeString = JSON.stringify(this.attributes); this.attributeSignature = MurmurHash3.hashString(attributeString, attributeString.length, seed); }, /** * Is this node an element node? * * @return {bool} true if it is an element */ isElement: function () { return !!this.children; }, /** * Is this node a text node? * * @return {bool} true if it is text */ isText: function () { return !this.children; } }; /** * @private * * Generates a synthetic ID for text nodes. These IDs are only used * for convenience when reading a SimpleDOM that is dumped to the console. * * @param {Object} textNode new node for which we are generating an ID * @return {string} ID for the node */ function getTextNodeID(textNode) { var childIndex = textNode.parent.children.indexOf(textNode); if (childIndex === 0) { return textNode.parent.tagID + ".0"; } return textNode.parent.children[childIndex - 1].tagID + "t"; } /** * @private * * Adds two {row, column}-style positions, returning a new pos. */ function _addPos(pos1, pos2) { return { row: pos1.row + pos2.row, column: (pos2.row === 0 ? pos1.column + pos2.column : pos2.column) }; } /** * @private * * Offsets the character offset of the given {row, column} pos by the given amount and returns a new * pos. Not for general purpose use as it does not account for line boundaries. */ function _offsetPos(pos, offset) { return { row: pos.row, column: pos.column + offset }; } /** * @constructor * * A Builder creates a SimpleDOM tree of SimpleNode objects representing the * "important" contents of an HTML document. It does not include things like comments. * The nodes include information about their position in the text provided. * * @param {string} text The text to parse * @param {?int} startOffset starting offset in the text * @param {?{row: int, column: int}} startOffsetPos row/column position in the text */ function Builder(text, startOffset, startOffsetPos) { this.stack = []; this.text = text; this.t = new Tokenizer(text); this.currentTag = null; this.startOffset = startOffset || 0; this.startOffsetPos = startOffsetPos || { row: 0, column: 0 }; } Builder.prototype._logError = function (token) { var error = { token: token }, startPos = token ? (token.startPos || token.endPos) : this.startOffsetPos, endPos = token ? token.endPos : this.startOffsetPos; error.startPos = _addPos(this.startOffsetPos, startPos); error.endPos = _addPos(this.startOffsetPos, endPos); if (!this.errors) { this.errors = []; } this.errors.push(error); }; /** * Builds the SimpleDOM. * * @param {?bool} strict if errors are detected, halt and return null * @param {?Object} markCache a cache that can be used in ID generation (is passed to `getID`) * @return {SimpleNode} root of tree or null if parsing failed */ Builder.prototype.build = function (strict, markCache) { var self = this; var token, lastClosedTag, lastTextNode, lastIndex = 0; var stack = this.stack; var attributeName = null; var nodeMap = {}; markCache = markCache || {}; // Start timers for building full and partial DOMs. // Appropriate timer is used, and the other is discarded. var timerBuildFull = "HTMLInstr. Build DOM Full"; var timerBuildPart = "HTMLInstr. Build DOM Partial"; PerfUtils.markStart([timerBuildFull, timerBuildPart]); function closeTag(endIndex, endPos) { lastClosedTag = stack[stack.length - 1]; stack.pop(); lastClosedTag.update(); lastClosedTag.end = self.startOffset + endIndex; lastClosedTag.endPos = _addPos(self.startOffsetPos, endPos); } while ((token = this.t.nextToken()) !== null) { // lastTextNode is used to glue text nodes together // If the last node we saw was text but this one is not, then we're done gluing. // If this node is a comment, we might still encounter more text. if (token.type !== "text" && token.type !== "comment" && lastTextNode) { lastTextNode = null; } if (token.type === "error") { PerfUtils.finalizeMeasurement(timerBuildFull); // discard PerfUtils.addMeasurement(timerBuildPart); // use this._logError(token); return null; } else if (token.type === "opentagname") { var newTagName = token.contents.toLowerCase(), newTag; if (openImpliesClose.hasOwnProperty(newTagName)) { var closable = openImpliesClose[newTagName]; while (stack.length > 0 && closable.hasOwnProperty(stack[stack.length - 1].tag)) { // Close the previous tag at the start of this tag. // Adjust backwards for the < before the tag name. closeTag(token.start - 1, _offsetPos(token.startPos, -1)); } } newTag = new SimpleNode({ tag: token.contents.toLowerCase(), children: [], attributes: {}, parent: (stack.length ? stack[stack.length - 1] : null), start: this.startOffset + token.start - 1, startPos: _addPos(this.startOffsetPos, _offsetPos(token.startPos, -1)) // ok because we know the previous char was a "<" }); newTag.tagID = this.getID(newTag, markCache); // During undo in particular, it's possible that tag IDs may be reused and // the marks in the document may be misleading. If a tag ID has been reused, // we apply a new tag ID to ensure that our edits come out correctly. if (nodeMap[newTag.tagID]) { newTag.tagID = this.getNewID(newTag); } nodeMap[newTag.tagID] = newTag; if (newTag.parent) { newTag.parent.children.push(newTag); } this.currentTag = newTag; if (voidElements.hasOwnProperty(newTag.tag)) { // This is a self-closing element. newTag.update(); } else { stack.push(newTag); } } else if (token.type === "opentagend" || token.type === "selfclosingtag") { // TODO: disallow

? if (this.currentTag) { if (token.type === "selfclosingtag" && stack.length && stack[stack.length - 1] === this.currentTag) { // This must have been a self-closing tag that we didn't identify as a void element // (e.g. an SVG tag). Pop it off the stack as if we had encountered its close tag. closeTag(token.end, token.endPos); } else { // We're ending an open tag. Record the end of the open tag as the end of the // range. (If we later find a close tag for this tag, the end will get overwritten // with the end of the close tag. In the case of a self-closing tag, we should never // encounter that.) // Note that we don't need to update the signature here because the signature only // relies on the tag name and ID, and isn't affected by the tag's attributes, so // the signature we calculated when creating the tag is still the same. If we later // find a close tag for this tag, we'll update the signature to account for its // children at that point (in the next "else" case). this.currentTag.end = this.startOffset + token.end; this.currentTag.endPos = _addPos(this.startOffsetPos, token.endPos); lastClosedTag = this.currentTag; this.currentTag.updateAttributeSignature(); this.currentTag = null; } } } else if (token.type === "closetag") { // If this is a self-closing element, ignore the close tag. var closeTagName = token.contents.toLowerCase(); if (!voidElements.hasOwnProperty(closeTagName)) { // Find the topmost item on the stack that matches. If we can't find one, assume // this is just a dangling closing tag and ignore it. var i; for (i = stack.length - 1; i >= 0; i--) { if (stack[i].tag === closeTagName) { break; } } if (i !== stack.length - 1) { this._logError(token); // If we're in strict mode, treat unbalanced tags as invalid. if (strict) return null; } if (i >= 0) { do { // For all tags we're implicitly closing (before we hit the matching tag), we want the // implied end to be the beginning of the close tag (which is two characters, "", after the end of // the tagname). if (stack.length === i + 1) { closeTag(token.end + 1, _offsetPos(token.endPos, 1)); } else { closeTag(token.start - 2, _offsetPos(token.startPos, -2)); } } while (stack.length > i); } else { // If we're in strict mode, treat unmatched close tags as invalid. Otherwise // we just silently ignore them. this._logError(token); if (strict) { PerfUtils.finalizeMeasurement(timerBuildFull); PerfUtils.addMeasurement(timerBuildPart); return null; } } } } else if (token.type === "attribname") { attributeName = token.contents.toLowerCase(); // Set the value to the empty string in case this is an empty attribute. If it's not, // it will get overwritten by the attribvalue later. this.currentTag.attributes[attributeName] = ""; } else if (token.type === "attribvalue" && attributeName !== null) { this.currentTag.attributes[attributeName] = token.contents; attributeName = null; } else if (token.type === "text") { if (stack.length) { var parent = stack[stack.length - 1]; var newNode; // Check to see if we're continuing a previous text. if (lastTextNode) { newNode = lastTextNode; newNode.content += token.contents; } else { newNode = new SimpleNode({ parent: stack[stack.length - 1], content: token.contents }); parent.children.push(newNode); newNode.tagID = getTextNodeID(newNode); nodeMap[newNode.tagID] = newNode; lastTextNode = newNode; } newNode.update(); } } lastIndex = token.end; } // If we have any tags hanging open (e.g. html or body), fail the parse if we're in strict mode, // otherwise close them at the end of the document. if (stack.length) { this._logError(token); if (strict) { PerfUtils.finalizeMeasurement(timerBuildFull); PerfUtils.addMeasurement(timerBuildPart); return null; } else { // Manually compute the position of the end of the text (we can't rely on the // tokenizer for this since it may not get to the very end) // TODO: should probably make the tokenizer get to the end... var lines = this.text.split("\n"), lastPos = { row: lines.length - 1, column: lines[lines.length - 1].length }; while (stack.length) { closeTag(this.text.length, lastPos); } } } var dom = lastClosedTag; if (!dom) { // This can happen if the document has no nontrivial content, or if the user tries to // have something at the root other than the HTML tag. In all such cases, we treat the // document as invalid. this._logError(token); return null; } dom.nodeMap = nodeMap; PerfUtils.addMeasurement(timerBuildFull); // use PerfUtils.finalizeMeasurement(timerBuildPart); // discard return dom; }; Builder.newIdGenerator = function () { var tagID = 5; // 1, 2, 3 reserved for html head body return function (newTag) { if (newTag) { if (newTag.tag == "html") return 1; if (newTag.tag == "body") return 3; if (newTag.tag == "head") return 2; } return tagID++; }; }; /** * Returns a new tag ID. * * @return {int} unique tag ID */ Builder.prototype.getNewID = Builder.newIdGenerator(); /** * Returns the best tag ID for the new tag object given. * The default implementation just calls `getNewID` * and returns a unique ID. * * @param {Object} newTag tag object to potentially inspect to choose an ID * @return {int} unique tag ID */ Builder.prototype.getID = Builder.prototype.getNewID; /** * Builds a SimpleDOM from the text provided. If `strict` mode is true, parsing * will halt as soon as any error is seen and null will be returned. * * @param {string} text Text of document to parse * @param {bool} strict True for strict parsing * @return {SimpleNode} root of tree or null if strict failed */ function build(text, strict) { var builder = new Builder(text); return builder.build(strict); } /** * @private * * Generates a string version of a SimpleDOM for debugging purposes. * * @param {SimpleNode} root root of the tree * @return {string} Text version of the tree. */ function _dumpDOM(root) { var result = "", indent = ""; function walk(node) { if (node.tag) { result += indent + "TAG " + node.tagID + " " + node.tag + " " + JSON.stringify(node.attributes) + "\n"; } else { result += indent + "TEXT " + (node.tagID || "- ") + node.content + "\n"; } if (node.isElement()) { indent += " "; node.children.forEach(walk); indent = indent.slice(2); } } walk(root); return result; } /** * Generate instrumented HTML for the specified session's document, and mark the associated tag * ranges in the session. Each tag has a "data-cloud9-id" attribute with a unique ID for its * value. For example, "

" becomes something like "
". The attribute * value is just a number that is guaranteed to be unique. * * Also stores marks in the given session that correspond to the tag ranges. These marks are used * to track the DOM structure for in-browser highlighting and live HTML updating. * * This only needs to be done once on load of a document. As the document is edited in memory, * the instrumentation is kept up to date via the diffs and edits that are generated on change * events. Call this again only if you want to do a full re-sync of the session's DOM state. * * @param {session} session The session whose document we're instrumenting, and which we should * mark ranges in. * @return {string} instrumented html content */ function generateInstrumentedHTML(text) { var builder = new Builder(text); builder.getID = builder.getNewID = Builder.newIdGenerator(); var dom = builder.build(); var orig = text; var gen = ""; var lastIndex = 0; if (!dom) { return text; } // Walk through the dom nodes and insert the 'data-cloud9-id' attribute at the // end of the open tag function walk(node) { if (node.tag) { var attrText = " data-cloud9-id='" + node.tagID + "'"; // Insert the attribute as the first attribute in the tag. var insertIndex = node.start + node.tag.length + 1; gen += orig.substr(lastIndex, insertIndex - lastIndex) + attrText; lastIndex = insertIndex; } if (node.isElement()) { node.children.forEach(walk); } } walk(dom); gen += orig.substr(lastIndex); return gen; } // Public API exports.build = build; exports.Builder = Builder; exports.SimpleNode = SimpleNode; exports.generateInstrumentedHTML = generateInstrumentedHTML; // Private API exports._dumpDOM = _dumpDOM; exports._offsetPos = _offsetPos; exports._getTextNodeID = getTextNodeID; exports._seed = seed; });