From ace57dd20528ab6b99f9979ea438b08561208b40 Mon Sep 17 00:00:00 2001 From: Jermolene Date: Thu, 17 Apr 2014 12:00:32 +0100 Subject: [PATCH] Refactor utilities out of HTML parser Some of the functions are useful general purpose parser helpers. --- core/modules/parsers/parseutils.js | 268 ++++++++++++++++++ core/modules/parsers/wikiparser/rules/html.js | 259 +---------------- 2 files changed, 279 insertions(+), 248 deletions(-) create mode 100644 core/modules/parsers/parseutils.js diff --git a/core/modules/parsers/parseutils.js b/core/modules/parsers/parseutils.js new file mode 100644 index 000000000..baf1f93bf --- /dev/null +++ b/core/modules/parsers/parseutils.js @@ -0,0 +1,268 @@ +/*\ +title: $:/core/modules/utils/parseutils.js +type: application/javascript +module-type: utils + +Utility functions concerned with parsing text into tokens. + +Most functions have the following pattern: + +* The parameters are: +** `source`: the source string being parsed +** `pos`: the current parse position within the string +** Any further parameters are used to identify the token that is being parsed +* The return value is: +** null if the token was not found at the specified position +** an object representing the token with the following standard fields: +*** `type`: string indicating the type of the token +*** `start`: start position of the token in the source string +*** `end`: end position of the token in the source string +*** Any further fields required to describe the token + +The exception is `skipWhiteSpace`, which just returns the position after the whitespace. + +\*/ +(function(){ + +/*jslint node: true, browser: true */ +/*global $tw: false */ +"use strict"; + +/* +Look for a whitespace token. 
Returns null if not found, otherwise returns {type: "whitespace", start:, end:,} +*/ +exports.parseWhiteSpace = function(source,pos) { + var node = { + type: "whitespace", + start: pos + }; + var re = /(\s)+/g; + re.lastIndex = pos; + var match = re.exec(source); + if(match && match.index === pos) { + node.end = pos + match[0].length; + return node; + } + return null; +}; + +/* +Convenience wrapper for parseWhiteSpace. Returns the position after the whitespace +*/ +exports.skipWhiteSpace = function(source,pos) { + var whitespace = $tw.utils.parseWhiteSpace(source,pos); + if(whitespace) { + return whitespace.end; + } + return pos; +}; + +/* +Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,} +*/ +exports.parseTokenString = function(source,pos,token) { + var match = source.indexOf(token,pos) === pos; + if(match) { + return { + type: "token", + value: token, + start: pos, + end: pos + token.length + }; + } + return null; +}; + +/* +Look for a token matching a regex. Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,} +*/ +exports.parseTokenRegExp = function(source,pos,reToken) { + var node = { + type: "regexp", + start: pos + }; + reToken.lastIndex = pos; + node.match = reToken.exec(source); + if(node.match && node.match.index === pos) { + node.end = pos + node.match[0].length; + return node; + } else { + return null; + } +}; + +/* +Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,} +*/ +exports.parseStringLiteral = function(source,pos) { + var node = { + type: "string", + start: pos + }; + var reString = /(?:"([^"]*)")|(?:'([^']*)')/g; + reString.lastIndex = pos; + var match = reString.exec(source); + if(match && match.index === pos) { + node.value = match[1] === undefined ? 
match[2] : match[1]; + node.end = pos + match[0].length; + return node; + } else { + return null; + } +}; + +/* +Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:} +*/ +exports.parseMacroParameter = function(source,pos) { + var node = { + type: "macro-parameter", + start: pos + }; + // Define our regexp + var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g; + // Skip whitespace + pos = $tw.utils.skipWhiteSpace(source,pos); + // Look for the parameter + var token = $tw.utils.parseTokenRegExp(source,pos,reMacroParameter); + if(!token) { + return null; + } + pos = token.end; + // Get the parameter details + node.value = token.match[2] !== undefined ? token.match[2] : ( + token.match[3] !== undefined ? token.match[3] : ( + token.match[4] !== undefined ? token.match[4] : ( + token.match[5] !== undefined ? token.match[5] : ( + "" + ) + ) + ) + ); + if(token.match[1]) { + node.name = token.match[1]; + } + // Update the end position + node.end = pos; + return node; +}; + +/* +Look for a macro invocation. 
Returns null if not found, or {type: "macrocall", name:, params:, start:, end:} +*/ +exports.parseMacroInvocation = function(source,pos) { + var node = { + type: "macrocall", + start: pos, + params: [] + }; + // Define our regexps + var reMacroName = /([^\s>"'=]+)/g; + // Skip whitespace + pos = $tw.utils.skipWhiteSpace(source,pos); + // Look for a double less than sign + var token = $tw.utils.parseTokenString(source,pos,"<<"); + if(!token) { + return null; + } + pos = token.end; + // Get the macro name + var name = $tw.utils.parseTokenRegExp(source,pos,reMacroName); + if(!name) { + return null; + } + node.name = name.match[1]; + pos = name.end; + // Process parameters + var parameter = $tw.utils.parseMacroParameter(source,pos); + while(parameter) { + node.params.push(parameter); + pos = parameter.end; + // Get the next parameter + parameter = $tw.utils.parseMacroParameter(source,pos); + } + // Skip whitespace + pos = $tw.utils.skipWhiteSpace(source,pos); + // Look for a double greater than sign + token = $tw.utils.parseTokenString(source,pos,">>"); + if(!token) { + return null; + } + pos = token.end; + // Update the end position + node.end = pos; + return node; +}; + +/* +Look for an HTML attribute definition. 
Returns null if not found, otherwise returns {type: "string|indirect|macro", name:, value:, start:, end:} where an "indirect" attribute carries textReference: in place of value: +*/ +exports.parseAttribute = function(source,pos) { + var node = { + start: pos + }; + // Define our regexps + var reAttributeName = /([^\/\s>"'=]+)/g, + reUnquotedAttribute = /([^\/\s<>"'=]+)/g, + reIndirectValue = /\{\{([^\}]+)\}\}/g; + // Skip whitespace + pos = $tw.utils.skipWhiteSpace(source,pos); + // Get the attribute name + var name = $tw.utils.parseTokenRegExp(source,pos,reAttributeName); + if(!name) { + return null; + } + node.name = name.match[1]; + pos = name.end; + // Skip whitespace + pos = $tw.utils.skipWhiteSpace(source,pos); + // Look for an equals sign + var token = $tw.utils.parseTokenString(source,pos,"="); + if(token) { + pos = token.end; + // Skip whitespace + pos = $tw.utils.skipWhiteSpace(source,pos); + // Look for a string literal + var stringLiteral = $tw.utils.parseStringLiteral(source,pos); + if(stringLiteral) { + pos = stringLiteral.end; + node.type = "string"; + node.value = stringLiteral.value; + } else { + // Look for an indirect value + var indirectValue = $tw.utils.parseTokenRegExp(source,pos,reIndirectValue); + if(indirectValue) { + pos = indirectValue.end; + node.type = "indirect"; + node.textReference = indirectValue.match[1]; + } else { + // Look for a unquoted value + var unquotedValue = $tw.utils.parseTokenRegExp(source,pos,reUnquotedAttribute); + if(unquotedValue) { + pos = unquotedValue.end; + node.type = "string"; + node.value = unquotedValue.match[1]; + } else { + // Look for a macro invocation value + var macroInvocation = $tw.utils.parseMacroInvocation(source,pos); + if(macroInvocation) { + pos = macroInvocation.end; + node.type = "macro"; + node.value = macroInvocation; + } else { + node.type = "string"; + node.value = "true"; + } + } + } + } + } else { + node.type = "string"; + node.value = "true"; + } + // Update the end position + node.end = pos; + return node; +}; + +})(); diff --git 
a/core/modules/parsers/wikiparser/rules/html.js b/core/modules/parsers/wikiparser/rules/html.js index 559dcfddf..9d35569ac 100644 --- a/core/modules/parsers/wikiparser/rules/html.js +++ b/core/modules/parsers/wikiparser/rules/html.js @@ -48,7 +48,7 @@ exports.parse = function() { // Advance the parser position to past the tag this.parser.pos = tag.end; // Check for an immediately following double linebreak - var hasLineBreak = !tag.isSelfClosing && !!this.parseTokenRegExp(this.parser.source,this.parser.pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g); + var hasLineBreak = !tag.isSelfClosing && !!$tw.utils.parseTokenRegExp(this.parser.source,this.parser.pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g); // Set whether we're in block mode tag.isBlock = this.is.block || hasLineBreak; // Parse the body if we need to @@ -71,244 +71,7 @@ exports.parse = function() { }; /* -Look for a whitespace token. Returns null if not found, otherwise returns {type: "whitespace", start:, end:,} -*/ -exports.parseWhiteSpace = function(source,pos) { - var node = { - type: "whitespace", - start: pos - }; - var re = /(\s)+/g; - re.lastIndex = pos; - var match = re.exec(source); - if(match && match.index === pos) { - node.end = pos + match[0].length; - return node; - } - return null; -}; - -/* -Convenience wrapper for parseWhiteSpace -*/ -exports.skipWhiteSpace = function(source,pos) { - var whitespace = this.parseWhiteSpace(source,pos); - if(whitespace) { - return whitespace.end; - } - return pos; -}; - -/* -Look for a given string token. Returns null if not found, otherwise returns {type: "token", value:, start:, end:,} -*/ -exports.parseTokenString = function(source,pos,token) { - var match = source.indexOf(token,pos) === pos; - if(match) { - return { - type: "token", - value: token, - start: pos, - end: pos + token.length - }; - } - return null; -}; - -/* -Look for a token matching a regex. 
Returns null if not found, otherwise returns {type: "regexp", match:, start:, end:,} -*/ -exports.parseTokenRegExp = function(source,pos,reToken) { - var node = { - type: "regexp", - start: pos - }; - reToken.lastIndex = pos; - node.match = reToken.exec(source); - if(node.match && node.match.index === pos) { - node.end = pos + node.match[0].length; - return node; - } else { - return null; - } -}; - -/* -Look for a string literal. Returns null if not found, otherwise returns {type: "string", value:, start:, end:,} -*/ -exports.parseStringLiteral = function(source,pos) { - var node = { - type: "string", - start: pos - }; - var reString = /(?:"([^"]*)")|(?:'([^']*)')/g; - reString.lastIndex = pos; - var match = reString.exec(source); - if(match && match.index === pos) { - node.value = match[1] === undefined ? match[2] : match[1]; - node.end = pos + match[0].length; - return node; - } else { - return null; - } -}; - -/* -Look for a macro invocation parameter. Returns null if not found, or {type: "macro-parameter", name:, value:, start:, end:} -*/ -exports.parseMacroParameter = function(source,pos) { - var node = { - type: "macro-parameter", - start: pos - }; - // Define our regexp - var reMacroParameter = /(?:([A-Za-z0-9\-_]+)\s*:)?(?:\s*(?:"([^"]*)"|'([^']*)'|\[\[([^\]]*)\]\]|([^\s>"'=]+)))/g; - // Skip whitespace - pos = this.skipWhiteSpace(source,pos); - // Look for the parameter - var token = this.parseTokenRegExp(source,pos,reMacroParameter); - if(!token) { - return null; - } - pos = token.end; - // Get the parameter details - node.value = token.match[2] !== undefined ? token.match[2] : ( - token.match[3] !== undefined ? token.match[3] : ( - token.match[4] !== undefined ? token.match[4] : ( - token.match[5] !== undefined ? token.match[5] : ( - "" - ) - ) - ) - ); - if(token.match[1]) { - node.name = token.match[1]; - } - // Update the end position - node.end = pos; - return node; -}; - -/* -Look for a macro invocation. 
Returns null if not found, or {type: "macrocall", name:, parameters:, start:, end:} -*/ -exports.parseMacroInvocation = function(source,pos) { - var node = { - type: "macrocall", - start: pos, - params: [] - }; - // Define our regexps - var reMacroName = /([^\s>"'=]+)/g; - // Skip whitespace - pos = this.skipWhiteSpace(source,pos); - // Look for a double less than sign - var token = this.parseTokenString(source,pos,"<<"); - if(!token) { - return null; - } - pos = token.end; - // Get the macro name - var name = this.parseTokenRegExp(source,pos,reMacroName); - if(!name) { - return null; - } - node.name = name.match[1]; - pos = name.end; - // Process parameters - var parameter = this.parseMacroParameter(source,pos); - while(parameter) { - node.params.push(parameter); - pos = parameter.end; - // Get the next parameter - parameter = this.parseMacroParameter(source,pos); - } - // Skip whitespace - pos = this.skipWhiteSpace(source,pos); - // Look for a double greater than sign - token = this.parseTokenString(source,pos,">>"); - if(!token) { - return null; - } - pos = token.end; - // Update the end position - node.end = pos; - return node; -}; - -/* -Look for an HTML attribute definition. 
Returns null if not found, otherwise returns {type: "attribute", name:, valueType: "string|indirect|macro", value:, start:, end:,} -*/ -exports.parseAttribute = function(source,pos) { - var node = { - start: pos - }; - // Define our regexps - var reAttributeName = /([^\/\s>"'=]+)/g, - reUnquotedAttribute = /([^\/\s<>"'=]+)/g, - reIndirectValue = /\{\{([^\}]+)\}\}/g; - // Skip whitespace - pos = this.skipWhiteSpace(source,pos); - // Get the attribute name - var name = this.parseTokenRegExp(source,pos,reAttributeName); - if(!name) { - return null; - } - node.name = name.match[1]; - pos = name.end; - // Skip whitespace - pos = this.skipWhiteSpace(source,pos); - // Look for an equals sign - var token = this.parseTokenString(source,pos,"="); - if(token) { - pos = token.end; - // Skip whitespace - pos = this.skipWhiteSpace(source,pos); - // Look for a string literal - var stringLiteral = this.parseStringLiteral(source,pos); - if(stringLiteral) { - pos = stringLiteral.end; - node.type = "string"; - node.value = stringLiteral.value; - } else { - // Look for an indirect value - var indirectValue = this.parseTokenRegExp(source,pos,reIndirectValue); - if(indirectValue) { - pos = indirectValue.end; - node.type = "indirect"; - node.textReference = indirectValue.match[1]; - } else { - // Look for a unquoted value - var unquotedValue = this.parseTokenRegExp(source,pos,reUnquotedAttribute); - if(unquotedValue) { - pos = unquotedValue.end; - node.type = "string"; - node.value = unquotedValue.match[1]; - } else { - // Look for a macro invocation value - var macroInvocation = this.parseMacroInvocation(source,pos); - if(macroInvocation) { - pos = macroInvocation.end; - node.type = "macro"; - node.value = macroInvocation; - } else { - node.type = "string"; - node.value = "true"; - } - } - } - } - } else { - node.type = "string"; - node.value = "true"; - } - // Update the end position - node.end = pos; - return node; -}; - -/* -Look for an HTML tag. 
Returns null if not found, otherwise returns {type: "tag", name:, attributes: [], isSelfClosing:, start:, end:,} +Look for an HTML tag. Returns null if not found, otherwise returns {type: "element", name:, attributes: [], isSelfClosing:, start:, end:,} */ exports.parseTag = function(source,pos,options) { options = options || {}; @@ -321,45 +84,45 @@ exports.parseTag = function(source,pos,options) { // Define our regexps var reTagName = /([a-zA-Z0-9\-\$]+)/g; // Skip whitespace - pos = this.skipWhiteSpace(source,pos); + pos = $tw.utils.skipWhiteSpace(source,pos); // Look for a less than sign - token = this.parseTokenString(source,pos,"<"); + token = $tw.utils.parseTokenString(source,pos,"<"); if(!token) { return null; } pos = token.end; // Get the tag name - token = this.parseTokenRegExp(source,pos,reTagName); + token = $tw.utils.parseTokenRegExp(source,pos,reTagName); if(!token) { return null; } node.tag = token.match[1]; pos = token.end; // Process attributes - var attribute = this.parseAttribute(source,pos); + var attribute = $tw.utils.parseAttribute(source,pos); while(attribute) { node.attributes[attribute.name] = attribute; pos = attribute.end; // Get the next attribute - attribute = this.parseAttribute(source,pos); + attribute = $tw.utils.parseAttribute(source,pos); } // Skip whitespace - pos = this.skipWhiteSpace(source,pos); + pos = $tw.utils.skipWhiteSpace(source,pos); // Look for a closing slash - token = this.parseTokenString(source,pos,"/"); + token = $tw.utils.parseTokenString(source,pos,"/"); if(token) { pos = token.end; node.isSelfClosing = true; } // Look for a greater than sign - token = this.parseTokenString(source,pos,">"); + token = $tw.utils.parseTokenString(source,pos,">"); if(!token) { return null; } pos = token.end; // Check for a required line break if(options.requireLineBreak) { - token = this.parseTokenRegExp(source,pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g); + token = 
$tw.utils.parseTokenRegExp(source,pos,/([^\S\n]*\r?\n(?:[^\S\n]*\r?\n|$))/g); if(!token) { return null; }