// Listing metadata: 601 lines, 17 KiB, JavaScript
(function(root, factory) {
|
||
|
if (typeof define === 'function' && define.amd) {
|
||
|
define([], factory) /* global define */
|
||
|
} else if (typeof module === 'object' && module.exports) {
|
||
|
module.exports = factory()
|
||
|
} else {
|
||
|
root.moo = factory()
|
||
|
}
|
||
|
}(this, function() {
|
||
|
'use strict';
|
||
|
|
||
|
// Cached built-ins. They are invoked through `.call(...)`, so the checks do
// not rely on methods of the inspected object itself.
var hasOwnProperty = Object.prototype.hasOwnProperty
var toString = Object.prototype.toString
// Feature test: true when a RegExp instance exposes a boolean `sticky`
// property, i.e. the engine understands the /y flag.
var hasSticky = typeof new RegExp().sticky === 'boolean'

/***************************************************************************/

/**
 * Tests whether `o` is a RegExp, judged by its `Object.prototype.toString`
 * tag. Falsy inputs are returned unchanged, so the result is falsy but not
 * necessarily `false`.
 */
function isRegExp(o) {
  return o && toString.call(o) === '[object RegExp]'
}
/**
 * Tests whether `o` is a non-null object that is neither a RegExp nor an
 * array. Falsy inputs are returned unchanged (falsy, not necessarily `false`).
 */
function isObject(o) {
  return o && typeof o === 'object' && !isRegExp(o) && !Array.isArray(o)
}

/**
 * Backslash-escapes every RegExp metacharacter in `s`, so the result can be
 * embedded in a pattern and match `s` literally.
 */
function reEscape(s) {
  return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&')
}
/**
 * Counts the capture groups in the pattern source `s`.
 *
 * Prefixing an empty alternative makes the pattern match the empty string;
 * the match array then holds one slot per capture group after slot 0.
 */
function reGroups(s) {
  var re = new RegExp('|' + s)
  return re.exec('').length - 1
}
/** Wraps the pattern source `s` in a capturing group. */
function reCapture(s) {
  return '(' + s + ')'
}
/**
 * Joins pattern sources into one alternation. Each alternative and the whole
 * union are wrapped in non-capturing groups.
 *
 * @param {string[]} regexps pattern sources
 * @returns {string} the union source; '(?!)' (never matches) for an empty list
 */
function reUnion(regexps) {
  if (!regexps.length) return '(?!)'
  var source = regexps.map(function(s) {
    return "(?:" + s + ")"
  }).join('|')
  return "(?:" + source + ")"
}

/**
 * Converts one match pattern to RegExp source text.
 *
 * @param {string|RegExp} obj a literal string (escaped and wrapped in a
 *   non-capturing group) or a RegExp (its `source` is returned as-is)
 * @returns {string} pattern source
 * @throws {Error} if a RegExp carries the i, g, y or m flag, or if `obj` is
 *   neither a string nor a RegExp
 */
function regexpOrLiteral(obj) {
  if (typeof obj === 'string') {
    return '(?:' + reEscape(obj) + ')'
  }

  if (isRegExp(obj)) {
    // TODO: consider /u support
    if (obj.ignoreCase) throw new Error('RegExp /i flag not allowed')
    if (obj.global) throw new Error('RegExp /g flag is implied')
    if (obj.sticky) throw new Error('RegExp /y flag is implied')
    if (obj.multiline) throw new Error('RegExp /m flag is implied')
    return obj.source
  }

  throw new Error('Not a pattern: ' + obj)
}

/**
 * Converts an object-style spec ({tokenName: pattern(s) or options}) into a
 * flat list of rules.
 *
 * - The key `include` yields one `{include: name}` entry per listed value.
 * - For any other key, consecutive non-object values (strings / RegExps) are
 *   batched into one rule; each options object becomes its own rule and
 *   flushes the batch collected before it, so declaration order is kept.
 *
 * @param {Object} object the spec; own property names are walked in order
 * @returns {Array} rule list
 */
function objectToRules(object) {
  var keys = Object.getOwnPropertyNames(object)
  var result = []
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    var thing = object[key]
    // Normalize a single value to a one-element list.
    var rules = [].concat(thing)
    if (key === 'include') {
      for (var j = 0; j < rules.length; j++) {
        result.push({include: rules[j]})
      }
      continue
    }
    var match = []
    rules.forEach(function(rule) {
      if (isObject(rule)) {
        if (match.length) result.push(ruleOptions(key, match))
        result.push(ruleOptions(key, rule))
        match = []
      } else {
        match.push(rule)
      }
    })
    if (match.length) result.push(ruleOptions(key, match))
  }
  return result
}

/**
 * Converts an array-style spec (a list of rule objects) into a rule list.
 *
 * An entry with an `include` property yields one `{include: name}` entry per
 * listed value; every other entry must have a `type` and is normalized with
 * ruleOptions().
 *
 * @param {Array<Object>} array the spec
 * @returns {Array} rule list
 * @throws {Error} if a non-include entry has no `type`
 */
function arrayToRules(array) {
  var result = []
  for (var i = 0; i < array.length; i++) {
    var obj = array[i]
    if (obj.include) {
      var include = [].concat(obj.include)
      for (var j = 0; j < include.length; j++) {
        result.push({include: include[j]})
      }
      continue
    }
    if (!obj.type) {
      throw new Error('Rule has no type: ' + JSON.stringify(obj))
    }
    result.push(ruleOptions(obj.type, obj))
  }
  return result
}

/**
 * Normalizes one rule into a full options record.
 *
 * @param {string} type the token name; stored as `defaultType`
 * @param {*} obj an options object, or a bare pattern / array of patterns
 *   (wrapped as `{match: obj}`)
 * @returns {Object} options with every known field filled with a default,
 *   own properties of `obj` copied over, and `match` normalized to an array:
 *   literal strings first (longest first), RegExps after
 * @throws {Error} if the rule has `include`, or if `type` is a string that
 *   differs from the token name
 */
function ruleOptions(type, obj) {
  if (!isObject(obj)) {
    obj = { match: obj }
  }
  if (obj.include) {
    throw new Error('Matching rules cannot also include states')
  }

  // nb. error and fallback imply lineBreaks
  var options = {
    defaultType: type,
    lineBreaks: !!obj.error || !!obj.fallback,
    pop: false,
    next: null,
    push: null,
    error: false,
    fallback: false,
    value: null,
    type: null,
    shouldThrow: false,
  }

  // Avoid Object.assign(), so we support IE9+
  for (var key in obj) {
    if (hasOwnProperty.call(obj, key)) {
      options[key] = obj[key]
    }
  }

  // type transform cannot be a string
  if (typeof options.type === 'string' && type !== options.type) {
    throw new Error("Type transform cannot be a string (type '" + options.type + "' for token '" + type + "')")
  }

  // convert to array; copy a caller-supplied array first, because sort()
  // works in place and must not reorder the caller's spec
  var match = options.match
  options.match = Array.isArray(match) ? match.slice() : match ? [match] : []
  options.match.sort(function(a, b) {
    return isRegExp(a) && isRegExp(b) ? 0
         : isRegExp(b) ? -1 : isRegExp(a) ? +1 : b.length - a.length
  })
  return options
}

/** Converts a spec to a rule list, dispatching on array vs. object form. */
function toRules(spec) {
  return Array.isArray(spec) ? arrayToRules(spec) : objectToRules(spec)
}

// Error rule used by compileRules() when a rule set declares no error or
// fallback rule of its own; `shouldThrow` makes the lexer throw on it.
var defaultErrorRule = ruleOptions('error', {lineBreaks: true, shouldThrow: true})
/**
 * Compiles a normalized rule list into the data one lexer state needs.
 *
 * @param {Array<Object>} rules rule options (see ruleOptions)
 * @param {boolean} [hasStates] true when compiling for a stateful lexer;
 *   otherwise pop/push/next options are rejected
 * @returns {{regexp: RegExp, groups: Array, fast: Object, error: Object}}
 *   `regexp` has one capture group per entry of `groups`; `fast` maps a char
 *   code to the rule for leading single-character literals; `error` is the
 *   declared error/fallback rule or `defaultErrorRule`
 * @throws {Error} on `include` entries, more than one error/fallback rule,
 *   misplaced state-switching options, mixed /u usage, patterns that match the
 *   empty string, contain capture groups, or match '\n' without `lineBreaks`
 */
function compileRules(rules, hasStates) {
  var errorRule = null
  var fast = Object.create(null)
  var fastAllowed = true
  var unicodeFlag = null
  var groups = []
  var parts = []

  // If there is a fallback rule, then disable fast matching
  for (var i = 0; i < rules.length; i++) {
    if (rules[i].fallback) {
      fastAllowed = false
    }
  }

  for (var i = 0; i < rules.length; i++) {
    var options = rules[i]

    if (options.include) {
      // all valid inclusions are removed by states() preprocessor
      throw new Error('Inheritance is not allowed in stateless lexers')
    }

    if (options.error || options.fallback) {
      // errorRule can only be set once
      if (errorRule) {
        if (!options.fallback === !errorRule.fallback) {
          throw new Error("Multiple " + (options.fallback ? "fallback" : "error") + " rules not allowed (for token '" + options.defaultType + "')")
        } else {
          throw new Error("fallback and error are mutually exclusive (for token '" + options.defaultType + "')")
        }
      }
      errorRule = options
    }

    // Work on a copy: leading single-character literals are peeled off into
    // the `fast` table until the first rule that needs the RegExp.
    var match = options.match.slice()
    if (fastAllowed) {
      while (match.length && typeof match[0] === 'string' && match[0].length === 1) {
        var word = match.shift()
        fast[word.charCodeAt(0)] = options
      }
    }

    // Warn about inappropriate state-switching options
    if (options.pop || options.push || options.next) {
      if (!hasStates) {
        throw new Error("State-switching options are not allowed in stateless lexers (for token '" + options.defaultType + "')")
      }
      if (options.fallback) {
        throw new Error("State-switching options are not allowed on fallback tokens (for token '" + options.defaultType + "')")
      }
    }

    // Only rules with a .match are included in the RegExp
    if (match.length === 0) {
      continue
    }
    fastAllowed = false

    groups.push(options)

    // Check unicode flag is used everywhere or nowhere
    for (var j = 0; j < match.length; j++) {
      var obj = match[j]
      if (!isRegExp(obj)) {
        continue
      }

      if (unicodeFlag === null) {
        unicodeFlag = obj.unicode
      } else if (unicodeFlag !== obj.unicode && options.fallback === false) {
        throw new Error('If one rule is /u then all must be')
      }
    }

    // convert to RegExp
    var pat = reUnion(match.map(regexpOrLiteral))

    // validate
    var regexp = new RegExp(pat)
    if (regexp.test("")) {
      throw new Error("RegExp matches empty string: " + regexp)
    }
    var groupCount = reGroups(pat)
    if (groupCount > 0) {
      throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead")
    }

    // try and detect rules matching newlines
    if (!options.lineBreaks && regexp.test('\n')) {
      throw new Error('Rule should declare lineBreaks: ' + regexp)
    }

    // store regex
    parts.push(reCapture(pat))
  }

  // If there's no fallback rule, use the sticky flag so we only look for
  // matches at the current index.
  //
  // If we don't support the sticky flag, then fake it using an irrefutable
  // match (i.e. an empty pattern).
  var fallbackRule = errorRule && errorRule.fallback
  var flags = hasSticky && !fallbackRule ? 'ym' : 'gm'
  var suffix = hasSticky || fallbackRule ? '' : '|'

  if (unicodeFlag === true) flags += "u"
  var combined = new RegExp(reUnion(parts) + suffix, flags)
  return {regexp: combined, groups: groups, fast: fast, error: errorRule || defaultErrorRule}
}

/**
 * Builds a single-state lexer: the spec is converted with toRules(), compiled
 * with compileRules() (without state support), and registered as the only
 * state, named 'start'.
 */
function compile(rules) {
  var result = compileRules(toRules(rules))
  return new Lexer({start: result}, 'start')
}

/**
 * Validates the state-switching options of one compiled rule.
 *
 * @param {Object} g the rule; a falsy value is accepted and ignored
 * @param {string} name the state the rule belongs to (used in messages)
 * @param {Object} map compiled states keyed by name
 * @throws {Error} if `push` / `next` names a state missing from `map`, or if
 *   `pop` is set to anything that is not numerically 1
 */
function checkStateGroup(g, name, map) {
  var state = g && (g.push || g.next)
  if (state && !map[state]) {
    throw new Error("Missing state '" + state + "' (in token '" + g.defaultType + "' of state '" + name + "')")
  }
  if (g && g.pop && +g.pop !== 1) {
    throw new Error("pop must be 1 (in token '" + g.defaultType + "' of state '" + name + "')")
  }
}
/**
 * Builds a stateful lexer from a map of state name -> rule spec.
 *
 * The optional `$all` entry holds rules appended to every state; it is not a
 * state itself. `{include: name}` entries are replaced in place by the rules
 * of the named state, skipping rules the including state already contains.
 *
 * @param {Object} states state specs keyed by name; left unmodified
 * @param {string} [start] initial state; defaults to the first state name
 * @returns {Lexer}
 * @throws {Error} if an include names a state that does not exist, or a rule
 *   fails checkStateGroup()
 */
function compileStates(states, start) {
  var all = states.$all ? toRules(states.$all) : []

  // Skip the `$all` pseudo-state rather than deleting it, so the caller's
  // spec object is left intact and can be compiled again.
  var keys = Object.getOwnPropertyNames(states).filter(function(name) {
    return name !== '$all'
  })
  if (!start) start = keys[0]

  var ruleMap = Object.create(null)
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    ruleMap[key] = toRules(states[key]).concat(all)
  }
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    var rules = ruleMap[key]
    var included = Object.create(null)
    for (var j = 0; j < rules.length; j++) {
      var rule = rules[j]
      if (!rule.include) continue
      // Arguments for splice(): remove the include entry at j, then insert
      // whatever gets appended below.
      var splice = [j, 1]
      if (rule.include !== key && !included[rule.include]) {
        included[rule.include] = true
        var newRules = ruleMap[rule.include]
        if (!newRules) {
          throw new Error("Cannot include nonexistent state '" + rule.include + "' (in state '" + key + "')")
        }
        for (var k = 0; k < newRules.length; k++) {
          var newRule = newRules[k]
          if (rules.indexOf(newRule) !== -1) continue
          splice.push(newRule)
        }
      }
      rules.splice.apply(rules, splice)
      // Re-examine index j: it now holds the first inserted rule, which may
      // itself be an include entry.
      j--
    }
  }

  var map = Object.create(null)
  for (var i = 0; i < keys.length; i++) {
    var key = keys[i]
    map[key] = compileRules(ruleMap[key], true)
  }

  for (var i = 0; i < keys.length; i++) {
    var name = keys[i]
    var state = map[name]
    var groups = state.groups
    for (var j = 0; j < groups.length; j++) {
      checkStateGroup(groups[j], name, map)
    }
    var fastKeys = Object.getOwnPropertyNames(state.fast)
    for (var j = 0; j < fastKeys.length; j++) {
      checkStateGroup(state.fast[fastKeys[j]], name, map)
    }
  }

  return new Lexer(map, start)
}

/**
 * Builds a type-transform function from a map of token type -> keyword(s).
 *
 * @param {Object} map own property names are token types; each value is one
 *   keyword string or an array of keyword strings
 * @returns {function(string): (string|undefined)} returns the token type a
 *   keyword belongs to, or undefined when the value is not a keyword. If a
 *   keyword is listed under several types, the last one wins.
 * @throws {Error} if any keyword is not a string
 */
function keywordTransform(map) {
  // Null-prototype dictionary, so keywords such as "constructor" or
  // "toString" cannot collide with inherited properties.
  var reverseMap = Object.create(null)
  var types = Object.getOwnPropertyNames(map)
  for (var i = 0; i < types.length; i++) {
    var tokenType = types[i]
    var item = map[tokenType]
    var keywordList = Array.isArray(item) ? item : [item]
    keywordList.forEach(function(keyword) {
      // Validate before any property access, so null/undefined keywords get
      // this message rather than a TypeError.
      if (typeof keyword !== 'string') {
        throw new Error("keyword must be string (in keyword '" + tokenType + "')")
      }
      reverseMap[keyword] = tokenType
    })
  }

  // A plain lookup instead of source generated through Function(): that also
  // works where a Content Security Policy forbids runtime code generation.
  return function(value) {
    return typeof value === 'string' ? reverseMap[value] : undefined
  }
}

/***************************************************************************/

/**
 * Lexer over a set of compiled states.
 *
 * @param {Object} states compiled states keyed by name
 * @param {string} state name of the initial state
 */
var Lexer = function(states, state) {
  this.startState = state
  this.states = states
  this.buffer = ''
  this.stack = []
  // reset() fills in the remaining fields and enters the start state.
  this.reset()
}

/**
 * Points the lexer at a new buffer and rewinds to its start.
 *
 * @param {string} [data] the new input; defaults to ''
 * @param {Object} [info] position/state to resume from, with the fields
 *   save() returns (line, col, state, stack, queuedToken, queuedThrow).
 *   Without it the lexer restarts at line 1, col 1 in the start state with an
 *   empty stack.
 * @returns {Lexer} this
 */
Lexer.prototype.reset = function(data, info) {
  this.buffer = data || ''
  this.index = 0
  this.line = info ? info.line : 1
  this.col = info ? info.col : 1
  this.queuedToken = info ? info.queuedToken : null
  this.queuedThrow = info ? info.queuedThrow : null
  // A match queued by next() behind a fallback token refers to text in the
  // previous buffer; drop it so it cannot be emitted against the new input.
  this.queuedGroup = null
  this.queuedText = ""
  this.setState(info ? info.state : this.startState)
  // Copy the stack so later push/pop does not alter the caller's array.
  this.stack = info && info.stack ? info.stack.slice() : []
  return this
}

/**
 * Captures the lexer's position and state as a plain object that can be
 * passed back as the `info` argument of reset().
 *
 * @returns {{line, col, state, stack, queuedToken, queuedThrow}} `stack` is a
 *   copy, so it does not change as the lexer continues
 */
Lexer.prototype.save = function() {
  return {
    line: this.line,
    col: this.col,
    state: this.state,
    stack: this.stack.slice(),
    queuedToken: this.queuedToken,
    queuedThrow: this.queuedThrow,
  }
}

/**
 * Switches to the named state and caches that state's compiled data
 * (groups, error rule, RegExp, fast table) on the lexer.
 * Does nothing when `state` is falsy or already the current state.
 *
 * @param {string} state name of a state in `this.states`
 */
Lexer.prototype.setState = function(state) {
  if (!state || this.state === state) return
  this.state = state
  var info = this.states[state]
  this.groups = info.groups
  this.error = info.error
  this.re = info.regexp
  this.fast = info.fast
}

/**
 * Returns to the state on top of the stack. With an empty stack, setState()
 * receives undefined and the current state is kept.
 */
Lexer.prototype.popState = function() {
  this.setState(this.stack.pop())
}

/**
 * Remembers the current state on the stack, then switches to `state`.
 *
 * @param {string} state name of the state to enter
 */
Lexer.prototype.pushState = function(state) {
  this.stack.push(this.state)
  this.setState(state)
}

/**
 * Runs `re` against `buffer` from `re.lastIndex` and returns the match, or
 * null when nothing matches there. The variant is chosen once, based on
 * whether the engine supports sticky RegExps.
 */
var eat = hasSticky ? function(re, buffer) { // assume re is /y
  return re.exec(buffer)
} : function(re, buffer) { // assume re is /g
  var match = re.exec(buffer)
  // will always match, since we used the |(?:) trick
  // -- so an empty match is what signals "no token here"
  if (match[0].length === 0) {
    return null
  }
  return match
}

/**
 * Finds which rule produced a match: capture group i + 1 of the combined
 * RegExp corresponds to `this.groups[i]`.
 *
 * @param {Array} match result of executing the state's RegExp
 * @returns {Object} the rule whose capture group participated in the match
 * @throws {Error} if no capture group is defined in `match`
 */
Lexer.prototype._getGroup = function(match) {
  var groupCount = this.groups.length
  for (var i = 0; i < groupCount; i++) {
    if (match[i + 1] !== undefined) {
      return this.groups[i]
    }
  }
  throw new Error('Cannot find token type for matched text')
}

/** Shared `toString` for tokens: a token stringifies to its `value`. */
function tokenToString() {
  return this.value
}

/**
 * Produces the next token from the buffer.
 *
 * Order of attempts: a match queued behind a fallback token; the
 * single-character `fast` table; the state's combined RegExp. When the RegExp
 * finds nothing, the rest of the buffer becomes one token of the error rule.
 * With a fallback rule, text skipped before a later match is emitted as a
 * fallback token and the match itself is queued for the following call.
 *
 * @returns {Object|undefined} the token, or undefined at end of input
 */
Lexer.prototype.next = function() {
  var index = this.index

  // If a fallback token matched, we don't need to re-run the RegExp
  if (this.queuedGroup) {
    var token = this._token(this.queuedGroup, this.queuedText, index)
    this.queuedGroup = null
    this.queuedText = ""
    return token
  }

  var buffer = this.buffer
  if (index === buffer.length) {
    return // EOF
  }

  // Fast matching for single characters
  var group = this.fast[buffer.charCodeAt(index)]
  if (group) {
    return this._token(group, buffer.charAt(index), index)
  }

  // Execute RegExp
  var re = this.re
  re.lastIndex = index
  var match = eat(re, buffer)

  // Error tokens match the remaining buffer
  var error = this.error
  if (match == null) {
    return this._token(error, buffer.slice(index, buffer.length), index)
  }

  var group = this._getGroup(match)
  var text = match[0]

  if (error.fallback && match.index !== index) {
    this.queuedGroup = group
    this.queuedText = text

    // Fallback tokens contain the unmatched portion of the buffer
    return this._token(error, buffer.slice(index, match.index), index)
  }

  return this._token(group, text, index)
}

/**
 * Builds the token for `text` matched by rule `group`, advances the lexer's
 * index/line/col past it, and applies the rule's state change.
 *
 * @param {Object} group the rule (see ruleOptions)
 * @param {string} text the matched text
 * @param {number} offset index of `text` in the buffer
 * @returns {Object} token with type, value, text, toString, offset,
 *   lineBreaks, line and col; line/col are the position where the token starts
 * @throws {Error} formatted "invalid syntax" error when `group.shouldThrow`
 */
Lexer.prototype._token = function(group, text, offset) {
  // count line breaks
  var lineBreaks = 0
  if (group.lineBreaks) {
    var matchNL = /\n/g
    // `nl` ends up as the index just past the last newline in `text`.
    var nl = 1
    if (text === '\n') {
      lineBreaks = 1
    } else {
      while (matchNL.exec(text)) { lineBreaks++; nl = matchNL.lastIndex }
    }
  }

  var token = {
    type: (typeof group.type === 'function' && group.type(text)) || group.defaultType,
    value: typeof group.value === 'function' ? group.value(text) : text,
    text: text,
    toString: tokenToString,
    offset: offset,
    lineBreaks: lineBreaks,
    line: this.line,
    col: this.col,
  }
  // nb. adding more props to token object will make V8 sad!

  var size = text.length
  this.index += size
  this.line += lineBreaks
  if (lineBreaks !== 0) {
    // Column restarts after the last newline inside the token.
    this.col = size - nl + 1
  } else {
    this.col += size
  }

  // throw, if no rule with {error: true}
  if (group.shouldThrow) {
    throw new Error(this.formatError(token, "invalid syntax"))
  }

  // Only one state change applies, in this order of precedence.
  if (group.pop) this.popState()
  else if (group.push) this.pushState(group.push)
  else if (group.next) this.setState(group.next)

  return token
}

// Iteration support, installed only where Symbol.iterator exists: a lexer can
// then be consumed with for...of, yielding tokens until next() returns a
// falsy value.
if (typeof Symbol !== 'undefined' && Symbol.iterator) {
  var LexerIterator = function(lexer) {
    this.lexer = lexer
  }

  // Iterator protocol: `done` is true once the lexer returns no token.
  LexerIterator.prototype.next = function() {
    var token = this.lexer.next()
    return {value: token, done: !token}
  }

  // The iterator is itself iterable.
  LexerIterator.prototype[Symbol.iterator] = function() {
    return this
  }

  Lexer.prototype[Symbol.iterator] = function() {
    return new LexerIterator(this)
  }
}

/**
 * Formats an error message that points at a token: the message, the token's
 * line and column, the text of that line up to the end of the token's first
 * line, and a caret under the token's column.
 *
 * @param {Object} [token] the offending token; null/undefined means end of
 *   input, in which case the lexer's current position is used
 * @param {string} message text to prefix
 * @returns {string} the formatted message
 */
Lexer.prototype.formatError = function(token, message) {
  if (token == null) {
    // An undefined token indicates EOF
    var text = this.buffer.slice(this.index)
    var token = {
      text: text,
      offset: this.index,
      lineBreaks: text.indexOf('\n') === -1 ? 0 : 1,
      line: this.line,
      col: this.col,
    }
  }
  // Start of the token's line: step back (col - 1) characters from its offset.
  var start = Math.max(0, token.offset - token.col + 1)
  // Only the first line of a multi-line token is shown.
  var eol = token.lineBreaks ? token.text.indexOf('\n') : token.text.length
  var firstLine = this.buffer.substring(start, token.offset + eol)
  message += " at line " + token.line + " col " + token.col + ":\n\n"
  message += " " + firstLine + "\n"
  message += " " + Array(token.col).join(" ") + "^"
  return message
}

/**
 * Creates a new Lexer that shares this lexer's compiled states and uses this
 * lexer's current state as its start state.
 */
Lexer.prototype.clone = function() {
  return new Lexer(this.states, this.state)
}

/**
 * Reports whether this lexer can emit `tokenType`. The argument is ignored:
 * this implementation always answers true.
 */
Lexer.prototype.has = function(tokenType) {
  return true
}


return {
|
||
|
compile: compile,
|
||
|
states: compileStates,
|
||
|
error: Object.freeze({error: true}),
|
||
|
fallback: Object.freeze({fallback: true}),
|
||
|
keywords: keywordTransform,
|
||
|
}
|
||
|
|
||
|
}));
// End of the moo UMD module.