//
// Copyright (c) 2008, Brian Frank and Andy Frank
// Licensed under the Academic Free License version 3.0
//
// History:
// 10 Aug 08 Brian Frank Creation
// 30 Aug 11 Brian Frank Refactor out of fluxText
//
**
** SyntaxParser parses text into a SyntaxDoc
**
internal class SyntaxParser
{

  **
  ** Create a parser which tokenizes each line with the given rules.
  **
  new make(SyntaxRules rules)
  {
    this.rules     = rules
    this.tokenizer = LineTokenizer(rules)
  }

  **
  ** Read the stream one line at a time until 'readLine' returns
  ** null and return a SyntaxDoc whose lines are chained together
  ** through 'SyntaxLine.next'.  Line numbers start at one.  The
  ** stream is not closed by this method.
  **
  SyntaxDoc parse(InStream in)
  {
    result := SyntaxDoc(rules)
    SyntaxLine? last := null
    lineNum := 0
    Str? text
    while ((text = in.readLine) != null)
    {
      // tokenize the raw text into a SyntaxLine
      parsed := parseLine(++lineNum, text)

      // first line becomes the head of the list,
      // every later line hangs off the previous one
      if (last != null) last.next = parsed
      else result.lines = parsed
      last = parsed
    }
    return result
  }

  **
  ** Tokenize one line of text into a SyntaxLine whose segments
  ** list alternates type, text, type, text...  If anything goes
  ** wrong the error is traced and the whole line is stored as a
  ** single plain text segment.
  **
  private SyntaxLine parseLine(Int num, Str lineText)
  {
    result := SyntaxLine(num)
    try
    {
      // expand tabs first unless conversion is disabled
      if (tabsToSpaces != 0) lineText = convertTabsToSpaces(lineText, tabsToSpaces)

      // collect each token as a (type, text) pair
      tokenizer.tokenizeLine(lineText) |SyntaxType type, Str text|
      {
        result.segments.add(type).add(text)
      }
    }
    catch (Err err)
    {
      err.trace
      result.segments = [SyntaxType.text, lineText]
    }
    return result
  }

  **
  ** Replace every tab with enough spaces to reach the next
  ** multiple of 'ts' columns.  Text without tabs is returned
  ** unchanged.
  **
  private static Str convertTabsToSpaces(Str text, Int ts)
  {
    // nothing to expand
    if (!text.contains("\t")) return text

    buf := StrBuf()
    for (i := 0; i < text.size; ++i)
    {
      ch := text[i]
      if (ch != '\t') { buf.addChar(ch); continue }

      // pad from the current output column up to the next tab stop
      buf.add(Str.spaces(ts - (buf.size % ts)))
    }
    return buf.toStr
  }

  ** Number of spaces to convert a tab character to or zero
  ** to disable tab to space conversion
  Int tabsToSpaces := 2

  private SyntaxRules rules        // syntax rules for current document
  private LineTokenizer tokenizer  // line tokenizer for rules
}
**************************************************************************
** LineTokenizer
**************************************************************************
internal class LineTokenizer
{

//////////////////////////////////////////////////////////////////////////
// Construction
//////////////////////////////////////////////////////////////////////////

  **
  ** Build the keyword tables and the comment/string matchers
  ** from the given rules.
  **
  new make(SyntaxRules rules)
  {
    this.rules    = rules
    this.brackets = rules.brackets

    // build keyword map, and also a prefix map of the first
    // two characters for fast yes/no; the 'k[1]' access means
    // every keyword must be at least two chars long
    keywords        = Str:Bool[:] { def=false }
    keywordPrefixes = Int:Bool[:] { def=false }
    if (rules.keywords != null)
    {
      rules.keywords.each |Str k|
      {
        keywords[k] = true
        keywordPrefixes[k[0].shiftl(16).or(k[1])] = true
      }
    }

    // single line comments
    comments = Matcher[,]
    rules.comments?.each |Str s| { comments.add(toMatcher(s)) }

    // block comments
    commentStart = toMatcher(rules.blockCommentStart)
    commentEnd   = toMatcher(rules.blockCommentEnd)

    // str literals
    strs = StrMatch[,]
    if (rules.strs != null)
      rules.strs.each |SyntaxStr s| { strs.add(toStrMatch(s)) }
  }

//////////////////////////////////////////////////////////////////////////
// Tokenizing
//////////////////////////////////////////////////////////////////////////

  **
  ** Tokenize one line and pass each segment to the callback as
  ** a (type, text) pair in left to right order.  Consecutive
  ** plain text tokens are merged into a single text segment.
  ** The block comment level and open multi-line string are kept
  ** between calls so that they can span lines.
  **
  Void tokenizeLine(Str line, |SyntaxType, Str| f)
  {
    // reset line; cur and peek are always cleared so that nothing
    // left over from the previous line (an end-of-line comment
    // leaves peek untouched, an error can leave both) is visible
    // when this line is empty or only one char long
    this.line     = line
    this.lineSize = line.size
    this.pos      = 0
    this.cur      = lineSize > 0 ? line[0] : 0
    this.peek     = lineSize > 1 ? line[1] : 0

    // iterate until we hit end of line
    textStart := 0
    while (cur != 0)
    {
      // parse next token
      thisStart := pos
      type := next

      // if this is text, keep chugging until we
      // get to next special token or end of line
      if (type == SyntaxType.text) continue

      // iterate last text chunk
      if (textStart < thisStart)
        f(SyntaxType.text, line[textStart..<thisStart])

      // iterate this token
      f(type, line[thisStart..<pos])

      // reset textStart
      textStart = pos
    }

    // iterate last text chunk
    if (textStart < line.size)
      f(SyntaxType.text, line[textStart..<line.size])
  }

  **
  ** Consume the next token starting at the current position
  ** and return its type.
  **
  private SyntaxType next()
  {
    // if inside multi-line string literal
    if (inStr != null) return strLiteral(inStr)

    // if inside block comment or comment open
    if (inComment > 0 || commentStart.isMatch) return blockComment

    // check for end-of-line comments
    for (i:=0; i<comments.size; ++i)
    {
      if (comments[i].isMatch)
      {
        // force end of line
        cur  = 0
        peek = 0
        pos  = line.size
        return SyntaxType.comment
      }
    }

    // identifier which might be keyword; only considered when
    // the previous char is not alphanumeric
    if (keywordPrefixes[cur.shiftl(16).or(peek)] &&
        (pos==0 || !line[pos-1].isAlphaNum))
    {
      start := pos
      consume
      consume
      while (cur.isAlphaNum || cur == '_') consume
      word := line[start..<pos]
      if (keywords[word]) return SyntaxType.keyword
      return SyntaxType.text
    }

    // check for str literals
    for (i:=0; i<strs.size; ++i)
      if (strs[i].start.isMatch) return strLiteral(strs[i])

    // brackets
    if (brackets.containsChar(cur))
    {
      consume
      return SyntaxType.bracket
    }

    // other chars
    consume
    return SyntaxType.text
  }

  **
  ** Consume block comment text until the comment level drops
  ** to zero or the line ends.  The level is kept in 'inComment'
  ** so an unterminated comment continues on the next line.
  **
  private SyntaxType blockComment()
  {
    while (cur != 0)
    {
      if (commentStart.isMatch)
      {
        commentStart.consume
        ++inComment
        // without nesting a second opener does not add a level
        if (!rules.blockCommentsNest) inComment = 1
      }
      if (commentEnd.isMatch)
      {
        commentEnd.consume
        --inComment
      }
      if (inComment <= 0) break
      consume
    }
    return SyntaxType.comment
  }

  **
  ** Consume a string literal up to its unescaped end delimiter
  ** or the end of the line.  If the line ends first and the
  ** literal is multi-line, it is remembered in 'inStr' and
  ** resumed on the next line.
  **
  private SyntaxType strLiteral(StrMatch s)
  {
    // only skip the opening delimiter when starting a new
    // literal, not when resuming one from a previous line
    if (inStr !== s) s.start.consume
    while (cur != 0)
    {
      if (s.end.isMatch && countEscapes(s.escape).isEven)
      {
        s.end.consume
        inStr = null
        return SyntaxType.literal
      }
      consume
    }
    if (s.multiLine) inStr = s
    return SyntaxType.literal
  }

//////////////////////////////////////////////////////////////////////////
// Matching Functions
//////////////////////////////////////////////////////////////////////////

  **
  ** Build the start/end matchers for a string literal rule.
  ** The end delimiter defaults to the start delimiter.
  **
  StrMatch toStrMatch(SyntaxStr s)
  {
    return StrMatch
    {
      start     = toMatcher(s.delimiter, s.escape)
      end       = toMatcher(s.delimiterEnd ?: s.delimiter, s.escape)
      escape    = s.escape
      multiLine = s.multiLine
    }
  }

  **
  ** Build a matcher for the given token, picked by the size of
  ** the trimmed token.  A null or empty token never matches.
  ** The escape char is only honored for tokens of one or two
  ** chars.
  **
  Matcher toMatcher(Str? tok, Int esc := 0)
  {
    tok = tok?.trim ?: ""
    switch (tok.size)
    {
      case 0:
        return Matcher(0, |->Bool| { noMatch }, |->| {})
      case 1:
        if (esc > 0)
          return Matcher(1, |->Bool| { match1Esc(tok[0], esc) }, |->| { consume })
        else
          return Matcher(1, |->Bool| { match1(tok[0]) }, |->| { consume })
      case 2:
        if (esc > 0)
          return Matcher(2, |->Bool| { match2Esc(tok[0], tok[1], esc) }, |->| { consume; consume })
        else
          return Matcher(2, |->Bool| { match2(tok[0], tok[1]) }, |->| { consume; consume })
      default:
        return Matcher(tok.size, |->Bool| { matchN(tok) }, |->| { consumeN(tok.size) })
    }
  }

  Bool noMatch() { return false }

  Bool match1(Int ch1) { return cur == ch1 }

  Bool match2(Int ch1, Int ch2) { return cur == ch1 && peek == ch2 }

  Bool match1Esc(Int ch1, Int esc) { return cur == ch1 && countEscapes(esc).isEven }

  Bool match2Esc(Int ch1, Int ch2, Int esc) { return cur == ch1 && peek == ch2 && countEscapes(esc).isEven }

  **
  ** Return true if the line contains 'chars' starting at the
  ** current position.  Used by 'toMatcher' for tokens of three
  ** or more chars; escapes are not considered.
  **
  Bool matchN(Str chars)
  {
    // token must fit in what is left of the line
    if (pos + chars.size > lineSize) return false
    for (i:=0; i<chars.size; ++i)
      if (line[pos+i] != chars[i]) return false
    return true
  }

  **
  ** Count the number of escape chars immediately preceding
  ** the current char.
  **
  private Int countEscapes(Int esc)
  {
    n := 0
    // stop at the start of the line: a negative Str index would
    // wrap around and read chars from the end of the line
    while (pos-n-1 >= 0 && line[pos-n-1] == esc) n++
    return n
  }

//////////////////////////////////////////////////////////////////////////
// Consume
//////////////////////////////////////////////////////////////////////////

  **
  ** Advance one char.  At the end of the line 'pos' is pinned
  ** to 'lineSize' and both 'cur' and 'peek' become zero.
  **
  private Void consume()
  {
    cur = peek
    pos++
    if (pos >= lineSize) pos = lineSize
    peek = pos+1 < lineSize ? line[pos+1] : 0
  }

  ** Advance 'n' chars.
  Void consumeN(Int n)
  {
    for (; n > 0; --n) consume
  }

//////////////////////////////////////////////////////////////////////////
// Fields
//////////////////////////////////////////////////////////////////////////

  // rule lookups
  private SyntaxRules rules         // syntax rules for tokenizing
  private Str brackets              // str of bracket symbols
  private Int:Bool keywordPrefixes  // first two chars of keywords as (ch0 << 16) | ch1
  private Str:Bool keywords         // keywords
  private Matcher[] comments        // matchers for eol comments
  private Matcher commentStart      // matcher to check comment start
  private Matcher commentEnd        // matcher to check comment end
  private StrMatch[] strs           // matchers for str literals

  // multi-line blocks
  private Int inComment             // nested level of block comments
  private StrMatch? inStr           // in multi-line string literal

  // current line
  private Str? line                 // line being parsed
  private Int lineSize              // total size of line
  private Int pos                   // index into line for cur
  private Int cur                   // current char, zero at end of line
  private Int peek                  // next char, zero if none
}
**************************************************************************
** Matcher
**************************************************************************
** Matcher is used to match a specific token
** against the current character
internal class Matcher
{
  **
  ** Pair a match test with the action which consumes the
  ** matched token; 'sz' is stored as the matcher's size.
  **
  new make(Int sz, |->Bool| m, |->| c)
  {
    this.size        = sz
    this.matchFunc   = m
    this.consumeFunc = c
  }

  ** Invoke the match function and return its result.
  Bool isMatch()
  {
    return matchFunc.call
  }

  ** Invoke the consume function.
  Void consume()
  {
    consumeFunc.call
  }

  |->Bool| matchFunc   // test invoked by isMatch
  |->| consumeFunc     // action invoked by consume
  const Int size       // size passed to the constructor
}
** StrMatch handles matching the start and end
** delimiter and managing multi-line string blocks
internal class StrMatch
{
// matcher for the delimiter which opens the literal
Matcher? start
// matcher for the delimiter which closes the literal
Matcher? end
// escape char of the literal (copied from SyntaxStr.escape)
Int escape
// true if an unterminated literal continues onto the next line
Bool multiLine
}