Improve the lexer implementation

elk-language · Jul 25, 2023 · 657cea7 · 657cea7
1 parent ee5ed49
commit 657cea7
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 50 deletions.
diff --git a/_example/even-lexer/main.go b/_example/even-lexer/main.go
@@ -12,8 +12,8 @@ import (
 func main() {
 	p := prompt.New(
 		executor,
-		prompt.WithLexer(prompt.NewEagerLexer(wordLexer)),
 		prompt.WithLexer(prompt.NewEagerLexer(charLexer)), // the last one overrides the other
+		prompt.WithLexer(prompt.NewEagerLexer(wordLexer)),
 	)
 
 	p.Run()
@@ -32,7 +32,7 @@ func charLexer(line string) []prompt.Token {
 			color = prompt.White
 		}
 		lastByteIndex := strings.ByteNumber(i + utf8.RuneLen(value) - 1)
-		element := prompt.NewSimpleToken(color, lastByteIndex)
+		element := prompt.NewSimpleToken(color, strings.ByteNumber(i), lastByteIndex)
 
 		elements = append(elements, element)
 	}
@@ -48,36 +48,47 @@ func wordLexer(line string) []prompt.Token {
 
 	var elements []prompt.Token
 	var currentByte strings.ByteNumber
+	var firstByte strings.ByteNumber
+	var firstCharSeen bool
 	var wordIndex int
 	var lastChar rune
 
 	var color prompt.Color
 	for i, char := range line {
 		currentByte = strings.ByteNumber(i)
+		lastChar = char
 		if unicode.IsSpace(char) {
+			if !firstCharSeen {
+				continue
+			}
 			if wordIndex%2 == 0 {
 				color = prompt.Green
 			} else {
 				color = prompt.White
 			}
 
-			element := prompt.NewSimpleToken(color, currentByte)
+			element := prompt.NewSimpleToken(color, firstByte, currentByte-1)
 			elements = append(elements, element)
 			wordIndex++
+			firstCharSeen = false
 			continue
 		}
-		lastChar = char
+		if !firstCharSeen {
+			firstByte = strings.ByteNumber(i)
+			firstCharSeen = true
+		}
 	}
 	if !unicode.IsSpace(lastChar) {
 		if wordIndex%2 == 0 {
 			color = prompt.Green
 		} else {
 			color = prompt.White
 		}
-		element := prompt.NewSimpleToken(color, currentByte)
+		element := prompt.NewSimpleToken(color, firstByte, currentByte+strings.ByteNumber(utf8.RuneLen(lastChar))-1)
 		elements = append(elements, element)
 	}
 
+	prompt.Log("tokens: %#v", elements)
 	return elements
 }
 

diff --git a/lexer.go b/lexer.go
@@ -15,21 +15,24 @@ type Lexer interface {
 
 // Token is a single unit of text returned by a Lexer.
 type Token interface {
-	Color() Color                       // Color of the token
-	LastByteIndex() istrings.ByteNumber // Index of the last byte of this token
+	Color() Color                        // Color of the token
+	FirstByteIndex() istrings.ByteNumber // Index of the first byte of this token
+	LastByteIndex() istrings.ByteNumber  // Index of the last byte of this token
 }
 
 // SimpleToken as the default implementation of Token.
 type SimpleToken struct {
-	color         Color
-	lastByteIndex istrings.ByteNumber
+	color          Color
+	lastByteIndex  istrings.ByteNumber
+	firstByteIndex istrings.ByteNumber
 }
 
 // Create a new SimpleToken.
-func NewSimpleToken(color Color, index istrings.ByteNumber) *SimpleToken {
+func NewSimpleToken(color Color, firstIndex, lastIndex istrings.ByteNumber) *SimpleToken {
 	return &SimpleToken{
-		color:         color,
-		lastByteIndex: index,
+		color:          color,
+		firstByteIndex: firstIndex,
+		lastByteIndex:  lastIndex,
 	}
 }
 
@@ -38,11 +41,16 @@ func (t *SimpleToken) Color() Color {
 	return t.color
 }
 
-// Retrieve the text that this token represents.
+// The index of the last byte of the lexeme.
 func (t *SimpleToken) LastByteIndex() istrings.ByteNumber {
 	return t.lastByteIndex
 }
 
+// The index of the first byte of the lexeme.
+func (t *SimpleToken) FirstByteIndex() istrings.ByteNumber {
+	return t.firstByteIndex
+}
+
 // LexerFunc is a function implementing
 // a simple lexer that receives a string
 // and returns a complete slice of Tokens.

diff --git a/lexer_test.go b/lexer_test.go
@@ -70,7 +70,7 @@ func TestEagerLexerNext(t *testing.T) {
 func charLex(s string) []Token {
 	var result []Token
 	for i := range s {
-		result = append(result, NewSimpleToken(0, istrings.ByteNumber(i)))
+		result = append(result, NewSimpleToken(0, istrings.ByteNumber(i), istrings.ByteNumber(i)))
 	}
 
 	return result
@@ -86,18 +86,18 @@ func TestEagerLexerInit(t *testing.T) {
 			lexer: &EagerLexer{
 				lexFunc: charLex,
 				tokens: []Token{
-					&SimpleToken{lastByteIndex: 2},
-					&SimpleToken{lastByteIndex: 10},
+					&SimpleToken{firstByteIndex: 2, lastByteIndex: 2},
+					&SimpleToken{firstByteIndex: 10, lastByteIndex: 10},
 				},
 				currentIndex: 11,
 			},
 			input: "foo",
 			want: &EagerLexer{
 				lexFunc: charLex,
 				tokens: []Token{
-					&SimpleToken{lastByteIndex: 0},
-					&SimpleToken{lastByteIndex: 1},
-					&SimpleToken{lastByteIndex: 2},
+					&SimpleToken{firstByteIndex: 0, lastByteIndex: 0},
+					&SimpleToken{firstByteIndex: 1, lastByteIndex: 1},
+					&SimpleToken{firstByteIndex: 2, lastByteIndex: 2},
 				},
 				currentIndex: 0,
 			},

diff --git a/renderer.go b/renderer.go
@@ -363,49 +363,79 @@ func (r *Renderer) lex(lexer Lexer, input string, startLine int) {
 tokenLoop:
 	for {
 		token, ok := lexer.Next()
-		if !ok {
+		var currentFirstByteIndex istrings.ByteNumber
+		var currentLastByteIndex istrings.ByteNumber
+		var tokenColor Color
+		var noToken bool
+		if ok {
+			currentFirstByteIndex = token.FirstByteIndex()
+			currentLastByteIndex = token.LastByteIndex()
+			tokenColor = token.Color()
+		} else if previousByteIndex == istrings.Len(input)-1 {
 			break tokenLoop
+		} else {
+			currentFirstByteIndex = istrings.Len(input)
+			tokenColor = DefaultColor
+			noToken = true
 		}
 
-		currentByteIndex := token.LastByteIndex()
-		text := input[previousByteIndex+1 : currentByteIndex+1]
-		previousByteIndex = currentByteIndex
+		color := DefaultColor
+		text := input[previousByteIndex+1 : currentFirstByteIndex]
+		previousByteIndex = currentLastByteIndex
 		lineBuffer = lineBuffer[:0]
-
-	charLoop:
-		for _, char := range text {
-			if lineCharIndex >= col || char == '\n' {
-				lineNumber++
-				lineCharIndex = 0
-				if lineNumber-1 < startLine {
+		interToken := true
+
+	repeatLoop:
+		for {
+
+		charLoop:
+			for _, char := range text {
+				if lineCharIndex >= col || char == '\n' {
+					lineNumber++
+					lineCharIndex = 0
+					if lineNumber-1 < startLine {
+						continue charLoop
+					}
+					if lineNumber >= endLine {
+						break tokenLoop
+					}
+					lineBuffer = append(lineBuffer, '\n')
+					r.writeColor(lineBuffer, color)
+					r.renderPrefix(multilinePrefix)
+					lineBuffer = lineBuffer[:0]
+					if char != '\n' {
+						size := utf8.EncodeRune(runeBuffer, char)
+						lineBuffer = append(lineBuffer, runeBuffer[:size]...)
+						lineCharIndex += istrings.GetRuneWidth(char)
+					}
 					continue charLoop
 				}
-				if lineNumber >= endLine {
-					break tokenLoop
-				}
-				lineBuffer = append(lineBuffer, '\n')
-				r.writeColor(lineBuffer, token.Color())
-				r.renderPrefix(multilinePrefix)
-				lineBuffer = lineBuffer[:0]
-				if char != '\n' {
-					size := utf8.EncodeRune(runeBuffer, char)
-					lineBuffer = append(lineBuffer, runeBuffer[:size]...)
-					lineCharIndex += istrings.GetRuneWidth(char)
+
+				lineCharIndex += istrings.GetRuneWidth(char)
+				if lineNumber < startLine {
+					continue charLoop
 				}
-				continue charLoop
+				size := utf8.EncodeRune(runeBuffer, char)
+				lineBuffer = append(lineBuffer, runeBuffer[:size]...)
+			}
+			if len(lineBuffer) > 0 {
+				r.writeColor(lineBuffer, color)
 			}
 
-			lineCharIndex += istrings.GetRuneWidth(char)
-			if lineNumber < startLine {
-				continue charLoop
+			if !interToken {
+				break repeatLoop
 			}
-			size := utf8.EncodeRune(runeBuffer, char)
-			lineBuffer = append(lineBuffer, runeBuffer[:size]...)
-		}
-		if len(lineBuffer) > 0 {
-			r.writeColor(lineBuffer, token.Color())
+
+			if noToken {
+				break tokenLoop
+			}
+			color = tokenColor
+			text = input[currentFirstByteIndex : currentLastByteIndex+1]
+			lineBuffer = lineBuffer[:0]
+			interToken = false
 		}
 	}
+
 }
 
 // BreakLine to break line.