From c04b73f014da41f2abfc0fdd6a746b3ede699280 Mon Sep 17 00:00:00 2001
From: Chen Asraf <chenasrafil@gmail.com>
Date: Tue, 16 Aug 2022 01:01:57 +0300
Subject: [PATCH] feat(parser): initial poc done

---
 .../{tokenizer_test.ts => lexer_test.ts}      |  46 ++++----
 __tests__/parser_test.ts                      |   4 +-
 src/{tokenizer.ts => lexer.ts}                |  85 ++++++++++----
 src/parser.ts                                 | 109 +++++++++++-------
 4 files changed, 154 insertions(+), 90 deletions(-)
 rename __tests__/{tokenizer_test.ts => lexer_test.ts} (84%)
 rename src/{tokenizer.ts => lexer.ts} (74%)

diff --git a/__tests__/tokenizer_test.ts b/__tests__/lexer_test.ts
similarity index 84%
rename from __tests__/tokenizer_test.ts
rename to __tests__/lexer_test.ts
index 9fc883b..2e7b07d 100644
--- a/__tests__/tokenizer_test.ts
+++ b/__tests__/lexer_test.ts
@@ -1,11 +1,11 @@
 import { StringReader } from '../src/reader'
-import { LexerToken, Lexer } from '../src/tokenizer'
+import { LexerToken, Lexer } from '../src/lexer'
 
 describe('Phrase', () => {
   test('Should tokenize single', () => {
     const reader = new StringReader('"phrase"')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.quote)
     expect(tokens[0].value).toBe('"')
@@ -19,8 +19,8 @@ describe('Phrase', () => {
 
   test('Should tokenize multi', () => {
     const reader = new StringReader('"one two three 123 !@#"')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.quote)
     expect(tokens[0].value).toBe('"')
@@ -36,8 +36,8 @@ describe('Phrase', () => {
 describe('Word', () => {
   test('Should tokenize single', () => {
     const reader = new StringReader('word')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('word')
@@ -45,8 +45,8 @@ describe('Word', () => {
 
   test('Should tokenize multi', () => {
     const reader = new StringReader('one two three 123')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('one')
@@ -74,8 +74,8 @@ describe('Word', () => {
 describe('Groups', () => {
   test('Should tokenize single-word group', () => {
     const reader = new StringReader('(word)')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.group)
     expect(tokens[0].value).toBe('(')
@@ -91,8 +91,8 @@ describe('Groups', () => {
 describe('Logical operator OR', () => {
   test('should parse OR separator', () => {
     const reader = new StringReader('word OR word')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('word')
@@ -112,8 +112,8 @@ describe('Logical operator OR', () => {
 
   test('should not parse OR separator mid-word', () => {
     const reader = new StringReader('wordORword')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('wordORword')
@@ -121,8 +121,8 @@ describe('Logical operator OR', () => {
 
   test('should parse | separator', () => {
     const reader = new StringReader('word | word')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('word')
@@ -144,8 +144,8 @@ describe('Logical operator OR', () => {
 describe('Logical operator AND', () => {
   test('should parse AND separator', () => {
     const reader = new StringReader('word AND word')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('word')
@@ -165,8 +165,8 @@ describe('Logical operator AND', () => {
 
   test('should not parse AND separator mid-word', () => {
     const reader = new StringReader('wordANDword')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('wordANDword')
@@ -174,8 +174,8 @@ describe('Logical operator AND', () => {
 
   test('should parse & separator', () => {
     const reader = new StringReader('word & word')
-    const tokenizer = new Lexer(reader)
-    const tokens = tokenizer.parse()
+    const lexer = new Lexer(reader)
+    const tokens = lexer.parse()
 
     expect(tokens[0].token).toBe(LexerToken.word)
     expect(tokens[0].value).toBe('word')
diff --git a/__tests__/parser_test.ts b/__tests__/parser_test.ts
index 85b9bf1..940a2b6 100644
--- a/__tests__/parser_test.ts
+++ b/__tests__/parser_test.ts
@@ -1,6 +1,6 @@
 import { Operator, Parser, Word } from '../src/parser'
 import { StringReader } from '../src/reader'
-import { Lexer } from '../src/tokenizer'
+import { Lexer } from '../src/lexer'
 
 test('should parse single word', () => {
   const reader = new StringReader('word')
@@ -19,5 +19,5 @@ test('should parse OR operator', () => {
   const wordToken = tokens[0] as Operator
   expect(wordToken.type).toBe('operator')
   expect(wordToken.left.value).toBe('word')
-  expect(wordToken.right.value).toBe('"phrase"')
+  expect(wordToken.right.value).toBe('phrase')
 })
diff --git a/src/tokenizer.ts b/src/lexer.ts
similarity index 74%
rename from src/tokenizer.ts
rename to src/lexer.ts
index a6fd178..a3171c2 100644
--- a/src/tokenizer.ts
+++ b/src/lexer.ts
@@ -1,6 +1,6 @@
 import { InputReader } from './reader'
 
-export enum TokenizerState {
+export enum lexerState { // lexing modes that Lexer.readNextToken switches on
   default,
   inPhrase,
 }
@@ -19,8 +19,8 @@ export interface LexerTokenValue {
 }
 
 export abstract class ILexer {
-  public abstract peek(): LexerTokenValue
-  public abstract consume(): LexerTokenValue
+  public abstract peek(amount?: number): LexerTokenValue | null
+  public abstract consume(amount?: number): LexerTokenValue | null
   public abstract isEOF(): boolean
   public abstract parse(): LexerTokenValue[]
   public abstract index: number
@@ -29,44 +29,79 @@ export abstract class ILexer {
 
 export class Lexer implements ILexer {
   reader: InputReader<string>
-  state: TokenizerState = TokenizerState.default
+  state: lexerState = lexerState.default
   quoteTerminator: string | null = null
   index: number = 0
   peekIndex: number = 0
   afterWhitespace: boolean = false
+  cache: LexerTokenValue[] = []
 
   constructor(reader: InputReader<string>) {
     this.reader = reader
   }
 
-  // TODO implement peek by (n)?
-  public peek(): LexerTokenValue {
-    // save state before peeking
-    const beforePeekState = this.state
-    const beforePeekIndex = this.reader.index
-    const beforePeekWhiteSpace = this.afterWhitespace
+  /**
+   * Looks up the token `amount` slots past the cursor (`this.index`),
+   * lexing ahead on a cache miss. Returns null when no token is available.
+   */
+  public peek(amount = 0): LexerTokenValue | null {
+    const cacheIndex = this.index + amount
+    // Check the cache before EOF: the reader can already be exhausted
+    // while previously lexed tokens are still sitting in the cache.
+    if (this.cache[cacheIndex]) {
+      return this.cache[cacheIndex]
+    }
 
-    const value = this.readNextToken()
+    if (this.isEOF()) {
+      return null
+    }
+    this.fillCache(cacheIndex)
+    const token = this.cache[cacheIndex]
 
     // restore state after peeking
-    this.state = beforePeekState
-    this.reader.setIndex(beforePeekIndex - 1)
-    this.afterWhitespace = beforePeekWhiteSpace
+    // (nothing is snapshotted or restored here; lookahead is kept in this.cache)
+    // fillCache stops when the reader hits EOF, so the slot can still be
+    // empty; return null rather than undefined in that case.
 
-    return value
+    return token ?? null
   }
 
-  // TODO implement consume by (n)?
-  public consume(): LexerTokenValue {
-    const token = this.readNextToken()
-    this.index++
+  public consume(amount = 0): LexerTokenValue | null {
+    const cacheIndex = this.index + amount
+    this.index = cacheIndex + 1 // cursor advances before the lookup, even if null is returned below
+    // Slot already lexed (e.g. by an earlier peek): return it as-is.
+    if (this.cache[cacheIndex]) {
+      return this.cache[cacheIndex]
+    }
+    if (this.isEOF()) {
+      return null
+    }
+    // Cache miss with input remaining: lex forward to the slot.
+    this.fillCache(cacheIndex)
+    const token = this.cache[cacheIndex]
     return token
   }
 
+  /** Fills cache slots 0..n (stopping at EOF) and restores this.index on exit. */
+  private fillCache(n: number) {
+    const { index } = this
+    for (let i = 0; i <= n; i++) {
+      this.index = i
+      if (this.isEOF()) {
+        break // not `return`: the cursor must still be restored below
+      }
+      if (this.cache[i]) {
+        continue
+      }
+      this.cache[i] = this.readNextToken()!
+    }
+    this.index = index
+  }
+
   public parse(): LexerTokenValue[] {
     const tokens: LexerTokenValue[] = []
     while (!this.isEOF()) {
-      tokens.push(this.consume())
+      tokens.push(this.consume()!)
     }
     return tokens
   }
@@ -79,10 +114,10 @@ export class Lexer implements ILexer {
     return this.reader.isEOF()
   }
 
-  private readNextToken(): LexerTokenValue {
+  private readNextToken(): LexerTokenValue | null {
     const nextChar = this.reader.peek()
     switch (this.state) {
-      case TokenizerState.default:
+      case lexerState.default:
         // whitespace
         if (this.isWhitespace(nextChar)) {
           this.afterWhitespace = true
@@ -94,7 +129,7 @@ export class Lexer implements ILexer {
 
         // quote
         if (`"'`.includes(nextChar)) {
-          this.state = TokenizerState.inPhrase
+          this.state = lexerState.inPhrase
           this.quoteTerminator = nextChar
           return this.consumeQuote()
         }
@@ -131,12 +166,12 @@ export class Lexer implements ILexer {
 
         // other, consume normally
         return this.consumeWord()
-      case TokenizerState.inPhrase:
+      case lexerState.inPhrase:
         this.afterWhitespace = false
 
         // in phrase mode, consume until quote terminator
         if (nextChar === this.quoteTerminator) {
-          this.state = TokenizerState.default
+          this.state = lexerState.default
           return this.consumeQuote()
         }
 
diff --git a/src/parser.ts b/src/parser.ts
index 4c5e4e9..e610065 100644
--- a/src/parser.ts
+++ b/src/parser.ts
@@ -1,5 +1,5 @@
 import { InputReader } from './reader'
-import { ILexer, LexerToken, LexerTokenValue } from './tokenizer'
+import { ILexer, LexerToken, LexerTokenValue } from './lexer'
 
 export interface ParserTokenValue {
   type: 'word' | 'operator' | 'phrase' | 'group'
@@ -38,8 +38,8 @@ export abstract class IParser {
     this.lexer = lexer
   }
 
-  public abstract peek(): ParserToken | null
-  public abstract consume(): ParserToken | null
+  public abstract peek(amount?: number): ParserToken | null
+  public abstract consume(amount?: number): ParserToken | null
   public abstract parse(): ParserToken[]
   public abstract isEOF(): boolean
 }
@@ -51,47 +51,60 @@ export enum ParserState {
 export class Parser extends IParser {
   index = 0
   state = ParserState.default
-  stack: ParserToken[] = []
+  cache: ParserToken[] = []
 
   constructor(lexer: ILexer) {
     super(lexer)
     this.state = ParserState.default
   }
 
-  public peek(): ParserToken | null {
+  public peek(amount = 0): ParserToken | null {
+    const cacheIndex = this.index + amount
     if (this.isEOF()) {
       return null
     }
-    if (this.index < this.stack.length) {
-      return this.stack[this.index]
+    if (cacheIndex < this.cache.length) {
+      return this.cache[cacheIndex]
     }
-
-    const beforePeekIndex = this.lexer.index
-    const value = this.readNextToken()
-    if (value) {
-      this.stack.push(value)
-    }
-    this.lexer.setIndex(beforePeekIndex)
-    return value
+    // Cache miss: parse forward so slot cacheIndex gets populated.
+    this.fillCache(cacheIndex)
+    const token = this.cache[cacheIndex]
+    // NOTE(review): the lexer position is no longer saved/restored around this call — confirm that is intended.
+    return token
   }
 
-  public consume(): ParserToken | null {
+  public consume(amount = 0): ParserToken | null {
+    const cacheIndex = this.index + amount
+    this.index = cacheIndex + 1 // cursor advances before the lookup, even if null is returned below
+    // Slot already parsed: return it without reading from the lexer.
+    if (this.cache[cacheIndex]) {
+      return this.cache[cacheIndex]
+    }
     if (this.isEOF()) {
       return null
     }
-    if (this.index < this.stack.length) {
-      this.index++
-      return this.stack[this.index]
-    }
 
-    const token = this.readNextToken()
-    this.index++
-    if (token) {
-      this.stack.push(token)
-    }
+    this.fillCache(cacheIndex)
+    const token = this.cache[cacheIndex]
     return token
   }
 
+  /** Fills cache slots 0..n (stopping at EOF) and restores this.index on exit. */
+  private fillCache(n: number) {
+    const { index } = this
+    for (let i = 0; i <= n; i++) {
+      this.index = i
+      if (this.isEOF()) {
+        break // not `return`: the cursor must still be restored below
+      }
+      if (this.cache[i]) {
+        continue
+      }
+      this.cache[i] = this.readNextToken()!
+    }
+    this.index = index
+  }
+
   public parse(): ParserToken[] {
     const tokens: ParserToken[] = []
     while (!this.isEOF()) {
@@ -109,26 +122,31 @@ export class Parser extends IParser {
   }
 
   private readNextToken(): ParserToken | null {
-    const token = this.lexer.consume()
-    let nextToken = this.lexer.peek()
-    // TODO reset lexer index?
-    while (nextToken?.token === 'whitespace') {
-      this.lexer.consume()
-      nextToken = this.lexer.peek()
-    }
+    let token = this.lexer.peek()
+    let nextToken = this.lexer.peek(1)
+
     switch (this.state) {
       case ParserState.default:
-        if (nextToken.token === 'group') {
+        if (token?.token === 'whitespace') {
           this.index++
+          this.lexer.consume()
           return this.readNextToken()
         }
-        switch (token.token) {
+        while (nextToken && nextToken.token === 'whitespace') {
+          nextToken = this.lexer.peek(1)
+          this.lexer.consume()
+        }
+        if (nextToken?.token === 'group' || nextToken?.token === 'operator') {
+          this.index++
+          return this.consumeOperator(token!, nextToken)
+        }
+        switch (token?.token) {
           case LexerToken.word:
-            return { type: 'word', value: token.value }
+            return { type: 'word', value: this.lexer.consume()!.value }
           case LexerToken.quote:
-            return { type: 'phrase', value: token.value, quote: token.value as '"' }
+            return this.consumePhrase(token)
           case LexerToken.operator:
-            return this.consumeOperator(token)
+            return this.consumeOperator(token, nextToken!)
           default:
             return null
         }
@@ -137,9 +155,20 @@ export class Parser extends IParser {
     }
   }
 
-  private consumeOperator(token: LexerTokenValue): ParserToken | null {
-    const left = this.stack[this.stack.length - 1]
+  private consumePhrase(token: LexerTokenValue): ParserToken | null {
+    this.lexer.consume() // first token of the phrase; the caller passes it in as `token` (a quote)
+    const quoteContent = this.lexer.consume()! // next token is taken as the phrase text
+    this.lexer.consume() // one more token is skipped — presumably the closing quote; TODO confirm for empty phrases
+    return { type: 'phrase', value: quoteContent.value, quote: token.value as '"' }
+  }
+
+  private consumeOperator(left: LexerTokenValue, opToken: LexerTokenValue): ParserToken | null {
+    // `left` is supplied by the caller rather than read back from the parser cache.
+    this.index++
+    this.lexer.consume()
     const right = this.readNextToken()
-    return { type: 'operator', value: token.value, left, right }
+    this.lexer.consume()
+    // NOTE(review): one further lexer token is consumed after `right` is parsed — confirm which token that skips.
+    return { type: 'operator', value: opToken.value, left, right }
   }
 }