feat(parser): initial poc done

This commit is contained in:
Chen Asraf
2022-08-16 01:01:57 +03:00
parent 48832e23ea
commit c04b73f014
4 changed files with 154 additions and 90 deletions

View File

@@ -1,11 +1,11 @@
import { StringReader } from '../src/reader' import { StringReader } from '../src/reader'
import { LexerToken, Lexer } from '../src/tokenizer' import { LexerToken, Lexer } from '../src/lexer'
describe('Phrase', () => { describe('Phrase', () => {
test('Should tokenize single', () => { test('Should tokenize single', () => {
const reader = new StringReader('"phrase"') const reader = new StringReader('"phrase"')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.quote) expect(tokens[0].token).toBe(LexerToken.quote)
expect(tokens[0].value).toBe('"') expect(tokens[0].value).toBe('"')
@@ -19,8 +19,8 @@ describe('Phrase', () => {
test('Should tokenize multi', () => { test('Should tokenize multi', () => {
const reader = new StringReader('"one two three 123 !@#"') const reader = new StringReader('"one two three 123 !@#"')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.quote) expect(tokens[0].token).toBe(LexerToken.quote)
expect(tokens[0].value).toBe('"') expect(tokens[0].value).toBe('"')
@@ -36,8 +36,8 @@ describe('Phrase', () => {
describe('Word', () => { describe('Word', () => {
test('Should tokenize single', () => { test('Should tokenize single', () => {
const reader = new StringReader('word') const reader = new StringReader('word')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('word') expect(tokens[0].value).toBe('word')
@@ -45,8 +45,8 @@ describe('Word', () => {
test('Should tokenize multi', () => { test('Should tokenize multi', () => {
const reader = new StringReader('one two three 123') const reader = new StringReader('one two three 123')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('one') expect(tokens[0].value).toBe('one')
@@ -74,8 +74,8 @@ describe('Word', () => {
describe('Groups', () => { describe('Groups', () => {
test('Should tokenize single-word group', () => { test('Should tokenize single-word group', () => {
const reader = new StringReader('(word)') const reader = new StringReader('(word)')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.group) expect(tokens[0].token).toBe(LexerToken.group)
expect(tokens[0].value).toBe('(') expect(tokens[0].value).toBe('(')
@@ -91,8 +91,8 @@ describe('Groups', () => {
describe('Logical operator OR', () => { describe('Logical operator OR', () => {
test('should parse OR separator', () => { test('should parse OR separator', () => {
const reader = new StringReader('word OR word') const reader = new StringReader('word OR word')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('word') expect(tokens[0].value).toBe('word')
@@ -112,8 +112,8 @@ describe('Logical operator OR', () => {
test('should not parse OR separator mid-word', () => { test('should not parse OR separator mid-word', () => {
const reader = new StringReader('wordORword') const reader = new StringReader('wordORword')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('wordORword') expect(tokens[0].value).toBe('wordORword')
@@ -121,8 +121,8 @@ describe('Logical operator OR', () => {
test('should parse | separator', () => { test('should parse | separator', () => {
const reader = new StringReader('word | word') const reader = new StringReader('word | word')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('word') expect(tokens[0].value).toBe('word')
@@ -144,8 +144,8 @@ describe('Logical operator OR', () => {
describe('Logical operator AND', () => { describe('Logical operator AND', () => {
test('should parse AND separator', () => { test('should parse AND separator', () => {
const reader = new StringReader('word AND word') const reader = new StringReader('word AND word')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('word') expect(tokens[0].value).toBe('word')
@@ -165,8 +165,8 @@ describe('Logical operator AND', () => {
test('should not parse AND separator mid-word', () => { test('should not parse AND separator mid-word', () => {
const reader = new StringReader('wordANDword') const reader = new StringReader('wordANDword')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('wordANDword') expect(tokens[0].value).toBe('wordANDword')
@@ -174,8 +174,8 @@ describe('Logical operator AND', () => {
test('should parse & separator', () => { test('should parse & separator', () => {
const reader = new StringReader('word & word') const reader = new StringReader('word & word')
const tokenizer = new Lexer(reader) const lexer = new Lexer(reader)
const tokens = tokenizer.parse() const tokens = lexer.parse()
expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].token).toBe(LexerToken.word)
expect(tokens[0].value).toBe('word') expect(tokens[0].value).toBe('word')

View File

@@ -1,6 +1,6 @@
import { Operator, Parser, Word } from '../src/parser' import { Operator, Parser, Word } from '../src/parser'
import { StringReader } from '../src/reader' import { StringReader } from '../src/reader'
import { Lexer } from '../src/tokenizer' import { Lexer } from '../src/lexer'
test('should parse single word', () => { test('should parse single word', () => {
const reader = new StringReader('word') const reader = new StringReader('word')
@@ -19,5 +19,5 @@ test('should parse OR operator', () => {
const wordToken = tokens[0] as Operator const wordToken = tokens[0] as Operator
expect(wordToken.type).toBe('operator') expect(wordToken.type).toBe('operator')
expect(wordToken.left.value).toBe('word') expect(wordToken.left.value).toBe('word')
expect(wordToken.right.value).toBe('"phrase"') expect(wordToken.right.value).toBe('phrase')
}) })

View File

@@ -1,6 +1,6 @@
import { InputReader } from './reader' import { InputReader } from './reader'
export enum TokenizerState { export enum lexerState {
default, default,
inPhrase, inPhrase,
} }
@@ -19,8 +19,8 @@ export interface LexerTokenValue {
} }
export abstract class ILexer { export abstract class ILexer {
public abstract peek(): LexerTokenValue public abstract peek(amount?: number): LexerTokenValue | null
public abstract consume(): LexerTokenValue public abstract consume(amount?: number): LexerTokenValue | null
public abstract isEOF(): boolean public abstract isEOF(): boolean
public abstract parse(): LexerTokenValue[] public abstract parse(): LexerTokenValue[]
public abstract index: number public abstract index: number
@@ -29,44 +29,79 @@ export abstract class ILexer {
export class Lexer implements ILexer { export class Lexer implements ILexer {
reader: InputReader<string> reader: InputReader<string>
state: TokenizerState = TokenizerState.default state: lexerState = lexerState.default
quoteTerminator: string | null = null quoteTerminator: string | null = null
index: number = 0 index: number = 0
peekIndex: number = 0 peekIndex: number = 0
afterWhitespace: boolean = false afterWhitespace: boolean = false
cache: LexerTokenValue[] = []
constructor(reader: InputReader<string>) { constructor(reader: InputReader<string>) {
this.reader = reader this.reader = reader
} }
// TODO implement peek by (n)? public peek(amount = 0): LexerTokenValue | null {
public peek(): LexerTokenValue { const cacheIndex = this.index + amount
// save state before peeking if (this.isEOF()) {
const beforePeekState = this.state return null
const beforePeekIndex = this.reader.index }
const beforePeekWhiteSpace = this.afterWhitespace
const value = this.readNextToken() if (this.cache[cacheIndex]) {
return this.cache[cacheIndex]
}
// save state before peeking
// const beforePeekState = this.state
// const beforePeekIndex = this.reader.index
// const beforePeekWhiteSpace = this.afterWhitespace
this.fillCache(cacheIndex)
const token = this.cache[cacheIndex]
// restore state after peeking // restore state after peeking
this.state = beforePeekState // this.state = beforePeekState
this.reader.setIndex(beforePeekIndex - 1) // this.reader.setIndex(beforePeekIndex)
this.afterWhitespace = beforePeekWhiteSpace // this.afterWhitespace = beforePeekWhiteSpace
return value return token
} }
// TODO implement consume by (n)? public consume(amount = 0): LexerTokenValue | null {
public consume(): LexerTokenValue { const cacheIndex = this.index + amount
const token = this.readNextToken() this.index = cacheIndex + 1
this.index++
if (this.cache[cacheIndex]) {
return this.cache[cacheIndex]
}
if (this.isEOF()) {
return null
}
this.fillCache(cacheIndex)
const token = this.cache[cacheIndex]
return token return token
} }
/**
 * Populates token cache slots `0..n` (inclusive), reading a new token for
 * every slot that does not hold a truthy entry yet.
 *
 * While a slot is being processed `this.index` is pointed at that slot.
 * The value `this.index` had on entry is restored on every exit path,
 * including the early exit taken when `isEOF()` reports the end of input,
 * so look-ahead never moves the caller's position.
 *
 * @param n - Highest cache slot that should be populated.
 */
private fillCache(n: number) {
  const savedIndex = this.index
  try {
    for (let i = 0; i <= n; i++) {
      this.index = i
      if (this.isEOF()) {
        // Nothing left to read; slots from `i` upwards stay as they are.
        return
      }
      if (this.cache[i]) {
        continue
      }
      // NOTE(review): readNextToken() is typed as nullable, so a null can be
      // stored here; such a slot is falsy and will be read again on the next
      // fill. Confirm whether that re-read is intended.
      const value = this.readNextToken()
      this.cache[i] = value!
    }
  } finally {
    // Previously skipped by the early return above, which left `this.index`
    // stuck at the loop counter after hitting the end of input.
    this.index = savedIndex
  }
}
public parse(): LexerTokenValue[] { public parse(): LexerTokenValue[] {
const tokens: LexerTokenValue[] = [] const tokens: LexerTokenValue[] = []
while (!this.isEOF()) { while (!this.isEOF()) {
tokens.push(this.consume()) tokens.push(this.consume()!)
} }
return tokens return tokens
} }
@@ -79,10 +114,10 @@ export class Lexer implements ILexer {
return this.reader.isEOF() return this.reader.isEOF()
} }
private readNextToken(): LexerTokenValue { private readNextToken(): LexerTokenValue | null {
const nextChar = this.reader.peek() const nextChar = this.reader.peek()
switch (this.state) { switch (this.state) {
case TokenizerState.default: case lexerState.default:
// whitespace // whitespace
if (this.isWhitespace(nextChar)) { if (this.isWhitespace(nextChar)) {
this.afterWhitespace = true this.afterWhitespace = true
@@ -94,7 +129,7 @@ export class Lexer implements ILexer {
// quote // quote
if (`"'`.includes(nextChar)) { if (`"'`.includes(nextChar)) {
this.state = TokenizerState.inPhrase this.state = lexerState.inPhrase
this.quoteTerminator = nextChar this.quoteTerminator = nextChar
return this.consumeQuote() return this.consumeQuote()
} }
@@ -131,12 +166,12 @@ export class Lexer implements ILexer {
// other, consume normally // other, consume normally
return this.consumeWord() return this.consumeWord()
case TokenizerState.inPhrase: case lexerState.inPhrase:
this.afterWhitespace = false this.afterWhitespace = false
// in phrase mode, consume until quote terminator // in phrase mode, consume until quote terminator
if (nextChar === this.quoteTerminator) { if (nextChar === this.quoteTerminator) {
this.state = TokenizerState.default this.state = lexerState.default
return this.consumeQuote() return this.consumeQuote()
} }

View File

@@ -1,5 +1,5 @@
import { InputReader } from './reader' import { InputReader } from './reader'
import { ILexer, LexerToken, LexerTokenValue } from './tokenizer' import { ILexer, LexerToken, LexerTokenValue } from './lexer'
export interface ParserTokenValue { export interface ParserTokenValue {
type: 'word' | 'operator' | 'phrase' | 'group' type: 'word' | 'operator' | 'phrase' | 'group'
@@ -38,8 +38,8 @@ export abstract class IParser {
this.lexer = lexer this.lexer = lexer
} }
public abstract peek(): ParserToken | null public abstract peek(amount?: number): ParserToken | null
public abstract consume(): ParserToken | null public abstract consume(amount?: number): ParserToken | null
public abstract parse(): ParserToken[] public abstract parse(): ParserToken[]
public abstract isEOF(): boolean public abstract isEOF(): boolean
} }
@@ -51,47 +51,60 @@ export enum ParserState {
export class Parser extends IParser { export class Parser extends IParser {
index = 0 index = 0
state = ParserState.default state = ParserState.default
stack: ParserToken[] = [] cache: ParserToken[] = []
constructor(lexer: ILexer) { constructor(lexer: ILexer) {
super(lexer) super(lexer)
this.state = ParserState.default this.state = ParserState.default
} }
public peek(): ParserToken | null { public peek(amount = 0): ParserToken | null {
const cacheIndex = this.index + amount
if (this.isEOF()) { if (this.isEOF()) {
return null return null
} }
if (this.index < this.stack.length) { if (cacheIndex < this.cache.length) {
return this.stack[this.index] return this.cache[cacheIndex]
} }
// const beforePeekIndex = this.lexer.index
const beforePeekIndex = this.lexer.index this.fillCache(cacheIndex)
const value = this.readNextToken() const token = this.cache[cacheIndex]
if (value) { // this.lexer.setIndex(beforePeekIndex)
this.stack.push(value) return token
}
this.lexer.setIndex(beforePeekIndex)
return value
} }
public consume(): ParserToken | null { public consume(amount = 0): ParserToken | null {
const cacheIndex = this.index + amount
this.index = cacheIndex + 1
if (this.cache[cacheIndex]) {
return this.cache[cacheIndex]
}
if (this.isEOF()) { if (this.isEOF()) {
return null return null
} }
if (this.index < this.stack.length) {
this.index++
return this.stack[this.index]
}
const token = this.readNextToken() this.fillCache(cacheIndex)
this.index++ const token = this.cache[cacheIndex]
if (token) {
this.stack.push(token)
}
return token return token
} }
/**
 * Populates parser cache slots `0..n` (inclusive), reading a new parser
 * token for every slot that does not hold a truthy entry yet.
 *
 * While a slot is being processed `this.index` is pointed at that slot.
 * The value `this.index` had on entry is restored on every exit path,
 * including the early exit taken when `isEOF()` reports true, so a
 * look-ahead that runs past the end of input no longer moves the caller's
 * position.
 *
 * @param n - Highest cache slot that should be populated.
 */
private fillCache(n: number) {
  const savedIndex = this.index
  try {
    for (let i = 0; i <= n; i++) {
      this.index = i
      if (this.isEOF()) {
        // Nothing left to read; slots from `i` upwards stay as they are.
        return
      }
      if (this.cache[i]) {
        continue
      }
      // NOTE(review): readNextToken() may return null, so a null can be
      // stored here despite the non-null assertion; such a slot is falsy and
      // will be read again on the next fill. Confirm whether that is intended.
      const value = this.readNextToken()
      this.cache[i] = value!
    }
  } finally {
    // Previously skipped by the early return above, which left `this.index`
    // stuck at the loop counter once the end of input was reached.
    this.index = savedIndex
  }
}
public parse(): ParserToken[] { public parse(): ParserToken[] {
const tokens: ParserToken[] = [] const tokens: ParserToken[] = []
while (!this.isEOF()) { while (!this.isEOF()) {
@@ -109,26 +122,31 @@ export class Parser extends IParser {
} }
private readNextToken(): ParserToken | null { private readNextToken(): ParserToken | null {
const token = this.lexer.consume() let token = this.lexer.peek()
let nextToken = this.lexer.peek() let nextToken = this.lexer.peek(1)
// TODO reset lexer index?
while (nextToken?.token === 'whitespace') {
this.lexer.consume()
nextToken = this.lexer.peek()
}
switch (this.state) { switch (this.state) {
case ParserState.default: case ParserState.default:
if (nextToken.token === 'group') { if (token?.token === 'whitespace') {
this.index++ this.index++
this.lexer.consume()
return this.readNextToken() return this.readNextToken()
} }
switch (token.token) { while (nextToken && nextToken.token === 'whitespace') {
nextToken = this.lexer.peek(1)
this.lexer.consume()
}
if (nextToken?.token === 'group' || nextToken?.token === 'operator') {
this.index++
return this.consumeOperator(token!, nextToken)
}
switch (token?.token) {
case LexerToken.word: case LexerToken.word:
return { type: 'word', value: token.value } return { type: 'word', value: this.lexer.consume()!.value }
case LexerToken.quote: case LexerToken.quote:
return { type: 'phrase', value: token.value, quote: token.value as '"' } return this.consumePhrase(token)
case LexerToken.operator: case LexerToken.operator:
return this.consumeOperator(token) return this.consumeOperator(token, nextToken!)
default: default:
return null return null
} }
@@ -137,9 +155,20 @@ export class Parser extends IParser {
} }
} }
private consumeOperator(token: LexerTokenValue): ParserToken | null { private consumePhrase(token: LexerTokenValue): ParserToken | null {
const left = this.stack[this.stack.length - 1] this.lexer.consume()
const quoteContent = this.lexer.consume()!
this.lexer.consume()
return { type: 'phrase', value: quoteContent.value, quote: token.value as '"' }
}
private consumeOperator(left: LexerTokenValue, opToken: LexerTokenValue): ParserToken | null {
// const left = this.cache[this.cache.length - 1]
this.index++
this.lexer.consume()
const right = this.readNextToken() const right = this.readNextToken()
return { type: 'operator', value: token.value, left, right } this.lexer.consume()
// const right = this.readNextToken()
return { type: 'operator', value: opToken.value, left, right }
} }
} }