From c04b73f014da41f2abfc0fdd6a746b3ede699280 Mon Sep 17 00:00:00 2001 From: Chen Asraf Date: Tue, 16 Aug 2022 01:01:57 +0300 Subject: [PATCH] feat(parser): initial poc done --- .../{tokenizer_test.ts => lexer_test.ts} | 46 ++++---- __tests__/parser_test.ts | 4 +- src/{tokenizer.ts => lexer.ts} | 85 ++++++++++---- src/parser.ts | 109 +++++++++++------- 4 files changed, 154 insertions(+), 90 deletions(-) rename __tests__/{tokenizer_test.ts => lexer_test.ts} (84%) rename src/{tokenizer.ts => lexer.ts} (74%) diff --git a/__tests__/tokenizer_test.ts b/__tests__/lexer_test.ts similarity index 84% rename from __tests__/tokenizer_test.ts rename to __tests__/lexer_test.ts index 9fc883b..2e7b07d 100644 --- a/__tests__/tokenizer_test.ts +++ b/__tests__/lexer_test.ts @@ -1,11 +1,11 @@ import { StringReader } from '../src/reader' -import { LexerToken, Lexer } from '../src/tokenizer' +import { LexerToken, Lexer } from '../src/lexer' describe('Phrase', () => { test('Should tokenize single', () => { const reader = new StringReader('"phrase"') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.quote) expect(tokens[0].value).toBe('"') @@ -19,8 +19,8 @@ describe('Phrase', () => { test('Should tokenize multi', () => { const reader = new StringReader('"one two three 123 !@#"') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.quote) expect(tokens[0].value).toBe('"') @@ -36,8 +36,8 @@ describe('Phrase', () => { describe('Word', () => { test('Should tokenize single', () => { const reader = new StringReader('word') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('word') @@ -45,8 +45,8 @@ describe('Word', () => { test('Should tokenize multi', () => { const reader = new StringReader('one two three 123') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('one') @@ -74,8 +74,8 @@ describe('Word', () => { describe('Groups', () => { test('Should tokenize single-word group', () => { const reader = new StringReader('(word)') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.group) expect(tokens[0].value).toBe('(') @@ -91,8 +91,8 @@ describe('Groups', () => { describe('Logical operator OR', () => { test('should parse OR separator', () => { const reader = new StringReader('word OR word') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('word') @@ -112,8 +112,8 @@ describe('Logical operator OR', () => { test('should not parse OR separator mid-word', () => { const reader = new StringReader('wordORword') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('wordORword') @@ -121,8 +121,8 @@ describe('Logical operator OR', () => { test('should parse | separator', () => { const reader = new StringReader('word | word') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('word') @@ -144,8 +144,8 @@ describe('Logical operator OR', () => { describe('Logical operator AND', () => { test('should parse AND separator', () => { const reader = new StringReader('word AND word') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('word') @@ -165,8 +165,8 @@ describe('Logical operator AND', () => { test('should not parse AND separator mid-word', () => { const reader = new StringReader('wordANDword') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('wordANDword') @@ -174,8 +174,8 @@ describe('Logical operator AND', () => { test('should parse & separator', () => { const reader = new StringReader('word & word') - const tokenizer = new Lexer(reader) - const tokens = tokenizer.parse() + const lexer = new Lexer(reader) + const tokens = lexer.parse() expect(tokens[0].token).toBe(LexerToken.word) expect(tokens[0].value).toBe('word') diff --git a/__tests__/parser_test.ts b/__tests__/parser_test.ts index 85b9bf1..940a2b6 100644 --- a/__tests__/parser_test.ts +++ b/__tests__/parser_test.ts @@ -1,6 +1,6 @@ import { Operator, Parser, Word } from '../src/parser' import { StringReader } from '../src/reader' -import { Lexer } from '../src/tokenizer' +import { Lexer } from '../src/lexer' test('should parse single word', () => { const reader = new StringReader('word') @@ -19,5 +19,5 @@ test('should parse OR operator', () => { const wordToken = tokens[0] as Operator expect(wordToken.type).toBe('operator') expect(wordToken.left.value).toBe('word') - expect(wordToken.right.value).toBe('"phrase"') + expect(wordToken.right.value).toBe('phrase') }) diff --git a/src/tokenizer.ts b/src/lexer.ts similarity index 74% rename from src/tokenizer.ts rename to src/lexer.ts index a6fd178..a3171c2 100644 --- a/src/tokenizer.ts +++ b/src/lexer.ts @@ -1,6 +1,6 @@ import { InputReader } from './reader' -export enum TokenizerState { +export enum lexerState { default, inPhrase, } @@ -19,8 +19,8 @@ export interface LexerTokenValue { } export abstract class ILexer { - public abstract peek(): LexerTokenValue - public abstract consume(): LexerTokenValue + public abstract peek(amount?: number): LexerTokenValue | null + public abstract consume(amount?: number): LexerTokenValue | null public abstract isEOF(): boolean public abstract parse(): LexerTokenValue[] public abstract index: number @@ -29,44 +29,79 @@ export abstract class ILexer { export class Lexer implements ILexer { reader: InputReader - state: TokenizerState = TokenizerState.default + state: lexerState = lexerState.default quoteTerminator: string | null = null index: number = 0 peekIndex: number = 0 afterWhitespace: boolean = false + cache: LexerTokenValue[] = [] constructor(reader: InputReader) { this.reader = reader } - // TODO implement peek by (n)? - public peek(): LexerTokenValue { - // save state before peeking - const beforePeekState = this.state - const beforePeekIndex = this.reader.index - const beforePeekWhiteSpace = this.afterWhitespace + public peek(amount = 0): LexerTokenValue | null { + const cacheIndex = this.index + amount + if (this.isEOF()) { + return null + } - const value = this.readNextToken() + if (this.cache[cacheIndex]) { + return this.cache[cacheIndex] + } + + // save state before peeking + // const beforePeekState = this.state + // const beforePeekIndex = this.reader.index + // const beforePeekWhiteSpace = this.afterWhitespace + + this.fillCache(cacheIndex) + const token = this.cache[cacheIndex] // restore state after peeking - this.state = beforePeekState - this.reader.setIndex(beforePeekIndex - 1) - this.afterWhitespace = beforePeekWhiteSpace + // this.state = beforePeekState + // this.reader.setIndex(beforePeekIndex) + // this.afterWhitespace = beforePeekWhiteSpace - return value + return token } - // TODO implement consume by (n)? - public consume(): LexerTokenValue { - const token = this.readNextToken() - this.index++ + public consume(amount = 0): LexerTokenValue | null { + const cacheIndex = this.index + amount + this.index = cacheIndex + 1 + + if (this.cache[cacheIndex]) { + return this.cache[cacheIndex] + } + if (this.isEOF()) { + return null + } + + this.fillCache(cacheIndex) + const token = this.cache[cacheIndex] return token } + private fillCache(n: number) { + const { index } = this + for (let i = 0; i <= n; i++) { + this.index = i + if (this.isEOF()) { + return + } + if (this.cache[i]) { + continue + } + const value = this.readNextToken() + this.cache[i] = value! + } + this.index = index + } + public parse(): LexerTokenValue[] { const tokens: LexerTokenValue[] = [] while (!this.isEOF()) { - tokens.push(this.consume()) + tokens.push(this.consume()!) } return tokens } @@ -79,10 +114,10 @@ export class Lexer implements ILexer { return this.reader.isEOF() } - private readNextToken(): LexerTokenValue { + private readNextToken(): LexerTokenValue | null { const nextChar = this.reader.peek() switch (this.state) { - case TokenizerState.default: + case lexerState.default: // whitespace if (this.isWhitespace(nextChar)) { this.afterWhitespace = true @@ -94,7 +129,7 @@ export class Lexer implements ILexer { // quote if (`"'`.includes(nextChar)) { - this.state = TokenizerState.inPhrase + this.state = lexerState.inPhrase this.quoteTerminator = nextChar return this.consumeQuote() } @@ -131,12 +166,12 @@ export class Lexer implements ILexer { // other, consume normally return this.consumeWord() - case TokenizerState.inPhrase: + case lexerState.inPhrase: this.afterWhitespace = false // in phrase mode, consume until quote terminator if (nextChar === this.quoteTerminator) { - this.state = TokenizerState.default + this.state = lexerState.default return this.consumeQuote() } diff --git a/src/parser.ts b/src/parser.ts index 4c5e4e9..e610065 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -1,5 +1,5 @@ import { InputReader } from './reader' -import { ILexer, LexerToken, LexerTokenValue } from './tokenizer' +import { ILexer, LexerToken, LexerTokenValue } from './lexer' export interface ParserTokenValue { type: 'word' | 'operator' | 'phrase' | 'group' @@ -38,8 +38,8 @@ export abstract class IParser { this.lexer = lexer } - public abstract peek(): ParserToken | null - public abstract consume(): ParserToken | null + public abstract peek(amount?: number): ParserToken | null + public abstract consume(amount?: number): ParserToken | null public abstract parse(): ParserToken[] public abstract isEOF(): boolean } @@ -51,47 +51,60 @@ export enum ParserState { export class Parser extends IParser { index = 0 state = ParserState.default - stack: ParserToken[] = [] + cache: ParserToken[] = [] constructor(lexer: ILexer) { super(lexer) this.state = ParserState.default } - public peek(): ParserToken | null { + public peek(amount = 0): ParserToken | null { + const cacheIndex = this.index + amount if (this.isEOF()) { return null } - if (this.index < this.stack.length) { - return this.stack[this.index] + if (cacheIndex < this.cache.length) { + return this.cache[cacheIndex] } - - const beforePeekIndex = this.lexer.index - const value = this.readNextToken() - if (value) { - this.stack.push(value) - } - this.lexer.setIndex(beforePeekIndex) - return value + // const beforePeekIndex = this.lexer.index + this.fillCache(cacheIndex) + const token = this.cache[cacheIndex] + // this.lexer.setIndex(beforePeekIndex) + return token } - public consume(): ParserToken | null { + public consume(amount = 0): ParserToken | null { + const cacheIndex = this.index + amount + this.index = cacheIndex + 1 + + if (this.cache[cacheIndex]) { + return this.cache[cacheIndex] + } if (this.isEOF()) { return null } - if (this.index < this.stack.length) { - this.index++ - return this.stack[this.index] - } - const token = this.readNextToken() - this.index++ - if (token) { - this.stack.push(token) - } + this.fillCache(cacheIndex) + const token = this.cache[cacheIndex] return token } + private fillCache(n: number) { + const { index } = this + for (let i = 0; i <= n; i++) { + this.index = i + if (this.isEOF()) { + return + } + if (this.cache[i]) { + continue + } + const value = this.readNextToken() + this.cache[i] = value! + } + this.index = index + } + public parse(): ParserToken[] { const tokens: ParserToken[] = [] while (!this.isEOF()) { @@ -109,26 +122,31 @@ export class Parser extends IParser { } private readNextToken(): ParserToken | null { - const token = this.lexer.consume() - let nextToken = this.lexer.peek() - // TODO reset lexer index? - while (nextToken?.token === 'whitespace') { - this.lexer.consume() - nextToken = this.lexer.peek() - } + let token = this.lexer.peek() + let nextToken = this.lexer.peek(1) + switch (this.state) { case ParserState.default: - if (nextToken.token === 'group') { + if (token?.token === 'whitespace') { this.index++ + this.lexer.consume() return this.readNextToken() } - switch (token.token) { + while (nextToken && nextToken.token === 'whitespace') { + nextToken = this.lexer.peek(1) + this.lexer.consume() + } + if (nextToken?.token === 'group' || nextToken?.token === 'operator') { + this.index++ + return this.consumeOperator(token!, nextToken) + } + switch (token?.token) { case LexerToken.word: - return { type: 'word', value: token.value } + return { type: 'word', value: this.lexer.consume()!.value } case LexerToken.quote: - return { type: 'phrase', value: token.value, quote: token.value as '"' } + return this.consumePhrase(token) case LexerToken.operator: - return this.consumeOperator(token) + return this.consumeOperator(token, nextToken!) default: return null } @@ -137,9 +155,20 @@ export class Parser extends IParser { } } - private consumeOperator(token: LexerTokenValue): ParserToken | null { - const left = this.stack[this.stack.length - 1] + private consumePhrase(token: LexerTokenValue): ParserToken | null { + this.lexer.consume() + const quoteContent = this.lexer.consume()! + this.lexer.consume() + return { type: 'phrase', value: quoteContent.value, quote: token.value as '"' } + } + + private consumeOperator(left: LexerTokenValue, opToken: LexerTokenValue): ParserToken | null { + // const left = this.cache[this.cache.length - 1] + this.index++ + this.lexer.consume() const right = this.readNextToken() - return { type: 'operator', value: token.value, left, right } + this.lexer.consume() + // const right = this.readNextToken() + return { type: 'operator', value: opToken.value, left, right } } }