From 48832e23eafbfe63ecfc7a6071d01e762584dc0d Mon Sep 17 00:00:00 2001 From: Chen Asraf Date: Sun, 14 Aug 2022 20:26:47 +0300 Subject: [PATCH] feat(parser): wip --- __tests__/parser_test.ts | 23 +++ __tests__/tokenizer_test.ts | 373 ++++++++++++++++++------------------ src/parser.ts | 145 ++++++++++++++ src/tokenizer.ts | 170 +++++++++------- 4 files changed, 453 insertions(+), 258 deletions(-) create mode 100644 __tests__/parser_test.ts diff --git a/__tests__/parser_test.ts b/__tests__/parser_test.ts new file mode 100644 index 0000000..85b9bf1 --- /dev/null +++ b/__tests__/parser_test.ts @@ -0,0 +1,23 @@ +import { Operator, Parser, Word } from '../src/parser' +import { StringReader } from '../src/reader' +import { Lexer } from '../src/tokenizer' + +test('should parse single word', () => { + const reader = new StringReader('word') + const lexer = new Lexer(reader) + const parser = new Parser(lexer) + const tokens = parser.parse() + const wordToken = tokens[0] as Word + expect(wordToken.type).toBe('word') + expect(wordToken.value).toBe('word') +}) +test('should parse OR operator', () => { + const reader = new StringReader('word OR "phrase"') + const lexer = new Lexer(reader) + const parser = new Parser(lexer) + const tokens = parser.parse() + const wordToken = tokens[0] as Operator + expect(wordToken.type).toBe('operator') + expect(wordToken.left.value).toBe('word') + expect(wordToken.right.value).toBe('"phrase"') +}) diff --git a/__tests__/tokenizer_test.ts b/__tests__/tokenizer_test.ts index e6a2503..9fc883b 100644 --- a/__tests__/tokenizer_test.ts +++ b/__tests__/tokenizer_test.ts @@ -1,202 +1,195 @@ import { StringReader } from '../src/reader' -import { Token, Tokenizer } from '../src/tokenizer' +import { LexerToken, Lexer } from '../src/tokenizer' -describe('Tokenizer', () => { - describe('Phrase', () => { - test('Should tokenize single', () => { - const reader = new StringReader('"phrase"') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() +describe('Phrase', () => { + test('Should tokenize single', () => { + const reader = new StringReader('"phrase"') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() - expect(tokens[0].token).toBe(Token.quote) - expect(tokens[0].value).toBe('"') + expect(tokens[0].token).toBe(LexerToken.quote) + expect(tokens[0].value).toBe('"') - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('phrase') + expect(tokens[1].token).toBe(LexerToken.word) + expect(tokens[1].value).toBe('phrase') - expect(tokens[2].token).toBe(Token.quote) - expect(tokens[2].value).toBe('"') - }) - - test('Should tokenize multi', () => { - const reader = new StringReader('"one two three 123 !@#"') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() - - expect(tokens[0].token).toBe(Token.quote) - expect(tokens[0].value).toBe('"') - - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('one two three 123 !@#') - - expect(tokens[2].token).toBe(Token.quote) - expect(tokens[2].value).toBe('"') - }) + expect(tokens[2].token).toBe(LexerToken.quote) + expect(tokens[2].value).toBe('"') }) - describe('Word', () => { - test('Should tokenize single', () => { - const reader = new StringReader('word') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() + test('Should tokenize multi', () => { + const reader = new StringReader('"one two three 123 !@#"') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() - expect(tokens[0].token).toBe(Token.word) - expect(tokens[0].value).toBe('word') - }) + expect(tokens[0].token).toBe(LexerToken.quote) + expect(tokens[0].value).toBe('"') - test('Should tokenize multi', () => { - const reader = new StringReader('one two three 123') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() + expect(tokens[1].token).toBe(LexerToken.word) + expect(tokens[1].value).toBe('one two three 123 !@#') - expect(tokens[0].token).toBe(Token.word) - expect(tokens[0].value).toBe('one') - - expect(tokens[1].token).toBe(Token.whitespace) - expect(tokens[1].value).toBe(' ') - - expect(tokens[2].token).toBe(Token.word) - expect(tokens[2].value).toBe('two') - - expect(tokens[3].token).toBe(Token.whitespace) - expect(tokens[3].value).toBe(' ') - - expect(tokens[4].token).toBe(Token.word) - expect(tokens[4].value).toBe('three') - - expect(tokens[5].token).toBe(Token.whitespace) - expect(tokens[5].value).toBe(' ') - - expect(tokens[6].token).toBe(Token.word) - expect(tokens[6].value).toBe('123') - }) - }) - - describe('Groups', () => { - test('Should tokenize single-word group', () => { - const reader = new StringReader('(word)') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() - - expect(tokens[0].token).toBe(Token.group) - expect(tokens[0].value).toBe('(') - - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('word') - - expect(tokens[2].token).toBe(Token.group) - expect(tokens[2].value).toBe(')') - }) - - describe('logical operator OR group', () => { - test('should parse OR separator', () => { - const reader = new StringReader('(word OR word)') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() - - expect(tokens[0].token).toBe(Token.group) - expect(tokens[0].value).toBe('(') - - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('word') - - expect(tokens[2].token).toBe(Token.whitespace) - expect(tokens[2].value).toBe(' ') - - expect(tokens[3].token).toBe(Token.operator) - expect(tokens[3].value).toBe('or') - - expect(tokens[4].token).toBe(Token.whitespace) - expect(tokens[4].value).toBe(' ') - - expect(tokens[5].token).toBe(Token.word) - expect(tokens[5].value).toBe('word') - - expect(tokens[6].token).toBe(Token.group) - expect(tokens[6].value).toBe(')') - }) - - test('should parse | separator', () => { - const reader = new StringReader('(word | word)') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() - - expect(tokens[0].token).toBe(Token.group) - expect(tokens[0].value).toBe('(') - - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('word') - - expect(tokens[2].token).toBe(Token.whitespace) - expect(tokens[2].value).toBe(' ') - - expect(tokens[3].token).toBe(Token.operator) - expect(tokens[3].value).toBe('|') - - expect(tokens[4].token).toBe(Token.whitespace) - expect(tokens[4].value).toBe(' ') - - expect(tokens[5].token).toBe(Token.word) - expect(tokens[5].value).toBe('word') - - expect(tokens[6].token).toBe(Token.group) - expect(tokens[6].value).toBe(')') - }) - }) - - describe('logical operator AND group', () => { - test('should parse AND separator', () => { - const reader = new StringReader('(word AND word)') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() - - expect(tokens[0].token).toBe(Token.group) - expect(tokens[0].value).toBe('(') - - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('word') - - expect(tokens[2].token).toBe(Token.whitespace) - expect(tokens[2].value).toBe(' ') - - expect(tokens[3].token).toBe(Token.operator) - expect(tokens[3].value).toBe('and') - - expect(tokens[4].token).toBe(Token.whitespace) - expect(tokens[4].value).toBe(' ') - - expect(tokens[5].token).toBe(Token.word) - expect(tokens[5].value).toBe('word') - - expect(tokens[6].token).toBe(Token.group) - expect(tokens[6].value).toBe(')') - }) - test('should parse & separator', () => { - const reader = new StringReader('(word & word)') - const tokenizer = new Tokenizer(reader) - const tokens = tokenizer.read() - - expect(tokens[0].token).toBe(Token.group) - expect(tokens[0].value).toBe('(') - - expect(tokens[1].token).toBe(Token.word) - expect(tokens[1].value).toBe('word') - - expect(tokens[2].token).toBe(Token.whitespace) - expect(tokens[2].value).toBe(' ') - - expect(tokens[3].token).toBe(Token.operator) - expect(tokens[3].value).toBe('&') - - expect(tokens[4].token).toBe(Token.whitespace) - expect(tokens[4].value).toBe(' ') - - expect(tokens[5].token).toBe(Token.word) - expect(tokens[5].value).toBe('word') - - expect(tokens[6].token).toBe(Token.group) - expect(tokens[6].value).toBe(')') - }) - }) + expect(tokens[2].token).toBe(LexerToken.quote) + expect(tokens[2].value).toBe('"') + }) +}) + +describe('Word', () => { + test('Should tokenize single', () => { + const reader = new StringReader('word') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('word') + }) + + test('Should tokenize multi', () => { + const reader = new StringReader('one two three 123') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('one') + + expect(tokens[1].token).toBe(LexerToken.whitespace) + expect(tokens[1].value).toBe(' ') + + expect(tokens[2].token).toBe(LexerToken.word) + expect(tokens[2].value).toBe('two') + + expect(tokens[3].token).toBe(LexerToken.whitespace) + expect(tokens[3].value).toBe(' ') + + expect(tokens[4].token).toBe(LexerToken.word) + expect(tokens[4].value).toBe('three') + + expect(tokens[5].token).toBe(LexerToken.whitespace) + expect(tokens[5].value).toBe(' ') + + expect(tokens[6].token).toBe(LexerToken.word) + expect(tokens[6].value).toBe('123') + }) +}) + +describe('Groups', () => { + test('Should tokenize single-word group', () => { + const reader = new StringReader('(word)') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.group) + expect(tokens[0].value).toBe('(') + + expect(tokens[1].token).toBe(LexerToken.word) + expect(tokens[1].value).toBe('word') + + expect(tokens[2].token).toBe(LexerToken.group) + expect(tokens[2].value).toBe(')') + }) +}) + +describe('Logical operator OR', () => { + test('should parse OR separator', () => { + const reader = new StringReader('word OR word') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('word') + + expect(tokens[1].token).toBe(LexerToken.whitespace) + expect(tokens[1].value).toBe(' ') + + expect(tokens[2].token).toBe(LexerToken.operator) + expect(tokens[2].value).toBe('or') + + expect(tokens[3].token).toBe(LexerToken.whitespace) + expect(tokens[3].value).toBe(' ') + + expect(tokens[4].token).toBe(LexerToken.word) + expect(tokens[4].value).toBe('word') + }) + + test('should not parse OR separator mid-word', () => { + const reader = new StringReader('wordORword') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('wordORword') + }) + + test('should parse | separator', () => { + const reader = new StringReader('word | word') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('word') + + expect(tokens[1].token).toBe(LexerToken.whitespace) + expect(tokens[1].value).toBe(' ') + + expect(tokens[2].token).toBe(LexerToken.operator) + expect(tokens[2].value).toBe('|') + + expect(tokens[3].token).toBe(LexerToken.whitespace) + expect(tokens[3].value).toBe(' ') + + expect(tokens[4].token).toBe(LexerToken.word) + expect(tokens[4].value).toBe('word') + }) +}) + +describe('Logical operator AND', () => { + test('should parse AND separator', () => { + const reader = new StringReader('word AND word') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('word') + + expect(tokens[1].token).toBe(LexerToken.whitespace) + expect(tokens[1].value).toBe(' ') + + expect(tokens[2].token).toBe(LexerToken.operator) + expect(tokens[2].value).toBe('and') + + expect(tokens[3].token).toBe(LexerToken.whitespace) + expect(tokens[3].value).toBe(' ') + + expect(tokens[4].token).toBe(LexerToken.word) + expect(tokens[4].value).toBe('word') + }) + + test('should not parse AND separator mid-word', () => { + const reader = new StringReader('wordANDword') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('wordANDword') + }) + + test('should parse & separator', () => { + const reader = new StringReader('word & word') + const tokenizer = new Lexer(reader) + const tokens = tokenizer.parse() + + expect(tokens[0].token).toBe(LexerToken.word) + expect(tokens[0].value).toBe('word') + + expect(tokens[1].token).toBe(LexerToken.whitespace) + expect(tokens[1].value).toBe(' ') + + expect(tokens[2].token).toBe(LexerToken.operator) + expect(tokens[2].value).toBe('&') + + expect(tokens[3].token).toBe(LexerToken.whitespace) + expect(tokens[3].value).toBe(' ') + + expect(tokens[4].token).toBe(LexerToken.word) + expect(tokens[4].value).toBe('word') }) }) diff --git a/src/parser.ts b/src/parser.ts index e69de29..4c5e4e9 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -0,0 +1,145 @@ +import { InputReader } from './reader' +import { ILexer, LexerToken, LexerTokenValue } from './tokenizer' + +export interface ParserTokenValue { + type: 'word' | 'operator' | 'phrase' | 'group' +} + +export interface Phrase extends ParserTokenValue { + type: 'phrase' + value: string + quote: "'" | '"' +} + +export interface Word extends ParserTokenValue { + type: 'word' + value: string +} + +export interface Operator extends ParserTokenValue { + type: 'operator' + value: string + left: any + right: any +} + +export interface Group extends ParserTokenValue { + type: 'group' + children: any[] +} + +export type ParserToken = Phrase | Word | Operator | Group + +export abstract class IParser { + public lexer: ILexer + public abstract index: number + + constructor(lexer: ILexer) { + this.lexer = lexer + } + + public abstract peek(): ParserToken | null + public abstract consume(): ParserToken | null + public abstract parse(): ParserToken[] + public abstract isEOF(): boolean +} + +export enum ParserState { + default, +} + +export class Parser extends IParser { + index = 0 + state = ParserState.default + stack: ParserToken[] = [] + + constructor(lexer: ILexer) { + super(lexer) + this.state = ParserState.default + } + + public peek(): ParserToken | null { + if (this.isEOF()) { + return null + } + if (this.index < this.stack.length) { + return this.stack[this.index] + } + + const beforePeekIndex = this.lexer.index + const value = this.readNextToken() + if (value) { + this.stack.push(value) + } + this.lexer.setIndex(beforePeekIndex) + return value + } + + public consume(): ParserToken | null { + if (this.isEOF()) { + return null + } + if (this.index < this.stack.length) { + this.index++ + return this.stack[this.index] + } + + const token = this.readNextToken() + this.index++ + if (token) { + this.stack.push(token) + } + return token + } + + public parse(): ParserToken[] { + const tokens: ParserToken[] = [] + while (!this.isEOF()) { + const token = this.consume() + if (!token) { + return tokens + } + tokens.push(token) + } + return tokens + } + + public isEOF(): boolean { + return this.lexer.isEOF() + } + + private readNextToken(): ParserToken | null { + const token = this.lexer.consume() + let nextToken = this.lexer.peek() + // TODO reset lexer index? + while (nextToken?.token === 'whitespace') { + this.lexer.consume() + nextToken = this.lexer.peek() + } + switch (this.state) { + case ParserState.default: + if (nextToken.token === 'group') { + this.index++ + return this.readNextToken() + } + switch (token.token) { + case LexerToken.word: + return { type: 'word', value: token.value } + case LexerToken.quote: + return { type: 'phrase', value: token.value, quote: token.value as '"' } + case LexerToken.operator: + return this.consumeOperator(token) + default: + return null + } + default: + throw new Error('Bad state') + } + } + + private consumeOperator(token: LexerTokenValue): ParserToken | null { + const left = this.stack[this.stack.length - 1] + const right = this.readNextToken() + return { type: 'operator', value: token.value, left, right } + } +} diff --git a/src/tokenizer.ts b/src/tokenizer.ts index c68ce2d..a6fd178 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -5,127 +5,186 @@ export enum TokenizerState { inPhrase, } -export enum Token { - // phrase = 'phrase', +export enum LexerToken { group = 'group', operator = 'operator', word = 'word', quote = 'quote', whitespace = 'whitespace', - // eof = 'eof', } -export interface TokenValue { +export interface LexerTokenValue { value: string - token: Token + token: LexerToken } -export class Tokenizer implements InputReader { +export abstract class ILexer { + public abstract peek(): LexerTokenValue + public abstract consume(): LexerTokenValue + public abstract isEOF(): boolean + public abstract parse(): LexerTokenValue[] + public abstract index: number + public abstract setIndex(n: number): void +} + +export class Lexer implements ILexer { reader: InputReader state: TokenizerState = TokenizerState.default quoteTerminator: string | null = null index: number = 0 peekIndex: number = 0 + afterWhitespace: boolean = false constructor(reader: InputReader) { this.reader = reader } - public isEOF(): boolean { - return this.reader.isEOF() + // TODO implement peek by (n)? + public peek(): LexerTokenValue { + // save state before peeking + const beforePeekState = this.state + const beforePeekIndex = this.reader.index + const beforePeekWhiteSpace = this.afterWhitespace + + const value = this.readNextToken() + + // restore state after peeking + this.state = beforePeekState + this.reader.setIndex(beforePeekIndex - 1) + this.afterWhitespace = beforePeekWhiteSpace + + return value + } + + // TODO implement consume by (n)? + public consume(): LexerTokenValue { + const token = this.readNextToken() + this.index++ + return token + } + + public parse(): LexerTokenValue[] { + const tokens: LexerTokenValue[] = [] + while (!this.isEOF()) { + tokens.push(this.consume()) + } + return tokens } public setIndex(n: number): void { this.index = n } - private readNextToken(): TokenValue { + public isEOF(): boolean { + return this.reader.isEOF() + } + + private readNextToken(): LexerTokenValue { const nextChar = this.reader.peek() switch (this.state) { case TokenizerState.default: + // whitespace if (this.isWhitespace(nextChar)) { + this.afterWhitespace = true return { value: this.reader.consume(), - token: Token.whitespace, + token: LexerToken.whitespace, } } + + // quote if (`"'`.includes(nextChar)) { this.state = TokenizerState.inPhrase this.quoteTerminator = nextChar return this.consumeQuote() } - if (this.isAlphanumeric(nextChar.charCodeAt(0))) { - if (this.confirmExactWord('OR')) { + // other words + if (this.isAlphanumeric(nextChar)) { + // guard OR + if (this.afterWhitespace && this.peekExact('OR')) { return this.consumeOr() } - if (this.confirmExactWord('AND')) { + // guard AND + if (this.afterWhitespace && this.peekExact('AND')) { return this.consumeAnd() } + + // neither, consume normally return this.consumeWord() } + // or operator if (nextChar === '|') { return this.consumeOr() } + // and operator if (nextChar === '&') { return this.consumeAnd() } + // group if (nextChar === '(' || nextChar === ')') { return this.consumeGroup() } + + // other, consume normally return this.consumeWord() case TokenizerState.inPhrase: + this.afterWhitespace = false + + // in phrase mode, consume until quote terminator if (nextChar === this.quoteTerminator) { this.state = TokenizerState.default return this.consumeQuote() } + + // otherwise consume any character return this.consumePhrase() default: throw new Error('bad state') } } - consumeQuote(): TokenValue { + private consumeQuote(): LexerTokenValue { return { value: this.reader.consume(), - token: Token.quote, + token: LexerToken.quote, } } - consumeAnd(): TokenValue { + private consumeAnd(): LexerTokenValue { let value = '' - if (this.confirmExactWord('AND')) { - this.consumeExactWord('AND') + if (this.peekExact('AND')) { + this.consumeExact('AND') value = 'and' - } else if (this.confirmExactWord('&')) { - this.consumeExactWord('&') + } else if (this.peekExact('&')) { + this.consumeExact('&') value = '&' } return { value, - token: Token.operator, + token: LexerToken.operator, } } - consumeOr(): TokenValue { + private consumeOr(): LexerTokenValue { let value = '' - if (this.confirmExactWord('OR')) { - this.consumeExactWord('OR') + if (this.peekExact('OR')) { + this.consumeExact('OR') value = 'or' - } else if (this.confirmExactWord('|')) { - this.consumeExactWord('|') + } else if (this.peekExact('|')) { + this.consumeExact('|') value = '|' } return { value, - token: Token.operator, + token: LexerToken.operator, } } - confirmExactWord(word: string) { + private peekExact(word: string) { let nextChar = this.reader.peek() for (let i = 0; i < word.length; i++) { if (nextChar !== word[i]) { @@ -136,28 +195,28 @@ export class Tokenizer implements InputReader { return true } - consumeExactWord(word: string) { - if (this.confirmExactWord(word)) { - this.consumeReader(word.length) + private consumeExact(word: string) { + if (this.peekExact(word)) { + this.consumeLength(word.length) } else { throw new Error("Can't find exact word: " + word) } } - consumeReader(times = 1) { + private consumeLength(times = 1) { for (let i = 0; i < times; i++) { this.reader.consume() } } - private consumeGroup(): TokenValue { + private consumeGroup(): LexerTokenValue { return { value: this.reader.consume(), - token: Token.group, + token: LexerToken.group, } } - private consumePhrase(): TokenValue { + private consumePhrase(): LexerTokenValue { let nextChar = this.reader.consume() let value = nextChar while ((nextChar = this.reader.peek()) && nextChar !== this.quoteTerminator) { @@ -165,58 +224,33 @@ export class Tokenizer implements InputReader { } return { value, - token: Token.word, + token: LexerToken.word, } } - private consumeWord(): TokenValue { + private consumeWord(): LexerTokenValue { let value = this.consumeWholeWord() return { value, - token: Token.word, + token: LexerToken.word, } } private consumeWholeWord() { // let nextChar = this.reader.peek() let value = '' - while (this.isAlphanumeric(this.reader.peek().charCodeAt(0))) { + while (this.isAlphanumeric(this.reader.peek())) { value += this.reader.consume() } return value } - public peek(): TokenValue { - const beforePeekState = this.state - const beforePeekIndex = this.reader.index - // this.peekIndex = this.currentIndex + n - const value = this.readNextToken() - this.state = beforePeekState - this.reader.setIndex(beforePeekIndex) - return value - // return this.readNextToken() + private isWhitespace(char: string) { + return ' \t\n\r'.includes(char) } - public consume(): TokenValue { - const token = this.readNextToken() - // this.reader.consume() - this.index++ - return token - } - - public read(): TokenValue[] { - const tokens: TokenValue[] = [] - while (!this.isEOF()) { - tokens.push(this.consume()) - } - return tokens - } - - private isWhitespace(nextChar: string) { - return ' \t\n\r'.includes(nextChar) - } - - private isAlphanumeric(charCode: number): boolean { + private isAlphanumeric(char: string): boolean { + const charCode = char.charCodeAt(0) return ( (charCode >= 48 && charCode <= 57) || (charCode >= 65 && charCode <= 90) ||