feat(parser): wip

This commit is contained in:
Chen Asraf
2022-08-14 20:26:47 +03:00
parent 4ffd52714d
commit 48832e23ea
4 changed files with 453 additions and 258 deletions

23
__tests__/parser_test.ts Normal file
View File

@@ -0,0 +1,23 @@
import { Operator, Parser, Word } from '../src/parser'
import { StringReader } from '../src/reader'
import { Lexer } from '../src/tokenizer'
// A lone bare word should come back as one `word` node carrying the same text.
test('should parse single word', () => {
  const parser = new Parser(new Lexer(new StringReader('word')))
  const parsed = parser.parse()
  const word = parsed[0] as Word
  expect(word.type).toBe('word')
  expect(word.value).toBe('word')
})
// `word OR "phrase"` is expected to yield an operator node first, with the bare
// word as its left operand and the quoted phrase (quotes included) on the right.
test('should parse OR operator', () => {
  const parser = new Parser(new Lexer(new StringReader('word OR "phrase"')))
  const parsed = parser.parse()
  const operator = parsed[0] as Operator
  expect(operator.type).toBe('operator')
  expect(operator.left.value).toBe('word')
  expect(operator.right.value).toBe('"phrase"')
})

View File

@@ -1,202 +1,195 @@
import { StringReader } from '../src/reader'
import { Token, Tokenizer } from '../src/tokenizer'
import { LexerToken, Lexer } from '../src/tokenizer'
describe('Tokenizer', () => {
describe('Phrase', () => {
test('Should tokenize single', () => {
const reader = new StringReader('"phrase"')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
describe('Phrase', () => {
test('Should tokenize single', () => {
const reader = new StringReader('"phrase"')
const tokenizer = new Lexer(reader)
const tokens = tokenizer.parse()
expect(tokens[0].token).toBe(Token.quote)
expect(tokens[0].value).toBe('"')
expect(tokens[0].token).toBe(LexerToken.quote)
expect(tokens[0].value).toBe('"')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('phrase')
expect(tokens[1].token).toBe(LexerToken.word)
expect(tokens[1].value).toBe('phrase')
expect(tokens[2].token).toBe(Token.quote)
expect(tokens[2].value).toBe('"')
})
test('Should tokenize multi', () => {
const reader = new StringReader('"one two three 123 !@#"')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[0].token).toBe(Token.quote)
expect(tokens[0].value).toBe('"')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('one two three 123 !@#')
expect(tokens[2].token).toBe(Token.quote)
expect(tokens[2].value).toBe('"')
})
expect(tokens[2].token).toBe(LexerToken.quote)
expect(tokens[2].value).toBe('"')
})
describe('Word', () => {
test('Should tokenize single', () => {
const reader = new StringReader('word')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
test('Should tokenize multi', () => {
const reader = new StringReader('"one two three 123 !@#"')
const tokenizer = new Lexer(reader)
const tokens = tokenizer.parse()
expect(tokens[0].token).toBe(Token.word)
expect(tokens[0].value).toBe('word')
})
expect(tokens[0].token).toBe(LexerToken.quote)
expect(tokens[0].value).toBe('"')
test('Should tokenize multi', () => {
const reader = new StringReader('one two three 123')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[1].token).toBe(LexerToken.word)
expect(tokens[1].value).toBe('one two three 123 !@#')
expect(tokens[0].token).toBe(Token.word)
expect(tokens[0].value).toBe('one')
expect(tokens[1].token).toBe(Token.whitespace)
expect(tokens[1].value).toBe(' ')
expect(tokens[2].token).toBe(Token.word)
expect(tokens[2].value).toBe('two')
expect(tokens[3].token).toBe(Token.whitespace)
expect(tokens[3].value).toBe(' ')
expect(tokens[4].token).toBe(Token.word)
expect(tokens[4].value).toBe('three')
expect(tokens[5].token).toBe(Token.whitespace)
expect(tokens[5].value).toBe(' ')
expect(tokens[6].token).toBe(Token.word)
expect(tokens[6].value).toBe('123')
})
})
describe('Groups', () => {
test('Should tokenize single-word group', () => {
const reader = new StringReader('(word)')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[0].token).toBe(Token.group)
expect(tokens[0].value).toBe('(')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('word')
expect(tokens[2].token).toBe(Token.group)
expect(tokens[2].value).toBe(')')
})
describe('logical operator OR group', () => {
test('should parse OR separator', () => {
const reader = new StringReader('(word OR word)')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[0].token).toBe(Token.group)
expect(tokens[0].value).toBe('(')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('word')
expect(tokens[2].token).toBe(Token.whitespace)
expect(tokens[2].value).toBe(' ')
expect(tokens[3].token).toBe(Token.operator)
expect(tokens[3].value).toBe('or')
expect(tokens[4].token).toBe(Token.whitespace)
expect(tokens[4].value).toBe(' ')
expect(tokens[5].token).toBe(Token.word)
expect(tokens[5].value).toBe('word')
expect(tokens[6].token).toBe(Token.group)
expect(tokens[6].value).toBe(')')
})
test('should parse | separator', () => {
const reader = new StringReader('(word | word)')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[0].token).toBe(Token.group)
expect(tokens[0].value).toBe('(')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('word')
expect(tokens[2].token).toBe(Token.whitespace)
expect(tokens[2].value).toBe(' ')
expect(tokens[3].token).toBe(Token.operator)
expect(tokens[3].value).toBe('|')
expect(tokens[4].token).toBe(Token.whitespace)
expect(tokens[4].value).toBe(' ')
expect(tokens[5].token).toBe(Token.word)
expect(tokens[5].value).toBe('word')
expect(tokens[6].token).toBe(Token.group)
expect(tokens[6].value).toBe(')')
})
})
describe('logical operator AND group', () => {
test('should parse AND separator', () => {
const reader = new StringReader('(word AND word)')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[0].token).toBe(Token.group)
expect(tokens[0].value).toBe('(')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('word')
expect(tokens[2].token).toBe(Token.whitespace)
expect(tokens[2].value).toBe(' ')
expect(tokens[3].token).toBe(Token.operator)
expect(tokens[3].value).toBe('and')
expect(tokens[4].token).toBe(Token.whitespace)
expect(tokens[4].value).toBe(' ')
expect(tokens[5].token).toBe(Token.word)
expect(tokens[5].value).toBe('word')
expect(tokens[6].token).toBe(Token.group)
expect(tokens[6].value).toBe(')')
})
test('should parse & separator', () => {
const reader = new StringReader('(word & word)')
const tokenizer = new Tokenizer(reader)
const tokens = tokenizer.read()
expect(tokens[0].token).toBe(Token.group)
expect(tokens[0].value).toBe('(')
expect(tokens[1].token).toBe(Token.word)
expect(tokens[1].value).toBe('word')
expect(tokens[2].token).toBe(Token.whitespace)
expect(tokens[2].value).toBe(' ')
expect(tokens[3].token).toBe(Token.operator)
expect(tokens[3].value).toBe('&')
expect(tokens[4].token).toBe(Token.whitespace)
expect(tokens[4].value).toBe(' ')
expect(tokens[5].token).toBe(Token.word)
expect(tokens[5].value).toBe('word')
expect(tokens[6].token).toBe(Token.group)
expect(tokens[6].value).toBe(')')
})
})
expect(tokens[2].token).toBe(LexerToken.quote)
expect(tokens[2].value).toBe('"')
})
})
describe('Word', () => {
  // One bare word lexes to a leading `word` token with the same text.
  test('Should tokenize single', () => {
    const tokens = new Lexer(new StringReader('word')).parse()

    expect(tokens[0].token).toBe(LexerToken.word)
    expect(tokens[0].value).toBe('word')
  })

  // Space-separated words alternate `word` and single-space `whitespace` tokens.
  test('Should tokenize multi', () => {
    const tokens = new Lexer(new StringReader('one two three 123')).parse()
    const expected: Array<[LexerToken, string]> = [
      [LexerToken.word, 'one'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, 'two'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, 'three'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, '123'],
    ]

    // Same checks, in the same order, as spelling each pair out by hand.
    expected.forEach(([token, value], position) => {
      expect(tokens[position].token).toBe(token)
      expect(tokens[position].value).toBe(value)
    })
  })
})
describe('Groups', () => {
  // Both parentheses lex as `group` tokens, with the word between them.
  test('Should tokenize single-word group', () => {
    const tokens = new Lexer(new StringReader('(word)')).parse()
    const expected: Array<[LexerToken, string]> = [
      [LexerToken.group, '('],
      [LexerToken.word, 'word'],
      [LexerToken.group, ')'],
    ]

    expected.forEach(([token, value], position) => {
      expect(tokens[position].token).toBe(token)
      expect(tokens[position].value).toBe(value)
    })
  })
})
describe('Logical operator OR', () => {
  // Lexes `input` and checks its leading tokens, in order, against `expected`.
  const expectLeadingTokens = (
    input: string,
    expected: Array<[LexerToken, string]>,
  ): void => {
    const tokens = new Lexer(new StringReader(input)).parse()
    expected.forEach(([token, value], position) => {
      expect(tokens[position].token).toBe(token)
      expect(tokens[position].value).toBe(value)
    })
  }

  // The keyword form is reported with the lower-cased value 'or'.
  test('should parse OR separator', () => {
    expectLeadingTokens('word OR word', [
      [LexerToken.word, 'word'],
      [LexerToken.whitespace, ' '],
      [LexerToken.operator, 'or'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, 'word'],
    ])
  })

  // "OR" embedded in a longer word must stay part of that word.
  test('should not parse OR separator mid-word', () => {
    expectLeadingTokens('wordORword', [[LexerToken.word, 'wordORword']])
  })

  // The symbol form keeps '|' as its value.
  test('should parse | separator', () => {
    expectLeadingTokens('word | word', [
      [LexerToken.word, 'word'],
      [LexerToken.whitespace, ' '],
      [LexerToken.operator, '|'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, 'word'],
    ])
  })
})
describe('Logical operator AND', () => {
  // Lexes `input` and checks its leading tokens, in order, against `expected`.
  const expectLeadingTokens = (
    input: string,
    expected: Array<[LexerToken, string]>,
  ): void => {
    const tokens = new Lexer(new StringReader(input)).parse()
    expected.forEach(([token, value], position) => {
      expect(tokens[position].token).toBe(token)
      expect(tokens[position].value).toBe(value)
    })
  }

  // The keyword form is reported with the lower-cased value 'and'.
  test('should parse AND separator', () => {
    expectLeadingTokens('word AND word', [
      [LexerToken.word, 'word'],
      [LexerToken.whitespace, ' '],
      [LexerToken.operator, 'and'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, 'word'],
    ])
  })

  // "AND" embedded in a longer word must stay part of that word.
  test('should not parse AND separator mid-word', () => {
    expectLeadingTokens('wordANDword', [[LexerToken.word, 'wordANDword']])
  })

  // The symbol form keeps '&' as its value.
  test('should parse & separator', () => {
    expectLeadingTokens('word & word', [
      [LexerToken.word, 'word'],
      [LexerToken.whitespace, ' '],
      [LexerToken.operator, '&'],
      [LexerToken.whitespace, ' '],
      [LexerToken.word, 'word'],
    ])
  })
})

View File

@@ -0,0 +1,145 @@
import { InputReader } from './reader'
import { ILexer, LexerToken, LexerTokenValue } from './tokenizer'
/**
 * Base shape shared by every parser token: a `type` tag naming the kind of
 * token. Each concrete token interface narrows `type` to a single literal.
 */
export interface ParserTokenValue {
type: 'word' | 'operator' | 'phrase' | 'group'
}
/**
 * Parser token for a quoted phrase.
 *
 * NOTE(review): `value` is presumably meant to be the phrase text, but the
 * parser in this file currently fills both `value` and `quote` from the quote
 * lexer token — confirm the intended contents.
 */
export interface Phrase extends ParserTokenValue {
type: 'phrase'
value: string
// Quote character, either single or double.
quote: "'" | '"'
}
/** Parser token for a word; the parser copies `value` from the lexer's word token. */
export interface Word extends ParserTokenValue {
type: 'word'
value: string
}
/**
 * Parser token for a binary operator and its two operands.
 *
 * `value` is the operator text taken from the lexer token (the lexer emits
 * 'and', '&', 'or' or '|').
 */
export interface Operator extends ParserTokenValue {
type: 'operator'
value: string
// Operands are untyped for now. The parser sets `left` to the last token on its
// stack and `right` to the next token it reads; either may be missing.
left: any
right: any
}
/**
 * Parser token for a group holding child tokens.
 *
 * NOTE(review): nothing in the visible parser code constructs a `Group` yet;
 * presumably it will represent a parenthesised sub-expression — confirm.
 */
export interface Group extends ParserTokenValue {
type: 'group'
// Untyped for now.
children: any[]
}
/** Any token the parser can produce; discriminate on the `type` field. */
export type ParserToken = Phrase | Word | Operator | Group
/**
 * Base class for parsers that read from an {@link ILexer}.
 *
 * It only stores the lexer; subclasses provide the cursor (`index`), the
 * end-of-input test, single-token look-ahead and consumption, and whole-input
 * parsing.
 */
export abstract class IParser {
  public abstract index: number

  /** @param lexer token source; kept on the public `lexer` property. */
  constructor(public lexer: ILexer) {}

  public abstract isEOF(): boolean
  public abstract peek(): ParserToken | null
  public abstract consume(): ParserToken | null
  public abstract parse(): ParserToken[]
}
/** States of the parser's state machine; `default` is the only one so far. */
export enum ParserState {
default,
}
/**
 * Turns the flat token stream of an {@link ILexer} into parser tokens
 * (words, phrases and operators).
 *
 * `stack` holds every parser token produced so far, in order, and doubles as
 * the look-ahead buffer: `peek()` appends to it without moving `index`, and
 * `consume()` hands out `stack[index]` before reading anything new from the
 * lexer.
 */
export class Parser extends IParser {
  /** Cursor into `stack`: position of the next token `consume()` will return. */
  index = 0
  state = ParserState.default
  stack: ParserToken[] = []

  /**
   * Returns the next parser token without advancing `index`.
   *
   * @returns the upcoming token, or `null` when nothing is buffered and the
   *   lexer is exhausted (or the next lexer token is not parseable).
   */
  public peek(): ParserToken | null {
    // Serve buffered look-ahead first: the lexer may already be at EOF because
    // reading that very token drained it.
    if (this.index < this.stack.length) {
      return this.stack[this.index]
    }
    if (this.isEOF()) {
      return null
    }

    const beforePeekIndex = this.lexer.index
    const value = this.readNextToken()
    if (value) {
      this.stack.push(value)
    }
    // NOTE(review): the token is kept in `stack`, so this only restores the
    // lexer's counter. If an ILexer implementation also rewinds its input in
    // `setIndex`, the buffered token would be read a second time — confirm.
    this.lexer.setIndex(beforePeekIndex)
    return value
  }

  /**
   * Returns the next parser token and advances past it.
   *
   * @returns the token, or `null` when nothing is buffered and the lexer is
   *   exhausted (or the next lexer token is not parseable).
   */
  public consume(): ParserToken | null {
    if (this.index < this.stack.length) {
      // Return the token at the cursor, then step over it. Incrementing first
      // would skip the buffered token and could read past the end of `stack`.
      return this.stack[this.index++]
    }
    if (this.isEOF()) {
      return null
    }

    const token = this.readNextToken()
    if (token) {
      this.stack.push(token)
      // Advance only when something was stored, so `index` never runs ahead
      // of `stack`.
      this.index++
    }
    return token
  }

  /**
   * Consumes tokens until `consume()` yields `null`.
   *
   * @returns the consumed tokens in order.
   */
  public parse(): ParserToken[] {
    const tokens: ParserToken[] = []
    let token = this.consume()
    while (token) {
      tokens.push(token)
      token = this.consume()
    }
    return tokens
  }

  /** @returns whether the underlying lexer has reached the end of its input. */
  public isEOF(): boolean {
    return this.lexer.isEOF()
  }

  /**
   * Reads one lexer token, skips the whitespace that follows it, and maps it
   * to a parser token.
   *
   * @returns the parser token, or `null` at end of input or for lexer tokens
   *   that have no mapping yet (whitespace, group).
   * @throws Error when `state` is not a known {@link ParserState}.
   */
  private readNextToken(): ParserToken | null {
    // Reached via recursion and `consumeOperator` as well, so guard here
    // rather than relying on the callers' EOF checks.
    if (this.lexer.isEOF()) {
      return null
    }
    const token = this.lexer.consume()

    // TODO reset lexer index?
    let nextToken = this.peekLexerToken()
    while (nextToken?.token === LexerToken.whitespace) {
      this.lexer.consume()
      nextToken = this.peekLexerToken()
    }

    switch (this.state) {
      case ParserState.default:
        // NOTE(review): when the following token is a group paren, the current
        // token is dropped and reading restarts at the paren — group parsing
        // looks unfinished. The dropped token is never pushed onto `stack`,
        // so `index` is left untouched here.
        if (nextToken?.token === LexerToken.group) {
          return this.readNextToken()
        }
        switch (token.token) {
          case LexerToken.word:
            return { type: 'word', value: token.value }
          case LexerToken.quote:
            return { type: 'phrase', value: token.value, quote: token.value as '"' }
          case LexerToken.operator:
            return this.consumeOperator(token)
          default:
            return null
        }
      default:
        throw new Error('Bad state')
    }
  }

  /** Looks ahead one lexer token, or returns `null` without touching the lexer at end of input. */
  private peekLexerToken(): LexerTokenValue | null {
    return this.lexer.isEOF() ? null : this.lexer.peek()
  }

  /**
   * Builds an operator node from an operator lexer token.
   *
   * `left` is the most recently stacked token (`undefined` when the stack is
   * empty) and `right` is the next token read (`null` at end of input).
   * NOTE(review): the left operand is not removed from `stack`, so it also
   * stays in the output as a token of its own.
   */
  private consumeOperator(token: LexerTokenValue): ParserToken | null {
    const left = this.stack[this.stack.length - 1]
    const right = this.readNextToken()
    return { type: 'operator', value: token.value, left, right }
  }
}

View File

@@ -5,127 +5,186 @@ export enum TokenizerState {
inPhrase,
}
export enum Token {
// phrase = 'phrase',
export enum LexerToken {
group = 'group',
operator = 'operator',
word = 'word',
quote = 'quote',
whitespace = 'whitespace',
// eof = 'eof',
}
export interface TokenValue {
export interface LexerTokenValue {
value: string
token: Token
token: LexerToken
}
export class Tokenizer implements InputReader<TokenValue> {
/**
 * Contract a lexer must satisfy: single-token look-ahead (`peek`),
 * single-token reads (`consume`), whole-input lexing (`parse`), an
 * end-of-input test, and an `index` that can be reassigned through `setIndex`.
 */
export abstract class ILexer {
public abstract peek(): LexerTokenValue
public abstract consume(): LexerTokenValue
public abstract isEOF(): boolean
public abstract parse(): LexerTokenValue[]
public abstract index: number
public abstract setIndex(n: number): void
}
export class Lexer implements ILexer {
reader: InputReader<string>
state: TokenizerState = TokenizerState.default
quoteTerminator: string | null = null
index: number = 0
peekIndex: number = 0
afterWhitespace: boolean = false
constructor(reader: InputReader<string>) {
this.reader = reader
}
public isEOF(): boolean {
return this.reader.isEOF()
// TODO implement peek by (n)?
/**
 * Reads the next token as a look-ahead: the lexer state, the reader position
 * and the `afterWhitespace` flag are captured before the read and reassigned
 * afterwards.
 *
 * NOTE(review): `quoteTerminator` is not captured, although reading a token
 * can assign it when a quote is encountered — confirm that is harmless.
 */
public peek(): LexerTokenValue {
// save state before peeking
const beforePeekState = this.state
const beforePeekIndex = this.reader.index
const beforePeekWhiteSpace = this.afterWhitespace
const value = this.readNextToken()
// restore state after peeking
this.state = beforePeekState
// NOTE(review): the reader is set to one position before the captured index,
// not to the captured index itself. Whether that is correct depends on how
// the reader defines `index` / `setIndex` — confirm against the reader.
this.reader.setIndex(beforePeekIndex - 1)
this.afterWhitespace = beforePeekWhiteSpace
return value
}
// TODO implement consume by (n)?
/** Reads the next token for real and bumps the lexer's `index` counter by one. */
public consume(): LexerTokenValue {
  const next = this.readNextToken()
  this.index += 1
  return next
}
/** Consumes tokens until `isEOF()` reports true and returns them in order. */
public parse(): LexerTokenValue[] {
  const collected: LexerTokenValue[] = []
  while (true) {
    if (this.isEOF()) {
      return collected
    }
    collected.push(this.consume())
  }
}
/**
 * Overwrites the lexer's `index` counter with `n`.
 * Only the counter changes; the reader's position is not touched here.
 */
public setIndex(n: number): void {
this.index = n
}
private readNextToken(): TokenValue {
/** Reports end of input by delegating to the underlying reader's `isEOF()`. */
public isEOF(): boolean {
return this.reader.isEOF()
}
private readNextToken(): LexerTokenValue {
const nextChar = this.reader.peek()
switch (this.state) {
case TokenizerState.default:
// whitespace
if (this.isWhitespace(nextChar)) {
this.afterWhitespace = true
return {
value: this.reader.consume(),
token: Token.whitespace,
token: LexerToken.whitespace,
}
}
// quote
if (`"'`.includes(nextChar)) {
this.state = TokenizerState.inPhrase
this.quoteTerminator = nextChar
return this.consumeQuote()
}
if (this.isAlphanumeric(nextChar.charCodeAt(0))) {
if (this.confirmExactWord('OR')) {
// other words
if (this.isAlphanumeric(nextChar)) {
// guard OR
if (this.afterWhitespace && this.peekExact('OR')) {
return this.consumeOr()
}
if (this.confirmExactWord('AND')) {
// guard AND
if (this.afterWhitespace && this.peekExact('AND')) {
return this.consumeAnd()
}
// neither, consume normally
return this.consumeWord()
}
// or operator
if (nextChar === '|') {
return this.consumeOr()
}
// and operator
if (nextChar === '&') {
return this.consumeAnd()
}
// group
if (nextChar === '(' || nextChar === ')') {
return this.consumeGroup()
}
// other, consume normally
return this.consumeWord()
case TokenizerState.inPhrase:
this.afterWhitespace = false
// in phrase mode, consume until quote terminator
if (nextChar === this.quoteTerminator) {
this.state = TokenizerState.default
return this.consumeQuote()
}
// otherwise consume any character
return this.consumePhrase()
default:
throw new Error('bad state')
}
}
consumeQuote(): TokenValue {
private consumeQuote(): LexerTokenValue {
return {
value: this.reader.consume(),
token: Token.quote,
token: LexerToken.quote,
}
}
consumeAnd(): TokenValue {
private consumeAnd(): LexerTokenValue {
let value = ''
if (this.confirmExactWord('AND')) {
this.consumeExactWord('AND')
if (this.peekExact('AND')) {
this.consumeExact('AND')
value = 'and'
} else if (this.confirmExactWord('&')) {
this.consumeExactWord('&')
} else if (this.peekExact('&')) {
this.consumeExact('&')
value = '&'
}
return {
value,
token: Token.operator,
token: LexerToken.operator,
}
}
consumeOr(): TokenValue {
private consumeOr(): LexerTokenValue {
let value = ''
if (this.confirmExactWord('OR')) {
this.consumeExactWord('OR')
if (this.peekExact('OR')) {
this.consumeExact('OR')
value = 'or'
} else if (this.confirmExactWord('|')) {
this.consumeExactWord('|')
} else if (this.peekExact('|')) {
this.consumeExact('|')
value = '|'
}
return {
value,
token: Token.operator,
token: LexerToken.operator,
}
}
confirmExactWord(word: string) {
private peekExact(word: string) {
let nextChar = this.reader.peek()
for (let i = 0; i < word.length; i++) {
if (nextChar !== word[i]) {
@@ -136,28 +195,28 @@ export class Tokenizer implements InputReader<TokenValue> {
return true
}
consumeExactWord(word: string) {
if (this.confirmExactWord(word)) {
this.consumeReader(word.length)
private consumeExact(word: string) {
if (this.peekExact(word)) {
this.consumeLength(word.length)
} else {
throw new Error("Can't find exact word: " + word)
}
}
consumeReader(times = 1) {
private consumeLength(times = 1) {
for (let i = 0; i < times; i++) {
this.reader.consume()
}
}
private consumeGroup(): TokenValue {
private consumeGroup(): LexerTokenValue {
return {
value: this.reader.consume(),
token: Token.group,
token: LexerToken.group,
}
}
private consumePhrase(): TokenValue {
private consumePhrase(): LexerTokenValue {
let nextChar = this.reader.consume()
let value = nextChar
while ((nextChar = this.reader.peek()) && nextChar !== this.quoteTerminator) {
@@ -165,58 +224,33 @@ export class Tokenizer implements InputReader<TokenValue> {
}
return {
value,
token: Token.word,
token: LexerToken.word,
}
}
private consumeWord(): TokenValue {
private consumeWord(): LexerTokenValue {
let value = this.consumeWholeWord()
return {
value,
token: Token.word,
token: LexerToken.word,
}
}
private consumeWholeWord() {
// let nextChar = this.reader.peek()
let value = ''
while (this.isAlphanumeric(this.reader.peek().charCodeAt(0))) {
while (this.isAlphanumeric(this.reader.peek())) {
value += this.reader.consume()
}
return value
}
public peek(): TokenValue {
const beforePeekState = this.state
const beforePeekIndex = this.reader.index
// this.peekIndex = this.currentIndex + n
const value = this.readNextToken()
this.state = beforePeekState
this.reader.setIndex(beforePeekIndex)
return value
// return this.readNextToken()
private isWhitespace(char: string) {
return ' \t\n\r'.includes(char)
}
public consume(): TokenValue {
const token = this.readNextToken()
// this.reader.consume()
this.index++
return token
}
public read(): TokenValue[] {
const tokens: TokenValue[] = []
while (!this.isEOF()) {
tokens.push(this.consume())
}
return tokens
}
private isWhitespace(nextChar: string) {
return ' \t\n\r'.includes(nextChar)
}
private isAlphanumeric(charCode: number): boolean {
private isAlphanumeric(char: string): boolean {
const charCode = char.charCodeAt(0)
return (
(charCode >= 48 && charCode <= 57) ||
(charCode >= 65 && charCode <= 90) ||