mirror of
https://github.com/chenasraf/search-ast-parser-js.git
synced 2026-05-17 17:48:09 +00:00
feat(parser): wip
This commit is contained in:
23
__tests__/parser_test.ts
Normal file
23
__tests__/parser_test.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import { Operator, Parser, Word } from '../src/parser'
|
||||
import { StringReader } from '../src/reader'
|
||||
import { Lexer } from '../src/tokenizer'
|
||||
|
||||
// A lone bare word should come back from the parser as a single `word` node
// whose value is the original text.
test('should parse single word', () => {
  const reader = new StringReader('word')
  const lexer = new Lexer(reader)
  const parser = new Parser(lexer)
  const tokens = parser.parse()

  const wordToken = tokens[0] as Word
  expect(wordToken.type).toBe('word')
  expect(wordToken.value).toBe('word')
})
|
||||
// `word OR "phrase"` is expected to yield an operator node as the first result,
// with the word on its left side and the phrase on its right side.
test('should parse OR operator', () => {
  const reader = new StringReader('word OR "phrase"')
  const lexer = new Lexer(reader)
  const parser = new Parser(lexer)
  const tokens = parser.parse()

  // The first node is asserted to be the operator, not a word.
  const operatorToken = tokens[0] as Operator
  expect(operatorToken.type).toBe('operator')
  expect(operatorToken.left.value).toBe('word')
  expect(operatorToken.right.value).toBe('"phrase"')
})
|
||||
@@ -1,202 +1,195 @@
|
||||
import { StringReader } from '../src/reader'
|
||||
import { Token, Tokenizer } from '../src/tokenizer'
|
||||
import { LexerToken, Lexer } from '../src/tokenizer'
|
||||
|
||||
describe('Tokenizer', () => {
|
||||
describe('Phrase', () => {
|
||||
test('Should tokenize single', () => {
|
||||
const reader = new StringReader('"phrase"')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
describe('Phrase', () => {
|
||||
test('Should tokenize single', () => {
|
||||
const reader = new StringReader('"phrase"')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.quote)
|
||||
expect(tokens[0].value).toBe('"')
|
||||
expect(tokens[0].token).toBe(LexerToken.quote)
|
||||
expect(tokens[0].value).toBe('"')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('phrase')
|
||||
expect(tokens[1].token).toBe(LexerToken.word)
|
||||
expect(tokens[1].value).toBe('phrase')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.quote)
|
||||
expect(tokens[2].value).toBe('"')
|
||||
})
|
||||
|
||||
test('Should tokenize multi', () => {
|
||||
const reader = new StringReader('"one two three 123 !@#"')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.quote)
|
||||
expect(tokens[0].value).toBe('"')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('one two three 123 !@#')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.quote)
|
||||
expect(tokens[2].value).toBe('"')
|
||||
})
|
||||
expect(tokens[2].token).toBe(LexerToken.quote)
|
||||
expect(tokens[2].value).toBe('"')
|
||||
})
|
||||
|
||||
describe('Word', () => {
|
||||
test('Should tokenize single', () => {
|
||||
const reader = new StringReader('word')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
test('Should tokenize multi', () => {
|
||||
const reader = new StringReader('"one two three 123 !@#"')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
})
|
||||
expect(tokens[0].token).toBe(LexerToken.quote)
|
||||
expect(tokens[0].value).toBe('"')
|
||||
|
||||
test('Should tokenize multi', () => {
|
||||
const reader = new StringReader('one two three 123')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
expect(tokens[1].token).toBe(LexerToken.word)
|
||||
expect(tokens[1].value).toBe('one two three 123 !@#')
|
||||
|
||||
expect(tokens[0].token).toBe(Token.word)
|
||||
expect(tokens[0].value).toBe('one')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.whitespace)
|
||||
expect(tokens[1].value).toBe(' ')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.word)
|
||||
expect(tokens[2].value).toBe('two')
|
||||
|
||||
expect(tokens[3].token).toBe(Token.whitespace)
|
||||
expect(tokens[3].value).toBe(' ')
|
||||
|
||||
expect(tokens[4].token).toBe(Token.word)
|
||||
expect(tokens[4].value).toBe('three')
|
||||
|
||||
expect(tokens[5].token).toBe(Token.whitespace)
|
||||
expect(tokens[5].value).toBe(' ')
|
||||
|
||||
expect(tokens[6].token).toBe(Token.word)
|
||||
expect(tokens[6].value).toBe('123')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Groups', () => {
|
||||
test('Should tokenize single-word group', () => {
|
||||
const reader = new StringReader('(word)')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('word')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.group)
|
||||
expect(tokens[2].value).toBe(')')
|
||||
})
|
||||
|
||||
describe('logical operator OR group', () => {
|
||||
test('should parse OR separator', () => {
|
||||
const reader = new StringReader('(word OR word)')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('word')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.whitespace)
|
||||
expect(tokens[2].value).toBe(' ')
|
||||
|
||||
expect(tokens[3].token).toBe(Token.operator)
|
||||
expect(tokens[3].value).toBe('or')
|
||||
|
||||
expect(tokens[4].token).toBe(Token.whitespace)
|
||||
expect(tokens[4].value).toBe(' ')
|
||||
|
||||
expect(tokens[5].token).toBe(Token.word)
|
||||
expect(tokens[5].value).toBe('word')
|
||||
|
||||
expect(tokens[6].token).toBe(Token.group)
|
||||
expect(tokens[6].value).toBe(')')
|
||||
})
|
||||
|
||||
test('should parse | separator', () => {
|
||||
const reader = new StringReader('(word | word)')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('word')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.whitespace)
|
||||
expect(tokens[2].value).toBe(' ')
|
||||
|
||||
expect(tokens[3].token).toBe(Token.operator)
|
||||
expect(tokens[3].value).toBe('|')
|
||||
|
||||
expect(tokens[4].token).toBe(Token.whitespace)
|
||||
expect(tokens[4].value).toBe(' ')
|
||||
|
||||
expect(tokens[5].token).toBe(Token.word)
|
||||
expect(tokens[5].value).toBe('word')
|
||||
|
||||
expect(tokens[6].token).toBe(Token.group)
|
||||
expect(tokens[6].value).toBe(')')
|
||||
})
|
||||
})
|
||||
|
||||
describe('logical operator AND group', () => {
|
||||
test('should parse AND separator', () => {
|
||||
const reader = new StringReader('(word AND word)')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('word')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.whitespace)
|
||||
expect(tokens[2].value).toBe(' ')
|
||||
|
||||
expect(tokens[3].token).toBe(Token.operator)
|
||||
expect(tokens[3].value).toBe('and')
|
||||
|
||||
expect(tokens[4].token).toBe(Token.whitespace)
|
||||
expect(tokens[4].value).toBe(' ')
|
||||
|
||||
expect(tokens[5].token).toBe(Token.word)
|
||||
expect(tokens[5].value).toBe('word')
|
||||
|
||||
expect(tokens[6].token).toBe(Token.group)
|
||||
expect(tokens[6].value).toBe(')')
|
||||
})
|
||||
test('should parse & separator', () => {
|
||||
const reader = new StringReader('(word & word)')
|
||||
const tokenizer = new Tokenizer(reader)
|
||||
const tokens = tokenizer.read()
|
||||
|
||||
expect(tokens[0].token).toBe(Token.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
|
||||
expect(tokens[1].token).toBe(Token.word)
|
||||
expect(tokens[1].value).toBe('word')
|
||||
|
||||
expect(tokens[2].token).toBe(Token.whitespace)
|
||||
expect(tokens[2].value).toBe(' ')
|
||||
|
||||
expect(tokens[3].token).toBe(Token.operator)
|
||||
expect(tokens[3].value).toBe('&')
|
||||
|
||||
expect(tokens[4].token).toBe(Token.whitespace)
|
||||
expect(tokens[4].value).toBe(' ')
|
||||
|
||||
expect(tokens[5].token).toBe(Token.word)
|
||||
expect(tokens[5].value).toBe('word')
|
||||
|
||||
expect(tokens[6].token).toBe(Token.group)
|
||||
expect(tokens[6].value).toBe(')')
|
||||
})
|
||||
})
|
||||
expect(tokens[2].token).toBe(LexerToken.quote)
|
||||
expect(tokens[2].value).toBe('"')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Word', () => {
|
||||
test('Should tokenize single', () => {
|
||||
const reader = new StringReader('word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
})
|
||||
|
||||
test('Should tokenize multi', () => {
|
||||
const reader = new StringReader('one two three 123')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('one')
|
||||
|
||||
expect(tokens[1].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[1].value).toBe(' ')
|
||||
|
||||
expect(tokens[2].token).toBe(LexerToken.word)
|
||||
expect(tokens[2].value).toBe('two')
|
||||
|
||||
expect(tokens[3].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[3].value).toBe(' ')
|
||||
|
||||
expect(tokens[4].token).toBe(LexerToken.word)
|
||||
expect(tokens[4].value).toBe('three')
|
||||
|
||||
expect(tokens[5].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[5].value).toBe(' ')
|
||||
|
||||
expect(tokens[6].token).toBe(LexerToken.word)
|
||||
expect(tokens[6].value).toBe('123')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Groups', () => {
|
||||
test('Should tokenize single-word group', () => {
|
||||
const reader = new StringReader('(word)')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
|
||||
expect(tokens[1].token).toBe(LexerToken.word)
|
||||
expect(tokens[1].value).toBe('word')
|
||||
|
||||
expect(tokens[2].token).toBe(LexerToken.group)
|
||||
expect(tokens[2].value).toBe(')')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Logical operator OR', () => {
|
||||
test('should parse OR separator', () => {
|
||||
const reader = new StringReader('word OR word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
|
||||
expect(tokens[1].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[1].value).toBe(' ')
|
||||
|
||||
expect(tokens[2].token).toBe(LexerToken.operator)
|
||||
expect(tokens[2].value).toBe('or')
|
||||
|
||||
expect(tokens[3].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[3].value).toBe(' ')
|
||||
|
||||
expect(tokens[4].token).toBe(LexerToken.word)
|
||||
expect(tokens[4].value).toBe('word')
|
||||
})
|
||||
|
||||
test('should not parse OR separator mid-word', () => {
|
||||
const reader = new StringReader('wordORword')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('wordORword')
|
||||
})
|
||||
|
||||
test('should parse | separator', () => {
|
||||
const reader = new StringReader('word | word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
|
||||
expect(tokens[1].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[1].value).toBe(' ')
|
||||
|
||||
expect(tokens[2].token).toBe(LexerToken.operator)
|
||||
expect(tokens[2].value).toBe('|')
|
||||
|
||||
expect(tokens[3].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[3].value).toBe(' ')
|
||||
|
||||
expect(tokens[4].token).toBe(LexerToken.word)
|
||||
expect(tokens[4].value).toBe('word')
|
||||
})
|
||||
})
|
||||
|
||||
describe('Logical operator AND', () => {
|
||||
test('should parse AND separator', () => {
|
||||
const reader = new StringReader('word AND word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
|
||||
expect(tokens[1].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[1].value).toBe(' ')
|
||||
|
||||
expect(tokens[2].token).toBe(LexerToken.operator)
|
||||
expect(tokens[2].value).toBe('and')
|
||||
|
||||
expect(tokens[3].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[3].value).toBe(' ')
|
||||
|
||||
expect(tokens[4].token).toBe(LexerToken.word)
|
||||
expect(tokens[4].value).toBe('word')
|
||||
})
|
||||
|
||||
test('should not parse AND separator mid-word', () => {
|
||||
const reader = new StringReader('wordANDword')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('wordANDword')
|
||||
})
|
||||
|
||||
test('should parse & separator', () => {
|
||||
const reader = new StringReader('word & word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
|
||||
expect(tokens[1].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[1].value).toBe(' ')
|
||||
|
||||
expect(tokens[2].token).toBe(LexerToken.operator)
|
||||
expect(tokens[2].value).toBe('&')
|
||||
|
||||
expect(tokens[3].token).toBe(LexerToken.whitespace)
|
||||
expect(tokens[3].value).toBe(' ')
|
||||
|
||||
expect(tokens[4].token).toBe(LexerToken.word)
|
||||
expect(tokens[4].value).toBe('word')
|
||||
})
|
||||
})
|
||||
|
||||
145
src/parser.ts
145
src/parser.ts
@@ -0,0 +1,145 @@
|
||||
import { InputReader } from './reader'
|
||||
import { ILexer, LexerToken, LexerTokenValue } from './tokenizer'
|
||||
|
||||
/**
 * Fields shared by every node the parser produces.
 * `type` is the discriminant of the `ParserToken` union.
 */
export interface ParserTokenValue {
  type: 'word' | 'operator' | 'phrase' | 'group'
}

/** A quoted phrase node. */
export interface Phrase extends ParserTokenValue {
  type: 'phrase'
  value: string
  /** Quote character associated with the phrase: single or double quote. */
  quote: "'" | '"'
}

/** A bare word node. */
export interface Word extends ParserTokenValue {
  type: 'word'
  value: string
}

/** A binary operator node with its two operands. */
export interface Operator extends ParserTokenValue {
  type: 'operator'
  /** Operator text taken from the lexer token (for example `or`, `|`, `and`, `&`). */
  value: string
  // NOTE(review): operands are typed `any` for now; presumably these should
  // become `ParserToken | null` once the parser settles — confirm with callers.
  left: any
  right: any
}

/** A parenthesised group node holding child nodes. */
export interface Group extends ParserTokenValue {
  type: 'group'
  // NOTE(review): presumably `ParserToken[]`; left as `any[]` to keep the
  // declared shape unchanged.
  children: any[]
}

/** Any node the parser can produce, discriminated by `type`. */
export type ParserToken = Phrase | Word | Operator | Group
|
||||
|
||||
/**
 * Base contract for parsers that read their input from an `ILexer`.
 * Concrete parsers supply the cursor and the peek/consume/parse operations.
 */
export abstract class IParser {
  /** Token source this parser reads from. */
  public lexer: ILexer
  /** Cursor maintained by the concrete parser. */
  public abstract index: number

  constructor(lexer: ILexer) {
    this.lexer = lexer
  }

  /** Returns the next node without consuming it, or `null` when none is available. */
  public abstract peek(): ParserToken | null
  /** Returns the next node and advances past it, or `null` when none is available. */
  public abstract consume(): ParserToken | null
  /** Reads nodes until the input is exhausted and returns them in order. */
  public abstract parse(): ParserToken[]
  /** Reports whether there is nothing left to read. */
  public abstract isEOF(): boolean
}
|
||||
|
||||
/**
 * States of the parser's state machine.
 * Only `default` exists so far; it is a numeric enum member with value 0.
 */
export enum ParserState {
  default,
}
|
||||
|
||||
/**
 * Converts the token stream of a lexer into `ParserToken` nodes.
 *
 * Every node returned by `peek()` or `consume()` is appended to `stack`, and
 * `index` is the cursor into that array: entries before `index` have been
 * consumed, and an entry at `stack[index]` has been peeked but not consumed.
 *
 * NOTE(review): work in progress. Grouping is not implemented (group tokens
 * yield `null`), and a quote token is turned into a phrase node that only
 * carries the quote character itself.
 */
export class Parser extends IParser {
  /** Cursor into `stack`; only `consume()` advances it. */
  index = 0
  state = ParserState.default
  /** All nodes read so far, including a peeked node not yet consumed. */
  stack: ParserToken[] = []

  /**
   * Returns the next node without consuming it, or `null` when nothing is left.
   *
   * A freshly read node is buffered in `stack`, so a following `consume()`
   * returns the same node. The lexer is deliberately not rewound, because the
   * node it produced is already held in the buffer.
   */
  public peek(): ParserToken | null {
    if (this.index < this.stack.length) {
      return this.stack[this.index]
    }
    if (this.lexer.isEOF()) {
      return null
    }

    const value = this.readNextToken()
    if (value) {
      this.stack.push(value)
    }
    return value
  }

  /**
   * Returns the next node and advances past it, or `null` when nothing is left
   * or the next lexer token does not map to a node.
   */
  public consume(): ParserToken | null {
    // Serve a previously peeked node first: read it, then move the cursor.
    if (this.index < this.stack.length) {
      const buffered = this.stack[this.index]
      this.index++
      return buffered
    }
    if (this.lexer.isEOF()) {
      return null
    }

    const token = this.readNextToken()
    if (token) {
      // Advance only when something was stored, so the cursor keeps matching
      // the contents of `stack`.
      this.stack.push(token)
      this.index++
    }
    return token
  }

  /**
   * Consumes nodes until `consume()` returns `null` and returns them in order.
   */
  public parse(): ParserToken[] {
    const tokens: ParserToken[] = []
    let token = this.consume()
    while (token !== null) {
      tokens.push(token)
      token = this.consume()
    }
    return tokens
  }

  /**
   * True when no peeked node is buffered and the lexer reports end of input.
   */
  public isEOF(): boolean {
    return this.index >= this.stack.length && this.lexer.isEOF()
  }

  /**
   * Reads one lexer token, drops the whitespace that follows it, and converts
   * the token into a node.
   *
   * @returns the node, or `null` at end of input or for a token kind that has
   *   no node mapping.
   * @throws Error when `state` is not a known `ParserState`.
   */
  private readNextToken(): ParserToken | null {
    if (this.lexer.isEOF()) {
      return null
    }

    const token = this.lexer.consume()
    const nextToken = this.skipWhitespace()

    switch (this.state) {
      case ParserState.default:
        // NOTE(review): group parsing is not implemented. When a group token
        // follows, the current token is discarded and reading restarts at the
        // group token — confirm the intended behavior.
        if (nextToken?.token === LexerToken.group) {
          return this.readNextToken()
        }
        switch (token.token) {
          case LexerToken.word:
            return { type: 'word', value: token.value }
          case LexerToken.quote:
            return { type: 'phrase', value: token.value, quote: token.value as "'" | '"' }
          case LexerToken.operator:
            return this.consumeOperator(token)
          default:
            return null
        }
      default:
        throw new Error('Bad state')
    }
  }

  /**
   * Consumes whitespace tokens from the lexer and returns the first
   * non-whitespace token without consuming it, or `null` at end of input.
   */
  private skipWhitespace(): LexerTokenValue | null {
    while (!this.lexer.isEOF()) {
      const upcoming = this.lexer.peek()
      if (upcoming.token !== LexerToken.whitespace) {
        return upcoming
      }
      this.lexer.consume()
    }
    return null
  }

  /**
   * Builds an operator node. The left operand is the most recently stored node
   * (`undefined` when `stack` is empty) and the right operand is the next node
   * read from the lexer (`null` at end of input).
   */
  private consumeOperator(token: LexerTokenValue): ParserToken | null {
    const left = this.stack[this.stack.length - 1]
    const right = this.readNextToken()
    return { type: 'operator', value: token.value, left, right }
  }
}
|
||||
|
||||
170
src/tokenizer.ts
170
src/tokenizer.ts
@@ -5,127 +5,186 @@ export enum TokenizerState {
|
||||
inPhrase,
|
||||
}
|
||||
|
||||
export enum Token {
|
||||
// phrase = 'phrase',
|
||||
export enum LexerToken {
|
||||
group = 'group',
|
||||
operator = 'operator',
|
||||
word = 'word',
|
||||
quote = 'quote',
|
||||
whitespace = 'whitespace',
|
||||
// eof = 'eof',
|
||||
}
|
||||
|
||||
export interface TokenValue {
|
||||
export interface LexerTokenValue {
|
||||
value: string
|
||||
token: Token
|
||||
token: LexerToken
|
||||
}
|
||||
|
||||
export class Tokenizer implements InputReader<TokenValue> {
|
||||
/**
 * Contract for lexers that hand out `LexerTokenValue`s one at a time.
 */
export abstract class ILexer {
  /** Returns the next token without consuming it. */
  public abstract peek(): LexerTokenValue
  /** Returns the next token and advances past it. */
  public abstract consume(): LexerTokenValue
  /** Reports whether the input is exhausted. */
  public abstract isEOF(): boolean
  /** Reads tokens until the input is exhausted and returns them in order. */
  public abstract parse(): LexerTokenValue[]
  /** Counter maintained by the concrete lexer. */
  public abstract index: number
  /** Sets `index` to `n`. */
  public abstract setIndex(n: number): void
}
|
||||
|
||||
export class Lexer implements ILexer {
|
||||
reader: InputReader<string>
|
||||
state: TokenizerState = TokenizerState.default
|
||||
quoteTerminator: string | null = null
|
||||
index: number = 0
|
||||
peekIndex: number = 0
|
||||
afterWhitespace: boolean = false
|
||||
|
||||
constructor(reader: InputReader<string>) {
|
||||
this.reader = reader
|
||||
}
|
||||
|
||||
public isEOF(): boolean {
|
||||
return this.reader.isEOF()
|
||||
// TODO implement peek by (n)?
|
||||
/**
 * Reads the next token, then restores the lexer state, the reader position
 * and the whitespace flag that were captured before reading.
 *
 * NOTE(review): the reader is restored to `beforePeekIndex - 1` rather than
 * `beforePeekIndex` — confirm against the reader's index semantics.
 */
public peek(): LexerTokenValue {
  // save state before peeking
  const beforePeekState = this.state
  const beforePeekIndex = this.reader.index
  const beforePeekWhiteSpace = this.afterWhitespace

  const value = this.readNextToken()

  // restore state after peeking
  this.state = beforePeekState
  this.reader.setIndex(beforePeekIndex - 1)
  this.afterWhitespace = beforePeekWhiteSpace

  return value
}
|
||||
|
||||
// TODO implement consume by (n)?
|
||||
/**
 * Reads the next token, increments the token counter `index`, and returns
 * the token.
 */
public consume(): LexerTokenValue {
  const token = this.readNextToken()
  this.index++
  return token
}
|
||||
|
||||
/**
 * Consumes tokens until `isEOF()` reports true and returns them in the order
 * they were read.
 */
public parse(): LexerTokenValue[] {
  const tokens: LexerTokenValue[] = []
  while (!this.isEOF()) {
    tokens.push(this.consume())
  }
  return tokens
}
|
||||
|
||||
/**
 * Sets the token counter `index` to `n`.
 * Only the counter is assigned here; the reader position is not touched.
 */
public setIndex(n: number): void {
  this.index = n
}
|
||||
|
||||
private readNextToken(): TokenValue {
|
||||
/** True when the underlying reader reports end of input. */
public isEOF(): boolean {
  return this.reader.isEOF()
}
|
||||
|
||||
private readNextToken(): LexerTokenValue {
|
||||
const nextChar = this.reader.peek()
|
||||
switch (this.state) {
|
||||
case TokenizerState.default:
|
||||
// whitespace
|
||||
if (this.isWhitespace(nextChar)) {
|
||||
this.afterWhitespace = true
|
||||
return {
|
||||
value: this.reader.consume(),
|
||||
token: Token.whitespace,
|
||||
token: LexerToken.whitespace,
|
||||
}
|
||||
}
|
||||
|
||||
// quote
|
||||
if (`"'`.includes(nextChar)) {
|
||||
this.state = TokenizerState.inPhrase
|
||||
this.quoteTerminator = nextChar
|
||||
return this.consumeQuote()
|
||||
}
|
||||
|
||||
if (this.isAlphanumeric(nextChar.charCodeAt(0))) {
|
||||
if (this.confirmExactWord('OR')) {
|
||||
// other words
|
||||
if (this.isAlphanumeric(nextChar)) {
|
||||
// guard OR
|
||||
if (this.afterWhitespace && this.peekExact('OR')) {
|
||||
return this.consumeOr()
|
||||
}
|
||||
if (this.confirmExactWord('AND')) {
|
||||
// guard AND
|
||||
if (this.afterWhitespace && this.peekExact('AND')) {
|
||||
return this.consumeAnd()
|
||||
}
|
||||
|
||||
// neither, consume normally
|
||||
return this.consumeWord()
|
||||
}
|
||||
|
||||
// or operator
|
||||
if (nextChar === '|') {
|
||||
return this.consumeOr()
|
||||
}
|
||||
|
||||
// and operator
|
||||
if (nextChar === '&') {
|
||||
return this.consumeAnd()
|
||||
}
|
||||
|
||||
// group
|
||||
if (nextChar === '(' || nextChar === ')') {
|
||||
return this.consumeGroup()
|
||||
}
|
||||
|
||||
// other, consume normally
|
||||
return this.consumeWord()
|
||||
case TokenizerState.inPhrase:
|
||||
this.afterWhitespace = false
|
||||
|
||||
// in phrase mode, consume until quote terminator
|
||||
if (nextChar === this.quoteTerminator) {
|
||||
this.state = TokenizerState.default
|
||||
return this.consumeQuote()
|
||||
}
|
||||
|
||||
// otherwise consume any character
|
||||
return this.consumePhrase()
|
||||
default:
|
||||
throw new Error('bad state')
|
||||
}
|
||||
}
|
||||
|
||||
consumeQuote(): TokenValue {
|
||||
private consumeQuote(): LexerTokenValue {
|
||||
return {
|
||||
value: this.reader.consume(),
|
||||
token: Token.quote,
|
||||
token: LexerToken.quote,
|
||||
}
|
||||
}
|
||||
|
||||
consumeAnd(): TokenValue {
|
||||
private consumeAnd(): LexerTokenValue {
|
||||
let value = ''
|
||||
if (this.confirmExactWord('AND')) {
|
||||
this.consumeExactWord('AND')
|
||||
if (this.peekExact('AND')) {
|
||||
this.consumeExact('AND')
|
||||
value = 'and'
|
||||
} else if (this.confirmExactWord('&')) {
|
||||
this.consumeExactWord('&')
|
||||
} else if (this.peekExact('&')) {
|
||||
this.consumeExact('&')
|
||||
value = '&'
|
||||
}
|
||||
return {
|
||||
value,
|
||||
token: Token.operator,
|
||||
token: LexerToken.operator,
|
||||
}
|
||||
}
|
||||
|
||||
consumeOr(): TokenValue {
|
||||
private consumeOr(): LexerTokenValue {
|
||||
let value = ''
|
||||
if (this.confirmExactWord('OR')) {
|
||||
this.consumeExactWord('OR')
|
||||
if (this.peekExact('OR')) {
|
||||
this.consumeExact('OR')
|
||||
value = 'or'
|
||||
} else if (this.confirmExactWord('|')) {
|
||||
this.consumeExactWord('|')
|
||||
} else if (this.peekExact('|')) {
|
||||
this.consumeExact('|')
|
||||
value = '|'
|
||||
}
|
||||
return {
|
||||
value,
|
||||
token: Token.operator,
|
||||
token: LexerToken.operator,
|
||||
}
|
||||
}
|
||||
|
||||
confirmExactWord(word: string) {
|
||||
private peekExact(word: string) {
|
||||
let nextChar = this.reader.peek()
|
||||
for (let i = 0; i < word.length; i++) {
|
||||
if (nextChar !== word[i]) {
|
||||
@@ -136,28 +195,28 @@ export class Tokenizer implements InputReader<TokenValue> {
|
||||
return true
|
||||
}
|
||||
|
||||
consumeExactWord(word: string) {
|
||||
if (this.confirmExactWord(word)) {
|
||||
this.consumeReader(word.length)
|
||||
private consumeExact(word: string) {
|
||||
if (this.peekExact(word)) {
|
||||
this.consumeLength(word.length)
|
||||
} else {
|
||||
throw new Error("Can't find exact word: " + word)
|
||||
}
|
||||
}
|
||||
|
||||
consumeReader(times = 1) {
|
||||
private consumeLength(times = 1) {
|
||||
for (let i = 0; i < times; i++) {
|
||||
this.reader.consume()
|
||||
}
|
||||
}
|
||||
|
||||
private consumeGroup(): TokenValue {
|
||||
private consumeGroup(): LexerTokenValue {
|
||||
return {
|
||||
value: this.reader.consume(),
|
||||
token: Token.group,
|
||||
token: LexerToken.group,
|
||||
}
|
||||
}
|
||||
|
||||
private consumePhrase(): TokenValue {
|
||||
private consumePhrase(): LexerTokenValue {
|
||||
let nextChar = this.reader.consume()
|
||||
let value = nextChar
|
||||
while ((nextChar = this.reader.peek()) && nextChar !== this.quoteTerminator) {
|
||||
@@ -165,58 +224,33 @@ export class Tokenizer implements InputReader<TokenValue> {
|
||||
}
|
||||
return {
|
||||
value,
|
||||
token: Token.word,
|
||||
token: LexerToken.word,
|
||||
}
|
||||
}
|
||||
|
||||
private consumeWord(): TokenValue {
|
||||
private consumeWord(): LexerTokenValue {
|
||||
let value = this.consumeWholeWord()
|
||||
return {
|
||||
value,
|
||||
token: Token.word,
|
||||
token: LexerToken.word,
|
||||
}
|
||||
}
|
||||
|
||||
private consumeWholeWord() {
|
||||
// let nextChar = this.reader.peek()
|
||||
let value = ''
|
||||
while (this.isAlphanumeric(this.reader.peek().charCodeAt(0))) {
|
||||
while (this.isAlphanumeric(this.reader.peek())) {
|
||||
value += this.reader.consume()
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
public peek(): TokenValue {
|
||||
const beforePeekState = this.state
|
||||
const beforePeekIndex = this.reader.index
|
||||
// this.peekIndex = this.currentIndex + n
|
||||
const value = this.readNextToken()
|
||||
this.state = beforePeekState
|
||||
this.reader.setIndex(beforePeekIndex)
|
||||
return value
|
||||
// return this.readNextToken()
|
||||
private isWhitespace(char: string) {
|
||||
return ' \t\n\r'.includes(char)
|
||||
}
|
||||
|
||||
public consume(): TokenValue {
|
||||
const token = this.readNextToken()
|
||||
// this.reader.consume()
|
||||
this.index++
|
||||
return token
|
||||
}
|
||||
|
||||
public read(): TokenValue[] {
|
||||
const tokens: TokenValue[] = []
|
||||
while (!this.isEOF()) {
|
||||
tokens.push(this.consume())
|
||||
}
|
||||
return tokens
|
||||
}
|
||||
|
||||
private isWhitespace(nextChar: string) {
|
||||
return ' \t\n\r'.includes(nextChar)
|
||||
}
|
||||
|
||||
private isAlphanumeric(charCode: number): boolean {
|
||||
private isAlphanumeric(char: string): boolean {
|
||||
const charCode = char.charCodeAt(0)
|
||||
return (
|
||||
(charCode >= 48 && charCode <= 57) ||
|
||||
(charCode >= 65 && charCode <= 90) ||
|
||||
|
||||
Reference in New Issue
Block a user