mirror of
https://github.com/chenasraf/search-ast-parser-js.git
synced 2026-05-17 17:48:09 +00:00
feat(parser): initial poc done
This commit is contained in:
@@ -1,11 +1,11 @@
|
||||
import { StringReader } from '../src/reader'
|
||||
import { LexerToken, Lexer } from '../src/tokenizer'
|
||||
import { LexerToken, Lexer } from '../src/lexer'
|
||||
|
||||
describe('Phrase', () => {
|
||||
test('Should tokenize single', () => {
|
||||
const reader = new StringReader('"phrase"')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.quote)
|
||||
expect(tokens[0].value).toBe('"')
|
||||
@@ -19,8 +19,8 @@ describe('Phrase', () => {
|
||||
|
||||
test('Should tokenize multi', () => {
|
||||
const reader = new StringReader('"one two three 123 !@#"')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.quote)
|
||||
expect(tokens[0].value).toBe('"')
|
||||
@@ -36,8 +36,8 @@ describe('Phrase', () => {
|
||||
describe('Word', () => {
|
||||
test('Should tokenize single', () => {
|
||||
const reader = new StringReader('word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
@@ -45,8 +45,8 @@ describe('Word', () => {
|
||||
|
||||
test('Should tokenize multi', () => {
|
||||
const reader = new StringReader('one two three 123')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('one')
|
||||
@@ -74,8 +74,8 @@ describe('Word', () => {
|
||||
describe('Groups', () => {
|
||||
test('Should tokenize single-word group', () => {
|
||||
const reader = new StringReader('(word)')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.group)
|
||||
expect(tokens[0].value).toBe('(')
|
||||
@@ -91,8 +91,8 @@ describe('Groups', () => {
|
||||
describe('Logical operator OR', () => {
|
||||
test('should parse OR separator', () => {
|
||||
const reader = new StringReader('word OR word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
@@ -112,8 +112,8 @@ describe('Logical operator OR', () => {
|
||||
|
||||
test('should not parse OR separator mid-word', () => {
|
||||
const reader = new StringReader('wordORword')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('wordORword')
|
||||
@@ -121,8 +121,8 @@ describe('Logical operator OR', () => {
|
||||
|
||||
test('should parse | separator', () => {
|
||||
const reader = new StringReader('word | word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
@@ -144,8 +144,8 @@ describe('Logical operator OR', () => {
|
||||
describe('Logical operator AND', () => {
|
||||
test('should parse AND separator', () => {
|
||||
const reader = new StringReader('word AND word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
@@ -165,8 +165,8 @@ describe('Logical operator AND', () => {
|
||||
|
||||
test('should not parse AND separator mid-word', () => {
|
||||
const reader = new StringReader('wordANDword')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('wordANDword')
|
||||
@@ -174,8 +174,8 @@ describe('Logical operator AND', () => {
|
||||
|
||||
test('should parse & separator', () => {
|
||||
const reader = new StringReader('word & word')
|
||||
const tokenizer = new Lexer(reader)
|
||||
const tokens = tokenizer.parse()
|
||||
const lexer = new Lexer(reader)
|
||||
const tokens = lexer.parse()
|
||||
|
||||
expect(tokens[0].token).toBe(LexerToken.word)
|
||||
expect(tokens[0].value).toBe('word')
|
||||
@@ -1,6 +1,6 @@
|
||||
import { Operator, Parser, Word } from '../src/parser'
|
||||
import { StringReader } from '../src/reader'
|
||||
import { Lexer } from '../src/tokenizer'
|
||||
import { Lexer } from '../src/lexer'
|
||||
|
||||
test('should parse single word', () => {
|
||||
const reader = new StringReader('word')
|
||||
@@ -19,5 +19,5 @@ test('should parse OR operator', () => {
|
||||
const wordToken = tokens[0] as Operator
|
||||
expect(wordToken.type).toBe('operator')
|
||||
expect(wordToken.left.value).toBe('word')
|
||||
expect(wordToken.right.value).toBe('"phrase"')
|
||||
expect(wordToken.right.value).toBe('phrase')
|
||||
})
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { InputReader } from './reader'
|
||||
|
||||
export enum TokenizerState {
|
||||
export enum lexerState {
|
||||
default,
|
||||
inPhrase,
|
||||
}
|
||||
@@ -19,8 +19,8 @@ export interface LexerTokenValue {
|
||||
}
|
||||
|
||||
export abstract class ILexer {
|
||||
public abstract peek(): LexerTokenValue
|
||||
public abstract consume(): LexerTokenValue
|
||||
public abstract peek(amount?: number): LexerTokenValue | null
|
||||
public abstract consume(amount?: number): LexerTokenValue | null
|
||||
public abstract isEOF(): boolean
|
||||
public abstract parse(): LexerTokenValue[]
|
||||
public abstract index: number
|
||||
@@ -29,44 +29,79 @@ export abstract class ILexer {
|
||||
|
||||
export class Lexer implements ILexer {
|
||||
reader: InputReader<string>
|
||||
state: TokenizerState = TokenizerState.default
|
||||
state: lexerState = lexerState.default
|
||||
quoteTerminator: string | null = null
|
||||
index: number = 0
|
||||
peekIndex: number = 0
|
||||
afterWhitespace: boolean = false
|
||||
cache: LexerTokenValue[] = []
|
||||
|
||||
constructor(reader: InputReader<string>) {
|
||||
this.reader = reader
|
||||
}
|
||||
|
||||
// TODO implement peek by (n)?
|
||||
public peek(): LexerTokenValue {
|
||||
// save state before peeking
|
||||
const beforePeekState = this.state
|
||||
const beforePeekIndex = this.reader.index
|
||||
const beforePeekWhiteSpace = this.afterWhitespace
|
||||
public peek(amount = 0): LexerTokenValue | null {
|
||||
const cacheIndex = this.index + amount
|
||||
if (this.isEOF()) {
|
||||
return null
|
||||
}
|
||||
|
||||
const value = this.readNextToken()
|
||||
if (this.cache[cacheIndex]) {
|
||||
return this.cache[cacheIndex]
|
||||
}
|
||||
|
||||
// save state before peeking
|
||||
// const beforePeekState = this.state
|
||||
// const beforePeekIndex = this.reader.index
|
||||
// const beforePeekWhiteSpace = this.afterWhitespace
|
||||
|
||||
this.fillCache(cacheIndex)
|
||||
const token = this.cache[cacheIndex]
|
||||
|
||||
// restore state after peeking
|
||||
this.state = beforePeekState
|
||||
this.reader.setIndex(beforePeekIndex - 1)
|
||||
this.afterWhitespace = beforePeekWhiteSpace
|
||||
// this.state = beforePeekState
|
||||
// this.reader.setIndex(beforePeekIndex)
|
||||
// this.afterWhitespace = beforePeekWhiteSpace
|
||||
|
||||
return value
|
||||
return token
|
||||
}
|
||||
|
||||
// TODO implement consume by (n)?
|
||||
public consume(): LexerTokenValue {
|
||||
const token = this.readNextToken()
|
||||
this.index++
|
||||
public consume(amount = 0): LexerTokenValue | null {
|
||||
const cacheIndex = this.index + amount
|
||||
this.index = cacheIndex + 1
|
||||
|
||||
if (this.cache[cacheIndex]) {
|
||||
return this.cache[cacheIndex]
|
||||
}
|
||||
if (this.isEOF()) {
|
||||
return null
|
||||
}
|
||||
|
||||
this.fillCache(cacheIndex)
|
||||
const token = this.cache[cacheIndex]
|
||||
return token
|
||||
}
|
||||
|
||||
/**
 * Populates `this.cache` for slots `0..n` (inclusive), reading a token via
 * `readNextToken()` for every slot that does not hold one yet.
 *
 * While a slot is being processed `this.index` is temporarily set to that
 * slot number. The caller's index is always put back before this method
 * returns — including when the loop stops early because `isEOF()` reports
 * the end of input (previously that early exit left `this.index` pointing
 * at the slot where EOF was detected, so a look-ahead past the end of the
 * input silently moved the cursor).
 *
 * @param n highest cache slot (0-based) that should be filled if input allows
 */
private fillCache(n: number) {
  const savedIndex = this.index
  try {
    for (let i = 0; i <= n; i++) {
      this.index = i
      if (this.isEOF()) {
        // nothing left to read; slots from `i` upwards stay as they are
        return
      }
      if (this.cache[i]) {
        // already read on an earlier call
        continue
      }
      const value = this.readNextToken()
      this.cache[i] = value!
    }
  } finally {
    // restore on every exit path, not only after a complete loop
    this.index = savedIndex
  }
}
|
||||
|
||||
public parse(): LexerTokenValue[] {
|
||||
const tokens: LexerTokenValue[] = []
|
||||
while (!this.isEOF()) {
|
||||
tokens.push(this.consume())
|
||||
tokens.push(this.consume()!)
|
||||
}
|
||||
return tokens
|
||||
}
|
||||
@@ -79,10 +114,10 @@ export class Lexer implements ILexer {
|
||||
return this.reader.isEOF()
|
||||
}
|
||||
|
||||
private readNextToken(): LexerTokenValue {
|
||||
private readNextToken(): LexerTokenValue | null {
|
||||
const nextChar = this.reader.peek()
|
||||
switch (this.state) {
|
||||
case TokenizerState.default:
|
||||
case lexerState.default:
|
||||
// whitespace
|
||||
if (this.isWhitespace(nextChar)) {
|
||||
this.afterWhitespace = true
|
||||
@@ -94,7 +129,7 @@ export class Lexer implements ILexer {
|
||||
|
||||
// quote
|
||||
if (`"'`.includes(nextChar)) {
|
||||
this.state = TokenizerState.inPhrase
|
||||
this.state = lexerState.inPhrase
|
||||
this.quoteTerminator = nextChar
|
||||
return this.consumeQuote()
|
||||
}
|
||||
@@ -131,12 +166,12 @@ export class Lexer implements ILexer {
|
||||
|
||||
// other, consume normally
|
||||
return this.consumeWord()
|
||||
case TokenizerState.inPhrase:
|
||||
case lexerState.inPhrase:
|
||||
this.afterWhitespace = false
|
||||
|
||||
// in phrase mode, consume until quote terminator
|
||||
if (nextChar === this.quoteTerminator) {
|
||||
this.state = TokenizerState.default
|
||||
this.state = lexerState.default
|
||||
return this.consumeQuote()
|
||||
}
|
||||
|
||||
109
src/parser.ts
109
src/parser.ts
@@ -1,5 +1,5 @@
|
||||
import { InputReader } from './reader'
|
||||
import { ILexer, LexerToken, LexerTokenValue } from './tokenizer'
|
||||
import { ILexer, LexerToken, LexerTokenValue } from './lexer'
|
||||
|
||||
export interface ParserTokenValue {
|
||||
type: 'word' | 'operator' | 'phrase' | 'group'
|
||||
@@ -38,8 +38,8 @@ export abstract class IParser {
|
||||
this.lexer = lexer
|
||||
}
|
||||
|
||||
public abstract peek(): ParserToken | null
|
||||
public abstract consume(): ParserToken | null
|
||||
public abstract peek(amount?: number): ParserToken | null
|
||||
public abstract consume(amount?: number): ParserToken | null
|
||||
public abstract parse(): ParserToken[]
|
||||
public abstract isEOF(): boolean
|
||||
}
|
||||
@@ -51,47 +51,60 @@ export enum ParserState {
|
||||
export class Parser extends IParser {
|
||||
index = 0
|
||||
state = ParserState.default
|
||||
stack: ParserToken[] = []
|
||||
cache: ParserToken[] = []
|
||||
|
||||
constructor(lexer: ILexer) {
|
||||
super(lexer)
|
||||
this.state = ParserState.default
|
||||
}
|
||||
|
||||
public peek(): ParserToken | null {
|
||||
public peek(amount = 0): ParserToken | null {
|
||||
const cacheIndex = this.index + amount
|
||||
if (this.isEOF()) {
|
||||
return null
|
||||
}
|
||||
if (this.index < this.stack.length) {
|
||||
return this.stack[this.index]
|
||||
if (cacheIndex < this.cache.length) {
|
||||
return this.cache[cacheIndex]
|
||||
}
|
||||
|
||||
const beforePeekIndex = this.lexer.index
|
||||
const value = this.readNextToken()
|
||||
if (value) {
|
||||
this.stack.push(value)
|
||||
}
|
||||
this.lexer.setIndex(beforePeekIndex)
|
||||
return value
|
||||
// const beforePeekIndex = this.lexer.index
|
||||
this.fillCache(cacheIndex)
|
||||
const token = this.cache[cacheIndex]
|
||||
// this.lexer.setIndex(beforePeekIndex)
|
||||
return token
|
||||
}
|
||||
|
||||
public consume(): ParserToken | null {
|
||||
public consume(amount = 0): ParserToken | null {
|
||||
const cacheIndex = this.index + amount
|
||||
this.index = cacheIndex + 1
|
||||
|
||||
if (this.cache[cacheIndex]) {
|
||||
return this.cache[cacheIndex]
|
||||
}
|
||||
if (this.isEOF()) {
|
||||
return null
|
||||
}
|
||||
if (this.index < this.stack.length) {
|
||||
this.index++
|
||||
return this.stack[this.index]
|
||||
}
|
||||
|
||||
const token = this.readNextToken()
|
||||
this.index++
|
||||
if (token) {
|
||||
this.stack.push(token)
|
||||
}
|
||||
this.fillCache(cacheIndex)
|
||||
const token = this.cache[cacheIndex]
|
||||
return token
|
||||
}
|
||||
|
||||
/**
 * Populates `this.cache` for slots `0..n` (inclusive), calling
 * `readNextToken()` for every slot that does not hold a token yet.
 *
 * `this.index` is set to the slot number before each slot is processed, and
 * the value it had on entry is restored before returning. The restore now
 * happens on every exit path: previously the early `return` taken when
 * `isEOF()` is true skipped it and left `this.index` at the slot where the
 * end of input was detected.
 *
 * @param n highest cache slot (0-based) that should be filled if input allows
 */
private fillCache(n: number) {
  const savedIndex = this.index
  try {
    for (let i = 0; i <= n; i++) {
      this.index = i
      if (this.isEOF()) {
        // nothing left to parse; slots from `i` upwards stay as they are
        return
      }
      if (this.cache[i]) {
        // already parsed on an earlier call
        continue
      }
      const value = this.readNextToken()
      this.cache[i] = value!
    }
  } finally {
    // restore on every exit path, not only after a complete loop
    this.index = savedIndex
  }
}
|
||||
|
||||
public parse(): ParserToken[] {
|
||||
const tokens: ParserToken[] = []
|
||||
while (!this.isEOF()) {
|
||||
@@ -109,26 +122,31 @@ export class Parser extends IParser {
|
||||
}
|
||||
|
||||
private readNextToken(): ParserToken | null {
|
||||
const token = this.lexer.consume()
|
||||
let nextToken = this.lexer.peek()
|
||||
// TODO reset lexer index?
|
||||
while (nextToken?.token === 'whitespace') {
|
||||
this.lexer.consume()
|
||||
nextToken = this.lexer.peek()
|
||||
}
|
||||
let token = this.lexer.peek()
|
||||
let nextToken = this.lexer.peek(1)
|
||||
|
||||
switch (this.state) {
|
||||
case ParserState.default:
|
||||
if (nextToken.token === 'group') {
|
||||
if (token?.token === 'whitespace') {
|
||||
this.index++
|
||||
this.lexer.consume()
|
||||
return this.readNextToken()
|
||||
}
|
||||
switch (token.token) {
|
||||
while (nextToken && nextToken.token === 'whitespace') {
|
||||
nextToken = this.lexer.peek(1)
|
||||
this.lexer.consume()
|
||||
}
|
||||
if (nextToken?.token === 'group' || nextToken?.token === 'operator') {
|
||||
this.index++
|
||||
return this.consumeOperator(token!, nextToken)
|
||||
}
|
||||
switch (token?.token) {
|
||||
case LexerToken.word:
|
||||
return { type: 'word', value: token.value }
|
||||
return { type: 'word', value: this.lexer.consume()!.value }
|
||||
case LexerToken.quote:
|
||||
return { type: 'phrase', value: token.value, quote: token.value as '"' }
|
||||
return this.consumePhrase(token)
|
||||
case LexerToken.operator:
|
||||
return this.consumeOperator(token)
|
||||
return this.consumeOperator(token, nextToken!)
|
||||
default:
|
||||
return null
|
||||
}
|
||||
@@ -137,9 +155,20 @@ export class Parser extends IParser {
|
||||
}
|
||||
}
|
||||
|
||||
private consumeOperator(token: LexerTokenValue): ParserToken | null {
|
||||
const left = this.stack[this.stack.length - 1]
|
||||
/**
 * Builds a `phrase` parser token from the lexer's quote / content / quote
 * token run.
 *
 * Well-formed input (`"text"`) behaves as before: three lexer tokens are
 * consumed and the middle one supplies the value. Two malformed shapes are
 * now handled explicitly instead of misreading tokens or dereferencing null:
 *  - empty phrase (`""`): the token right after the opening quote is itself
 *    a quote token, so it is treated as the terminator and the value is `''`
 *    (previously it became the value and one extra token was swallowed);
 *  - input ends right after the opening quote: returns `null`.
 *
 * @param token the opening-quote lexer token (already peeked by the caller);
 *              its `value` is reported as the phrase's `quote`
 * @returns the phrase token, or `null` when nothing follows the opening quote
 */
private consumePhrase(token: LexerTokenValue): ParserToken | null {
  // opening quote
  this.lexer.consume()

  const quoteContent = this.lexer.consume()
  if (quoteContent == null) {
    // unterminated phrase with no content at all
    return null
  }
  if (quoteContent.token === LexerToken.quote) {
    // empty phrase: what we just consumed was already the closing quote
    return { type: 'phrase', value: '', quote: token.value as '"' }
  }

  // closing quote
  this.lexer.consume()
  return { type: 'phrase', value: quoteContent.value, quote: token.value as '"' }
}
|
||||
|
||||
private consumeOperator(left: LexerTokenValue, opToken: LexerTokenValue): ParserToken | null {
|
||||
// const left = this.cache[this.cache.length - 1]
|
||||
this.index++
|
||||
this.lexer.consume()
|
||||
const right = this.readNextToken()
|
||||
return { type: 'operator', value: token.value, left, right }
|
||||
this.lexer.consume()
|
||||
// const right = this.readNextToken()
|
||||
return { type: 'operator', value: opToken.value, left, right }
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user