diff --git a/__tests__/tokenizer_test.ts b/__tests__/tokenizer_test.ts index 125a7c7..04c9bfd 100644 --- a/__tests__/tokenizer_test.ts +++ b/__tests__/tokenizer_test.ts @@ -19,87 +19,130 @@ describe('Tokenizer', () => { }) describe('Groups', () => { - it('Should tokenize single-word group', () => { + test('Should tokenize single-word group', () => { const reader = new StringReader('(word)') const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() - let token = tokenizer.consume() - expect(token.token).toBe(Token.group) - expect(token.value).toBe('(') + expect(tokens[0].token).toBe(Token.group) + expect(tokens[0].value).toBe('(') - token = tokenizer.consume() - expect(token.token).toBe(Token.word) - expect(token.value).toBe('word') + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('word') - token = tokenizer.consume() - expect(token.token).toBe(Token.group) - expect(token.value).toBe(')') + expect(tokens[2].token).toBe(Token.group) + expect(tokens[2].value).toBe(')') }) - it('Should tokenize logical operator OR group', () => { - const reader = new StringReader('(word OR word)') - const tokenizer = new Tokenizer(reader) + describe('logical operator OR group', () => { + test('should parse OR separator', () => { + const reader = new StringReader('(word OR word)') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() - let token = tokenizer.consume() - expect(token.token).toBe(Token.group) - expect(token.value).toBe('(') + expect(tokens[0].token).toBe(Token.group) + expect(tokens[0].value).toBe('(') - token = tokenizer.consume() - expect(token.token).toBe(Token.word) - expect(token.value).toBe('word') + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('word') - token = tokenizer.consume() - expect(token.token).toBe(Token.whitespace) - expect(token.value).toBe(' ') + expect(tokens[2].token).toBe(Token.whitespace) + expect(tokens[2].value).toBe(' ') - token = tokenizer.consume() - expect(token.token).toBe(Token.operator) - expect(token.value).toBe('or') + expect(tokens[3].token).toBe(Token.operator) + expect(tokens[3].value).toBe('or') - token = tokenizer.consume() - expect(token.token).toBe(Token.whitespace) - expect(token.value).toBe(' ') + expect(tokens[4].token).toBe(Token.whitespace) + expect(tokens[4].value).toBe(' ') - token = tokenizer.consume() - expect(token.token).toBe(Token.word) - expect(token.value).toBe('word') + expect(tokens[5].token).toBe(Token.word) + expect(tokens[5].value).toBe('word') - token = tokenizer.consume() - expect(token.token).toBe(Token.group) - expect(token.value).toBe(')') + expect(tokens[6].token).toBe(Token.group) + expect(tokens[6].value).toBe(')') + }) + + test('should parse | separator', () => { + const reader = new StringReader('(word | word)') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() + + expect(tokens[0].token).toBe(Token.group) + expect(tokens[0].value).toBe('(') + + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('word') + + expect(tokens[2].token).toBe(Token.whitespace) + expect(tokens[2].value).toBe(' ') + + expect(tokens[3].token).toBe(Token.operator) + expect(tokens[3].value).toBe('|') + + expect(tokens[4].token).toBe(Token.whitespace) + expect(tokens[4].value).toBe(' ') + + expect(tokens[5].token).toBe(Token.word) + expect(tokens[5].value).toBe('word') + + expect(tokens[6].token).toBe(Token.group) + expect(tokens[6].value).toBe(')') + }) }) - it('Should tokenize logical operator AND group', () => { - const reader = new StringReader('(word AND word)') - const tokenizer = new Tokenizer(reader) + describe('logical operator AND group', () => { + test('should parse AND separator', () => { + const reader = new StringReader('(word AND word)') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() - let token = tokenizer.consume() - expect(token.token).toBe(Token.group) - expect(token.value).toBe('(') + expect(tokens[0].token).toBe(Token.group) + expect(tokens[0].value).toBe('(') - token = tokenizer.consume() - expect(token.token).toBe(Token.word) - expect(token.value).toBe('word') + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('word') - token = tokenizer.consume() - expect(token.token).toBe(Token.whitespace) - expect(token.value).toBe(' ') + expect(tokens[2].token).toBe(Token.whitespace) + expect(tokens[2].value).toBe(' ') - token = tokenizer.consume() - expect(token.token).toBe(Token.operator) - expect(token.value).toBe('and') + expect(tokens[3].token).toBe(Token.operator) + expect(tokens[3].value).toBe('and') - token = tokenizer.consume() - expect(token.token).toBe(Token.whitespace) - expect(token.value).toBe(' ') + expect(tokens[4].token).toBe(Token.whitespace) + expect(tokens[4].value).toBe(' ') - token = tokenizer.consume() - expect(token.token).toBe(Token.word) - expect(token.value).toBe('word') + expect(tokens[5].token).toBe(Token.word) + expect(tokens[5].value).toBe('word') - token = tokenizer.consume() - expect(token.token).toBe(Token.group) - expect(token.value).toBe(')') + expect(tokens[6].token).toBe(Token.group) + expect(tokens[6].value).toBe(')') + }) + test('should parse & separator', () => { + const reader = new StringReader('(word & word)') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() + + expect(tokens[0].token).toBe(Token.group) + expect(tokens[0].value).toBe('(') + + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('word') + + expect(tokens[2].token).toBe(Token.whitespace) + expect(tokens[2].value).toBe(' ') + + expect(tokens[3].token).toBe(Token.operator) + expect(tokens[3].value).toBe('&') + + expect(tokens[4].token).toBe(Token.whitespace) + expect(tokens[4].value).toBe(' ') + + expect(tokens[5].token).toBe(Token.word) + expect(tokens[5].value).toBe('word') + + expect(tokens[6].token).toBe(Token.group) + expect(tokens[6].value).toBe(')') + }) }) }) }) diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 042517b..e85f3ec 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -56,19 +56,23 @@ export class Tokenizer implements InputReader { } if (this.isAlphanumeric(nextChar.charCodeAt(0))) { - if (nextChar.toUpperCase() === 'O' && this.reader.peek(1) === 'R') { + if (this.confirmExactWord('OR')) { return this.consumeOr() } - if ( - nextChar.toUpperCase() === 'A' && - this.reader.peek(1) === 'N' && - this.reader.peek(2) === 'D' - ) { + if (this.confirmExactWord('AND')) { return this.consumeAnd() } return this.consumeWord() } + if (nextChar === '|') { + return this.consumeOr() + } + + if (nextChar === '&') { + return this.consumeAnd() + } + if (nextChar === '(' || nextChar === ')') { // this.state = TokenizerState.inGroup return this.consumeGroup() @@ -84,24 +88,62 @@ export class Tokenizer implements InputReader { throw new Error('bad state') } } + consumeAnd(): TokenValue { - this.reader.consume() - this.reader.consume() - this.reader.consume() + let value = '' + if (this.confirmExactWord('AND')) { + this.consumeExactWord('AND') + value = 'and' + } else if (this.confirmExactWord('&')) { + this.consumeExactWord('&') + value = '&' + } return { - value: 'and', + value, token: Token.operator, } } + consumeOr(): TokenValue { - this.reader.consume() - this.reader.consume() + let value = '' + if (this.confirmExactWord('OR')) { + this.consumeExactWord('OR') + value = 'or' + } else if (this.confirmExactWord('|')) { + this.consumeExactWord('|') + value = '|' + } return { - value: 'or', + value, token: Token.operator, } } + confirmExactWord(word: string) { + let nextChar = this.reader.peek() + for (let i = 0; i < word.length; i++) { + if (nextChar !== word[i]) { + return false + } + nextChar = this.reader.peek(i + 1) + } + return true + } + + consumeExactWord(word: string) { + if (this.confirmExactWord(word)) { + this.consumeReader(word.length) + } else { + throw new Error("Can't find exact word: " + word) + } + } + + consumeReader(times = 1) { + for (let i = 0; i < times; i++) { + this.reader.consume() + } + } + private consumeGroup(): TokenValue { return { value: this.reader.consume(), @@ -166,10 +208,6 @@ export class Tokenizer implements InputReader { return tokens } - // public read() { - // throw new Error('Method not implemented.') - // } - private isWhitespace(nextChar: string) { return ' \t\n\r'.includes(nextChar) }