diff --git a/__tests__/tokenizer_test.ts b/__tests__/tokenizer_test.ts index 04c9bfd..e6a2503 100644 --- a/__tests__/tokenizer_test.ts +++ b/__tests__/tokenizer_test.ts @@ -2,20 +2,74 @@ import { StringReader } from '../src/reader' import { Token, Tokenizer } from '../src/tokenizer' describe('Tokenizer', () => { - test('Should tokenize phrase', () => { - const reader = new StringReader('"phrase"') - const tokenizer = new Tokenizer(reader) - const token = tokenizer.peek() - expect(token.token).toBe(Token.phrase) - expect(token.value).toBe('"phrase"') + describe('Phrase', () => { + test('Should tokenize single', () => { + const reader = new StringReader('"phrase"') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() + + expect(tokens[0].token).toBe(Token.quote) + expect(tokens[0].value).toBe('"') + + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('phrase') + + expect(tokens[2].token).toBe(Token.quote) + expect(tokens[2].value).toBe('"') + }) + + test('Should tokenize multi', () => { + const reader = new StringReader('"one two three 123 !@#"') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() + + expect(tokens[0].token).toBe(Token.quote) + expect(tokens[0].value).toBe('"') + + expect(tokens[1].token).toBe(Token.word) + expect(tokens[1].value).toBe('one two three 123 !@#') + + expect(tokens[2].token).toBe(Token.quote) + expect(tokens[2].value).toBe('"') + }) }) - test('Should tokenize word', () => { - const reader = new StringReader('word') - const tokenizer = new Tokenizer(reader) - const token = tokenizer.peek() - expect(token.token).toBe(Token.word) - expect(token.value).toBe('word') + describe('Word', () => { + test('Should tokenize single', () => { + const reader = new StringReader('word') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() + + expect(tokens[0].token).toBe(Token.word) + expect(tokens[0].value).toBe('word') + }) + + test('Should tokenize multi', () => { + const reader = new StringReader('one two three 123') + const tokenizer = new Tokenizer(reader) + const tokens = tokenizer.read() + + expect(tokens[0].token).toBe(Token.word) + expect(tokens[0].value).toBe('one') + + expect(tokens[1].token).toBe(Token.whitespace) + expect(tokens[1].value).toBe(' ') + + expect(tokens[2].token).toBe(Token.word) + expect(tokens[2].value).toBe('two') + + expect(tokens[3].token).toBe(Token.whitespace) + expect(tokens[3].value).toBe(' ') + + expect(tokens[4].token).toBe(Token.word) + expect(tokens[4].value).toBe('three') + + expect(tokens[5].token).toBe(Token.whitespace) + expect(tokens[5].value).toBe(' ') + + expect(tokens[6].token).toBe(Token.word) + expect(tokens[6].value).toBe('123') + }) }) describe('Groups', () => { diff --git a/src/tokenizer.ts b/src/tokenizer.ts index e85f3ec..c68ce2d 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -3,16 +3,16 @@ import { InputReader } from './reader' export enum TokenizerState { default, inPhrase, - // inGroup, } export enum Token { - phrase = 'phrase', + // phrase = 'phrase', group = 'group', operator = 'operator', word = 'word', + quote = 'quote', whitespace = 'whitespace', - eof = 'eof', + // eof = 'eof', } export interface TokenValue { @@ -52,7 +52,7 @@ export class Tokenizer implements InputReader { if (`"'`.includes(nextChar)) { this.state = TokenizerState.inPhrase this.quoteTerminator = nextChar - return this.consumePhrase() + return this.consumeQuote() } if (this.isAlphanumeric(nextChar.charCodeAt(0))) { @@ -74,21 +74,27 @@ export class Tokenizer implements InputReader { } if (nextChar === '(' || nextChar === ')') { - // this.state = TokenizerState.inGroup return this.consumeGroup() } return this.consumeWord() case TokenizerState.inPhrase: if (nextChar === this.quoteTerminator) { this.state = TokenizerState.default - return this.consumePhrase() + return this.consumeQuote() } - return this.consumeWord() + return this.consumePhrase() default: throw new Error('bad state') } } + consumeQuote(): TokenValue { + return { + value: this.reader.consume(), + token: Token.quote, + } + } + consumeAnd(): TokenValue { let value = '' if (this.confirmExactWord('AND')) { @@ -157,11 +163,9 @@ export class Tokenizer implements InputReader { while ((nextChar = this.reader.peek()) && nextChar !== this.quoteTerminator) { value += this.reader.consume() } - value += nextChar - this.reader.consume() return { value, - token: Token.phrase, + token: Token.word, } }