diff --git a/src/evented-tokenizer.ts b/src/evented-tokenizer.ts
index 476a831..d387df7 100644
--- a/src/evented-tokenizer.ts
+++ b/src/evented-tokenizer.ts
@@ -185,6 +185,194 @@ export default class EventedTokenizer {
         this.consume();
         this.transitionTo(TokenizerState.commentStart);
         this.delegate.beginComment();
+      } else {
+        let maybeDoctype = char.toUpperCase() + this.input.substring(this.index, this.index + 6).toUpperCase();
+
+        if (maybeDoctype === 'DOCTYPE') {
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.transitionTo(TokenizerState.doctype);
+          if (this.delegate.beginDoctype) this.delegate.beginDoctype();
+        }
+      }
+    },
+
+    doctype() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.beforeDoctypeName);
+      }
+    },
+
+    beforeDoctypeName() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else {
+        this.transitionTo(TokenizerState.doctypeName);
+        if (this.delegate.appendToDoctypeName) this.delegate.appendToDoctypeName(char.toLowerCase());
+      }
+    },
+
+    doctypeName() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.afterDoctypeName);
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        if (this.delegate.appendToDoctypeName) this.delegate.appendToDoctypeName(char.toLowerCase());
+      }
+    },
+
+    afterDoctypeName() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        let nextSixChars = char.toUpperCase() + this.input.substring(this.index, this.index + 5).toUpperCase();
+
+        let isPublic = nextSixChars.toUpperCase() === 'PUBLIC';
+        let isSystem = nextSixChars.toUpperCase() === 'SYSTEM';
+
+        if (isPublic || isSystem) {
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+          this.consume();
+        }
+
+        if (isPublic) {
+          this.transitionTo(TokenizerState.afterDoctypePublicKeyword);
+        } else if (isSystem) {
+          this.transitionTo(TokenizerState.afterDoctypeSystemKeyword);
+        }
+      }
+    },
+
+    afterDoctypePublicKeyword() {
+      let char = this.peek();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.beforeDoctypePublicIdentifier);
+        this.consume();
+      } else if (char === '"') {
+        this.transitionTo(TokenizerState.doctypePublicIdentifierDoubleQuoted);
+        this.consume();
+      } else if (char === "'") {
+        this.transitionTo(TokenizerState.doctypePublicIdentifierSingleQuoted);
+        this.consume();
+      } else if (char === '>') {
+        this.consume();
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      }
+    },
+
+    doctypePublicIdentifierDoubleQuoted() {
+      let char = this.consume();
+
+      if (char === '"') {
+        this.transitionTo(TokenizerState.afterDoctypePublicIdentifier);
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        if (this.delegate.appendToDoctypePublicIdentifier) this.delegate.appendToDoctypePublicIdentifier(char);
+      }
+    },
+
+    doctypePublicIdentifierSingleQuoted() {
+      let char = this.consume();
+
+      if (char === "'") {
+        this.transitionTo(TokenizerState.afterDoctypePublicIdentifier);
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        if (this.delegate.appendToDoctypePublicIdentifier) this.delegate.appendToDoctypePublicIdentifier(char);
+      }
+    },
+
+    afterDoctypePublicIdentifier() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        this.transitionTo(TokenizerState.betweenDoctypePublicAndSystemIdentifiers);
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else if (char === '"') {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierDoubleQuoted);
+      } else if (char === "'") {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierSingleQuoted);
+      }
+    },
+
+    betweenDoctypePublicAndSystemIdentifiers() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else if (char === '"') {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierDoubleQuoted);
+      } else if (char === "'") {
+        this.transitionTo(TokenizerState.doctypeSystemIdentifierSingleQuoted);
+      }
+    },
+
+    doctypeSystemIdentifierDoubleQuoted() {
+      let char = this.consume();
+
+      if (char === '"') {
+        this.transitionTo(TokenizerState.afterDoctypeSystemIdentifier);
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        if (this.delegate.appendToDoctypeSystemIdentifier) this.delegate.appendToDoctypeSystemIdentifier(char);
+      }
+    },
+
+    doctypeSystemIdentifierSingleQuoted() {
+      let char = this.consume();
+
+      if (char === "'") {
+        this.transitionTo(TokenizerState.afterDoctypeSystemIdentifier);
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
+      } else {
+        if (this.delegate.appendToDoctypeSystemIdentifier) this.delegate.appendToDoctypeSystemIdentifier(char);
+      }
+    },
+
+    afterDoctypeSystemIdentifier() {
+      let char = this.consume();
+
+      if (isSpace(char)) {
+        return;
+      } else if (char === '>') {
+        if (this.delegate.endDoctype) this.delegate.endDoctype();
+        this.transitionTo(TokenizerState.beforeData);
       }
     },
 
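The new states mirror the WHATWG HTML tokenizer's DOCTYPE states, and every doctype-related delegate call is wrapped in an existence check so that delegates written before these hooks existed keep working unchanged. A minimal sketch of that guard pattern in isolation (`DoctypeHooks` and `emitDoctypeName` are illustrative names, not part of the library):

```ts
// Illustrative sketch of the optional-callback guard pattern used in the
// states above. Every hook is optional, so each call site checks for the
// method before invoking it; an older delegate simply never sees the events.
interface DoctypeHooks {
  beginDoctype?(): void;
  appendToDoctypeName?(char: string): void;
  endDoctype?(): void;
}

function emitDoctypeName(delegate: DoctypeHooks, name: string): void {
  if (delegate.beginDoctype) delegate.beginDoctype();
  for (const char of name) {
    // The tokenizer lowercases the doctype name character by character.
    if (delegate.appendToDoctypeName) delegate.appendToDoctypeName(char.toLowerCase());
  }
  if (delegate.endDoctype) delegate.endDoctype();
}

emitDoctypeName({}, 'HTML'); // no hooks: every call is skipped safely

let collected = '';
emitDoctypeName({ appendToDoctypeName: (c) => { collected += c; } }, 'HTML');
console.log(collected); // "html"
```

Because the hooks are optional, the tokenizer still advances through the doctype states even when a delegate ignores them; the events are simply dropped.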
diff --git a/src/tokenizer.ts b/src/tokenizer.ts
index b7b9c2d..46f124d 100644
--- a/src/tokenizer.ts
+++ b/src/tokenizer.ts
@@ -99,6 +99,41 @@ export default class Tokenizer implements TokenizerDelegate {
 
   // Data
 
+  beginDoctype() {
+    this.push({
+      type: TokenType.Doctype,
+      name: '',
+    });
+  }
+
+  appendToDoctypeName(char: string) {
+    this.current(TokenType.Doctype).name += char;
+  }
+
+  appendToDoctypePublicIdentifier(char: string) {
+    let doctype = this.current(TokenType.Doctype);
+
+    if (doctype.publicIdentifier === undefined) {
+      doctype.publicIdentifier = char;
+    } else {
+      doctype.publicIdentifier += char;
+    }
+  }
+
+  appendToDoctypeSystemIdentifier(char: string) {
+    let doctype = this.current(TokenType.Doctype);
+
+    if (doctype.systemIdentifier === undefined) {
+      doctype.systemIdentifier = char;
+    } else {
+      doctype.systemIdentifier += char;
+    }
+  }
+
+  endDoctype() {
+    this.addLocInfo();
+  }
+
   beginData() {
     this.push({
       type: TokenType.Chars,
diff --git a/src/types.ts b/src/types.ts
index c20ba74..13708a7 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -30,6 +30,12 @@ export interface TokenBase {
   loc?: Location;
 }
 
+export interface Doctype extends TokenBase {
+  name: string;
+  publicIdentifier?: string;
+  systemIdentifier?: string;
+}
+
 export interface StartTag extends TokenBase {
   tagName: string;
   attributes: Attribute[];
@@ -48,9 +54,10 @@ export interface Comment extends TokenBase {
   chars: string;
 }
 
-export type Token = StartTag | EndTag | Chars | Comment;
+export type Token = StartTag | EndTag | Chars | Comment | Doctype;
 
 export const enum TokenType {
+  Doctype = 'Doctype',
   StartTag = 'StartTag',
   EndTag = 'EndTag',
   Chars = 'Chars',
@@ -62,6 +69,7 @@ export interface TokenMap {
   EndTag: EndTag;
   Chars: Chars;
   Comment: Comment;
+  Doctype: Doctype;
 }
 
 export interface TokenizerDelegate {
   reset(): void;
   finishData(): void;
   tagOpen(): void;
 
+  // TODO: make these non-optional in preparation for the next major version release
+  beginDoctype?(): void;
+  appendToDoctypeName?(char: string): void;
+  appendToDoctypePublicIdentifier?(char: string): void;
+  appendToDoctypeSystemIdentifier?(char: string): void;
+  endDoctype?(): void;
+
   beginData(): void;
   appendToData(char: string): void;
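With the `Doctype` token type and the optional delegate methods in place, `publicIdentifier` and `systemIdentifier` are only created when the input actually contains them. A sketch of the token shapes these changes should produce, assuming the package's public `tokenize` export (expected values shown in comments, inferred from the tests below):

```ts
import { tokenize } from 'simple-html-tokenizer';

// A bare doctype yields only a name (lowercased by the tokenizer):
// [{ type: 'Doctype', name: 'html' }]
console.log(tokenize('<!DOCTYPE html>'));

// A PUBLIC doctype fills in the optional identifier fields:
// [{ type: 'Doctype', name: 'html',
//    publicIdentifier: '-//W3C//DTD HTML 4.01//EN',
//    systemIdentifier: 'http://www.w3.org/TR/html4/strict.dtd' }]
console.log(tokenize(
  '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">'
));
```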
diff --git a/tests/tokenizer-tests.ts b/tests/tokenizer-tests.ts
index cc6c5c8..58fe989 100644
--- a/tests/tokenizer-tests.ts
+++ b/tests/tokenizer-tests.ts
@@ -1,5 +1,9 @@
 import {
   tokenize,
+  EventedTokenizer,
+  TokenizerDelegate,
+  EntityParser,
+  Doctype,
   StartTag,
   EndTag,
   Comment,
@@ -11,6 +15,150 @@ import {
 
 QUnit.module('simple-html-tokenizer - tokenizer');
 
+QUnit.test('does not fail if delegate does not include doctype methods', function(assert) {
+  let steps: Array<string[]> = [];
+
+  class MissingDoctypeTokenizerDelegate implements TokenizerDelegate {
+    reset() {
+      steps.push(['reset']);
+    }
+    finishData() {
+      steps.push(['finishData']);
+    }
+    tagOpen() {
+      steps.push(['tagOpen']);
+    }
+
+    beginData() {
+      steps.push(['beginData']);
+    }
+
+    appendToData(char: string) {
+      steps.push(['appendToData', char]);
+    }
+
+    beginStartTag() {
+      steps.push(['beginStartTag']);
+    }
+    appendToTagName(char: string) {
+      steps.push(['appendToTagName', char]);
+    }
+
+    beginAttribute() {
+      steps.push(['beginAttribute']);
+    }
+    appendToAttributeName(char: string) {
+      steps.push(['appendToAttributeName', char]);
+    }
+    beginAttributeValue(quoted: boolean) {
+      steps.push(['beginAttributeValue', `${quoted}`]);
+    }
+
+    appendToAttributeValue(char: string) {
+      steps.push(['appendToAttributeValue', char]);
+    }
+    finishAttributeValue() {
+      steps.push(['finishAttributeValue']);
+    }
+
+    markTagAsSelfClosing() {
+      steps.push(['markTagAsSelfClosing']);
+    }
+
+    beginEndTag() {
+      steps.push(['beginEndTag']);
+    }
+    finishTag() {
+      steps.push(['finishTag']);
+    }
+
+    beginComment() {
+      steps.push(['beginComment']);
+    }
+    appendToCommentData(char: string) {
+      steps.push(['appendToCommentData', char]);
+    }
+    finishComment() {
+      steps.push(['finishComment']);
+    }
+
+    reportSyntaxError(error: string) {
+      steps.push(['reportSyntaxError', error]);
+    }
+  }
+
+  let delegate = new MissingDoctypeTokenizerDelegate();
+  let tokenizer = new EventedTokenizer(delegate, new EntityParser({}));
+
+  tokenizer.tokenize('\n<!-- comment here --><!DOCTYPE html>\n<!-- comment here -->');
+
+  assert.deepEqual(steps, [
+    [ "reset" ],
+    [ "reset" ],
+    [ "beginData" ],
+    [ "appendToData", "\n" ],
+    [ "finishData" ],
+    [ "tagOpen" ],
+    [ "beginComment" ],
+    [ "appendToCommentData", " " ],
+    [ "appendToCommentData", "c" ],
+    [ "appendToCommentData", "o" ],
+    [ "appendToCommentData", "m" ],
+    [ "appendToCommentData", "m" ],
+    [ "appendToCommentData", "e" ],
+    [ "appendToCommentData", "n" ],
+    [ "appendToCommentData", "t" ],
+    [ "appendToCommentData", " " ],
+    [ "appendToCommentData", "h" ],
+    [ "appendToCommentData", "e" ],
+    [ "appendToCommentData", "r" ],
+    [ "appendToCommentData", "e" ],
+    [ "appendToCommentData", " " ],
+    [ "finishComment" ],
+    [ "tagOpen" ],
+    [ "beginData" ],
+    [ "appendToData", "\n" ],
+    [ "finishData" ],
+    [ "tagOpen" ],
+    [ "beginComment" ],
+    [ "appendToCommentData", " " ],
+    [ "appendToCommentData", "c" ],
+    [ "appendToCommentData", "o" ],
+    [ "appendToCommentData", "m" ],
+    [ "appendToCommentData", "m" ],
+    [ "appendToCommentData", "e" ],
+    [ "appendToCommentData", "n" ],
+    [ "appendToCommentData", "t" ],
+    [ "appendToCommentData", " " ],
+    [ "appendToCommentData", "h" ],
+    [ "appendToCommentData", "e" ],
+    [ "appendToCommentData", "r" ],
+    [ "appendToCommentData", "e" ],
+    [ "appendToCommentData", " " ],
+    [ "finishComment" ]
+  ]);
+});
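The test above pins down the backwards-compatibility story: a delegate with no doctype hooks still sees the surrounding data and comment events, and the doctype itself is skipped silently. For delegates that do want the new events, the hooks compose in the obvious way; a hypothetical delegate fragment that reassembles the doctype text (the `DoctypeRecorder` class is illustrative, only the method names come from the delegate interface):

```ts
// Illustrative only: a fragment of a delegate that records each doctype
// it sees as a serialized string.
class DoctypeRecorder {
  private name = '';
  private publicId = '';
  private systemId = '';
  doctypes: string[] = [];

  beginDoctype() {
    this.name = this.publicId = this.systemId = '';
  }
  appendToDoctypeName(char: string) {
    this.name += char;
  }
  appendToDoctypePublicIdentifier(char: string) {
    this.publicId += char;
  }
  appendToDoctypeSystemIdentifier(char: string) {
    this.systemId += char;
  }
  endDoctype() {
    let text = `<!DOCTYPE ${this.name}`;
    if (this.publicId) text += ` PUBLIC "${this.publicId}"`;
    // A system identifier without a public one needs the SYSTEM keyword.
    if (this.systemId) {
      text += this.publicId ? ` "${this.systemId}"` : ` SYSTEM "${this.systemId}"`;
    }
    this.doctypes.push(text + '>');
  }
}
```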
"appendToCommentData", "e" ], + [ "appendToCommentData", " " ], + [ "finishComment" ], + [ "tagOpen" ], + [ "beginData" ], + [ "appendToData", "\n" ], + [ "finishData" ], + [ "tagOpen" ], + [ "beginComment" ], + [ "appendToCommentData", " " ], + [ "appendToCommentData", "c" ], + [ "appendToCommentData", "o" ], + [ "appendToCommentData", "m" ], + [ "appendToCommentData", "m" ], + [ "appendToCommentData", "e" ], + [ "appendToCommentData", "n" ], + [ "appendToCommentData", "t" ], + [ "appendToCommentData", " " ], + [ "appendToCommentData", "h" ], + [ "appendToCommentData", "e" ], + [ "appendToCommentData", "r" ], + [ "appendToCommentData", "e" ], + [ "appendToCommentData", " " ], + [ "finishComment" ] + ]); +}); + +QUnit.test('Doctype', function(assert) { + let tokens = tokenize(''); + assert.deepEqual(tokens, [ doctype('-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd') ], 'Standard HTML 4.01 Strict doctype'); + + tokens = tokenize(''); + assert.deepEqual(tokens, [ + doctype(), + startTag('html'), + startTag('body'), + endTag('body'), + endTag('html'), + ], 'DOCTYPE is included in tokens'); + + tokens = tokenize(''); + assert.deepEqual(tokens, [comment(' comment '), doctype()], 'DOCTYPE after comments is valid'); + + tokens = tokenize(''); + assert.deepEqual(tokens, [comment(' comment '), doctype()], 'DOCTYPE after comments is valid'); +}); + QUnit.test('Simple content', function(assert) { let tokens = tokenize('hello'); assert.deepEqual(tokens, [chars('hello')]); @@ -289,6 +437,25 @@ QUnit.test('An Emberish named arg invocation', function(assert) { assert.deepEqual(tokens, [startTag('@foo'), endTag('@foo')]); }); +QUnit.test('Parsing `; + + let tokens = tokenize(input); + assert.deepEqual(tokens, [ + doctype(), + startTag('html'), + startTag('head'), + startTag('script', [['src','/foo.js', true]]), + endTag('script'), + startTag('script', [['src','/bar.js', true]]), + endTag('script'), + startTag('script', [['src','/baz.js', true]]), + endTag('script'), + endTag('head'), + endTag('html'), + ]); +}); + QUnit.module('simple-html-tokenizer - preprocessing'); QUnit.test('Carriage returns are replaced with line feeds', function(assert) { @@ -392,6 +559,23 @@ function endTag(tagName: string): EndTag { }; } +function doctype(publicIdentifier?: string, systemIdentifier?: string): Doctype { + let doctype: Doctype = { + type: TokenType.Doctype, + name: 'html', + }; + + if (publicIdentifier) { + doctype.publicIdentifier = publicIdentifier; + } + + if (systemIdentifier) { + doctype.systemIdentifier = systemIdentifier; + } + + return doctype; +} + function locInfo( token: Token, startLine: number,