Skip to content

Commit

Permalink
feat: introduce GrammarState (#712)
Browse files Browse the repository at this point in the history
  • Loading branch information
antfu committed Jun 28, 2024
1 parent 603713d commit 8a8faf9
Show file tree
Hide file tree
Showing 9 changed files with 418 additions and 16 deletions.
81 changes: 75 additions & 6 deletions packages/core/src/code-to-tokens-base.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
/* ---------------------------------------------------------
* Copyright (C) Microsoft Corporation. All rights reserved.
*-------------------------------------------------------- */
import type { IGrammar, IRawThemeSetting } from './textmate'
import type { IGrammar, IRawThemeSetting, StateStack } from './textmate'
import { INITIAL } from './textmate'
import type { CodeToTokensBaseOptions, FontStyle, ShikiInternal, ThemeRegistrationResolved, ThemedToken, ThemedTokenScopeExplanation, TokenizeWithThemeOptions } from './types'
import { StackElementMetadata } from './stack-element-metadata'
import { applyColorReplacements, isNoneTheme, isPlainLang, resolveColorReplacements, splitLines } from './utils'
import { tokenizeAnsiWithTheme } from './code-to-tokens-ansi'
import { ShikiError } from './error'
import { GrammarState, getGrammarStack } from './grammar-state'

/**
* Code to tokens, with a simple theme.
Expand All @@ -30,9 +32,45 @@ export function codeToTokensBase(
return tokenizeAnsiWithTheme(theme, code, options)

const _grammar = internal.getLanguage(lang)

if (options.grammarState) {
if (options.grammarState.lang !== _grammar.name) {
throw new ShikiError(`Grammar state language "${options.grammarState.lang}" does not match highlight language "${_grammar.name}"`)
}
if (options.grammarState.theme !== themeName) {
throw new ShikiError(`Grammar state theme "${options.grammarState.theme}" does not match highlight theme "${themeName}"`)
}
}

return tokenizeWithTheme(code, _grammar, theme, colorMap, options)
}

/**
 * Tokenize `code` and return the grammar state reached after its last line.
 *
 * @param internal - Shiki internal context used to resolve the theme and the language grammar.
 * @param code - Source text to tokenize.
 * @param options - Tokenization options; `lang` falls back to `'text'` and `theme` to the first loaded theme.
 * @returns A `GrammarState` holding the final state stack together with the grammar name and theme name.
 * @throws ShikiError if the language is a plain language, the theme is the none theme, or the language is `'ansi'`.
 */
export function getLastGrammarState(
  internal: ShikiInternal,
  code: string,
  options: CodeToTokensBaseOptions = {},
): GrammarState {
  const {
    lang: requestedLang = 'text',
    theme: requestedTheme = internal.getLoadedThemes()[0],
  } = options

  // Plain-language and none-theme requests are rejected with the same message.
  if (isPlainLang(requestedLang) || isNoneTheme(requestedTheme)) {
    throw new ShikiError('Plain language does not have grammar state')
  }
  if (requestedLang === 'ansi') {
    throw new ShikiError('ANSI language does not have grammar state')
  }

  const { theme: resolvedTheme, colorMap: palette } = internal.setTheme(requestedTheme)
  const grammar = internal.getLanguage(requestedLang)

  // Only the final state stack is needed here; the produced tokens are discarded.
  const { stateStack } = _tokenizeWithTheme(code, grammar, resolvedTheme, palette, options)

  return new GrammarState(stateStack, grammar.name, resolvedTheme.name)
}

/** for explanations */
interface ThemeSettingsSelectors {
settings: IRawThemeSetting
Expand All @@ -46,6 +84,19 @@ export function tokenizeWithTheme(
colorMap: string[],
options: TokenizeWithThemeOptions,
): ThemedToken[][] {
return _tokenizeWithTheme(code, grammar, theme, colorMap, options).tokens
}

function _tokenizeWithTheme(
code: string,
grammar: IGrammar,
theme: ThemeRegistrationResolved,
colorMap: string[],
options: TokenizeWithThemeOptions,
): {
tokens: ThemedToken[][]
stateStack: StateStack
} {
const colorReplacements = resolveColorReplacements(theme, options)

const {
Expand All @@ -55,7 +106,22 @@ export function tokenizeWithTheme(

const lines = splitLines(code)

let ruleStack = INITIAL
let stateStack = options.grammarState
? getGrammarStack(options.grammarState)
: options.grammarContextCode != null
? _tokenizeWithTheme(
options.grammarContextCode,
grammar,
theme,
colorMap,
{
...options,
grammarState: undefined,
grammarContextCode: undefined,
},
).stateStack
: INITIAL

let actual: ThemedToken[] = []
const final: ThemedToken[][] = []

Expand Down Expand Up @@ -106,12 +172,12 @@ export function tokenizeWithTheme(
let tokensWithScopesIndex

if (options.includeExplanation) {
resultWithScopes = grammar.tokenizeLine(line, ruleStack)
resultWithScopes = grammar.tokenizeLine(line, stateStack)
tokensWithScopes = resultWithScopes.tokens
tokensWithScopesIndex = 0
}

const result = grammar.tokenizeLine2(line, ruleStack, tokenizeTimeLimit)
const result = grammar.tokenizeLine2(line, stateStack, tokenizeTimeLimit)

const tokensLength = result.tokens.length / 2
for (let j = 0; j < tokensLength; j++) {
Expand Down Expand Up @@ -158,10 +224,13 @@ export function tokenizeWithTheme(
}
final.push(actual)
actual = []
ruleStack = result.ruleStack
stateStack = result.ruleStack
}

return final
return {
tokens: final,
stateStack,
}
}

function explainThemeScopes(
Expand Down
54 changes: 54 additions & 0 deletions packages/core/src/grammar-state.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import type { StateStackImpl } from '../vendor/vscode-textmate/src/grammar'
import { ShikiError } from './error'
import type { StateStack } from './textmate'

/**
 * Reference object that wraps the state of a grammar.
 *
 * It allows a code snippet to be highlighted as a part of a larger piece of
 * code in the target language.
 */
export class GrammarState {
  /**
   * @param _stack - State stack wrapped by this object.
   * @param lang - Language name this state is associated with.
   * @param theme - Theme name this state is associated with.
   */
  constructor(
    private _stack: StateStack,
    public lang: string,
    public theme: string,
  ) {}

  /** Scope names collected from the wrapped stack via `getScopes`. */
  get scopes() {
    const impl = this._stack as StateStackImpl
    return getScopes(impl)
  }

  /** Plain-object representation with `lang`, `theme` and `scopes`; the stack itself is not included. */
  toJSON() {
    const { lang, theme, scopes } = this
    return { lang, theme, scopes }
  }
}

/**
 * Collect the scope names found along a state stack, starting at `stack`
 * and following the `parent` links.
 *
 * Frames without a scope name contribute nothing. Each frame is visited at
 * most once, so a cyclic parent chain terminates. The walk is iterative, so
 * the depth of the chain is not limited by the call stack.
 *
 * @param stack - Frame to start from; a nullish value yields an empty list.
 * @returns Scope names in visiting order (starting frame first).
 */
function getScopes(stack: StateStackImpl) {
  const scopes: string[] = []
  const visited = new Set<StateStackImpl>()

  let frame: StateStackImpl | null | undefined = stack
  // Stop at the end of the chain or as soon as a frame repeats.
  while (frame && !visited.has(frame)) {
    visited.add(frame)
    const name = frame.nameScopesList?.scopeName
    if (name)
      scopes.push(name)
    frame = frame.parent
  }

  return scopes
}

/**
 * Return the state stack wrapped by a `GrammarState`.
 *
 * @param state - Value expected to be a `GrammarState` instance.
 * @returns The private `_stack` held by `state`.
 * @throws ShikiError if `state` is not an instance of `GrammarState`.
 */
export function getGrammarStack(state: GrammarState) {
  if (state instanceof GrammarState) {
    // @ts-expect-error _stack is private
    return state._stack
  }
  throw new ShikiError('Invalid grammar state')
}
3 changes: 2 additions & 1 deletion packages/core/src/highlighter.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { codeToHast } from './code-to-hast'
import { codeToHtml } from './code-to-html'
import { codeToTokens } from './code-to-tokens'
import { codeToTokensBase } from './code-to-tokens-base'
import { codeToTokensBase, getLastGrammarState } from './code-to-tokens-base'
import { codeToTokensWithThemes } from './code-to-tokens-themes'
import { createShikiInternal } from './internal'
import type { HighlighterCore, HighlighterCoreOptions } from './types'
Expand All @@ -16,6 +16,7 @@ export async function createHighlighterCore(options: HighlighterCoreOptions = {}
const internal = await createShikiInternal(options)

return {
getLastGrammarState: (code, options) => getLastGrammarState(internal, code, options),
codeToTokensBase: (code, options) => codeToTokensBase(internal, code, options),
codeToTokensWithThemes: (code, options) => codeToTokensWithThemes(internal, code, options),
codeToTokens: (code, options) => codeToTokens(internal, code, options),
Expand Down
11 changes: 6 additions & 5 deletions packages/core/src/registry.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import type { IGrammar, IGrammarConfiguration, IRawTheme } from './textmate'
import type { IGrammarConfiguration, IRawTheme } from './textmate'
import { Registry as TextMateRegistry, Theme as TextMateTheme } from './textmate'
import type { LanguageRegistration, ThemeRegistrationAny, ThemeRegistrationResolved } from './types'
import type { Grammar, LanguageRegistration, ThemeRegistrationAny, ThemeRegistrationResolved } from './types'
import type { Resolver } from './resolver'
import { normalizeTheme } from './normalize'
import { ShikiError } from './error'

export class Registry extends TextMateRegistry {
private _resolvedThemes: Map<string, ThemeRegistrationResolved> = new Map()
private _resolvedGrammars: Map<string, IGrammar> = new Map()
private _resolvedGrammars: Map<string, Grammar> = new Map()
private _langMap: Map<string, LanguageRegistration> = new Map()
private _langGraph: Map<string, LanguageRegistration> = new Map()

Expand Down Expand Up @@ -97,8 +97,9 @@ export class Registry extends TextMateRegistry {

// @ts-expect-error Private members, set this to override the previous grammar (that can be a stub)
this._syncRegistry._rawGrammars.set(lang.scopeName, lang)
const g = await this.loadGrammarWithConfiguration(lang.scopeName, 1, grammarConfig)
this._resolvedGrammars.set(lang.name, g!)
const g = await this.loadGrammarWithConfiguration(lang.scopeName, 1, grammarConfig) as Grammar
g.name = lang.name
this._resolvedGrammars.set(lang.name, g)
if (lang.aliases) {
lang.aliases.forEach((alias) => {
this._alias[alias] = lang.name
Expand Down
10 changes: 9 additions & 1 deletion packages/core/src/types/highlighter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import type { Root } from 'hast'
import type { Grammar } from './textmate'
import type { LanguageInput, LanguageRegistration, ResolveBundleKey, SpecialLanguage } from './langs'
import type { SpecialTheme, ThemeInput, ThemeRegistrationAny, ThemeRegistrationResolved } from './themes'
import type { CodeToTokensBaseOptions, CodeToTokensOptions, CodeToTokensWithThemesOptions, ThemedToken, ThemedTokenWithVariants, TokensResult } from './tokens'
import type { CodeToTokensBaseOptions, CodeToTokensOptions, CodeToTokensWithThemesOptions, GrammarState, ThemedToken, ThemedTokenWithVariants, TokensResult } from './tokens'
import type { CodeToHastOptions } from './options'

/**
Expand Down Expand Up @@ -104,6 +104,14 @@ export interface HighlighterGeneric<BundledLangKeys extends string, BundledTheme
code: string,
options: CodeToTokensWithThemesOptions<ResolveBundleKey<BundledLangKeys>, ResolveBundleKey<BundledThemeKeys>>
) => ThemedTokenWithVariants[][]
/**
* Get the last grammar state of a code snippet.
* You can pass the grammar state to `codeToTokens` as `grammarState` to continue tokenizing from an intermediate state.
*/
getLastGrammarState: (
code: string,
options: CodeToTokensBaseOptions<ResolveBundleKey<BundledLangKeys>, ResolveBundleKey<BundledThemeKeys>>
) => GrammarState

/**
* Get internal context object
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/types/options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ export interface CodeToHastOptionsCommon<Languages extends string = string>
extends
TransformerOptions,
DecorationOptions,
Pick<TokenizeWithThemeOptions, 'colorReplacements' | 'tokenizeMaxLineLength' | 'tokenizeTimeLimit'> {
Pick<TokenizeWithThemeOptions, 'colorReplacements' | 'tokenizeMaxLineLength' | 'tokenizeTimeLimit' | 'grammarState' | 'grammarContextCode'> {

lang: StringLiteralUnion<Languages | SpecialLanguage>

Expand Down
7 changes: 5 additions & 2 deletions packages/core/src/types/textmate.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import type {
IGrammar as Grammar,
IGrammar,
IRawGrammar as RawGrammar,
IRawTheme as RawTheme,
IRawThemeSetting as RawThemeSetting,
} from '../textmate'

export type {
Grammar,
RawGrammar,
RawTheme,
RawThemeSetting,
}

/**
 * An `IGrammar` extended with a `name` field.
 */
export interface Grammar extends IGrammar {
/** Name attached to the grammar. NOTE(review): presumably the language registration name — confirm against the registry. */
name: string
}
18 changes: 18 additions & 0 deletions packages/core/src/types/tokens.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import type { GrammarState } from '../grammar-state'
import type { SpecialLanguage } from './langs'
import type { SpecialTheme, ThemeRegistrationAny } from './themes'
import type { CodeOptionsThemes } from './options'

export type { GrammarState }

export interface CodeToTokensBaseOptions<Languages extends string = string, Themes extends string = string> extends TokenizeWithThemeOptions {
lang?: Languages | SpecialLanguage
theme?: Themes | ThemeRegistrationAny | SpecialTheme
Expand Down Expand Up @@ -172,6 +175,21 @@ export interface TokenizeWithThemeOptions {
* @default 500 (0.5s)
*/
tokenizeTimeLimit?: number

/**
* Represents the state of the grammar, allowing tokenization to continue from an intermediate grammar state.
*
* You can get the grammar state from `getLastGrammarState`.
*/
grammarState?: GrammarState

/**
* The code context of the grammar.
* Consider it code prepended to the input code that only participates in grammar inference and is not present in the final output.
*
* This will be ignored if `grammarState` is provided.
*/
grammarContextCode?: string
}

/**
Expand Down
Loading

0 comments on commit 8a8faf9

Please sign in to comment.