feat: 添加字库到映射表生成器

pengzhanbo · Mar 23, 2024 · f99a03b · f99a03b
1 parent 105c29d
commit f99a03b
Showing 1 changed file with 182 additions and 0 deletions.
diff --git a/scripts/generate.ts b/scripts/generate.ts
@@ -0,0 +1,182 @@
+/**
+ * 将原始数据转换为 `data` 目录下的文件
+ */
+
+import path from 'node:path'
+import fs from 'node:fs/promises'
+
+interface TransformData {
+  [key: string]: string[]
+}
+
+const from = path.resolve('data')
+const output = path.resolve('src/data')
+
+const files = {
+  /**
+   * 17k+ 的 繁 <-> 简 映射
+   * https://github.com/fighting41love/funNLP
+   */
+  full: 'full_t2s.txt',
+
+  /**
+   * 常用字 的 繁 <-> 简 映射
+   * 存在两份表是为了互为对照
+   */
+  s2t_c: 's2t_characters.txt',
+  t2s_c: 't2s_characters.txt',
+
+  /**
+   * 特殊短语 的 繁 <-> 简 映射
+   */
+  s2t_p: 's2t_phrases.txt',
+  t2s_p: 't2s_phrases.txt',
+
+  /**
+   * 1.x 版本中使用的映射表
+   */
+  old_words: 's2t-1.x.txt',
+
+  /**
+   * 最终产出文件
+   */
+  // 字符映射表
+  full_words: 'words.ts',
+  // 繁 -> 简 特殊短语 映射表
+  t_phrases: 'traditional-phrases.ts',
+  // 简 -> 繁 特殊短语 映射表
+  s_phrases: 'simplified-phrases.ts',
+}
+
+async function read(filename: string) {
+  const file = path.resolve(from, filename)
+  const content = await fs.readFile(file, 'utf-8')
+  return content
+}
+
+async function write(filename: string, content: any, after = '') {
+  const file = path.resolve(output, filename)
+  await fs.mkdir(path.dirname(file), { recursive: true })
+  await fs.writeFile(
+    file,
+    `export default ${JSON.stringify(content)}${after}`,
+    'utf8',
+  )
+}
+
+function transform(content: string, reverse = false, same = false): TransformData {
+  const lines = content.replace(/[\s\n]+$/, '').split(/\n+/)
+  const data: TransformData = {}
+  for (const line of lines) {
+    if (!line)
+      continue
+
+    const [key, ...values] = line.trim().split(/\s+/)
+    if (!key || key === '□')
+      continue
+
+    if (reverse) {
+      for (const value of values) {
+        if ((key === value && !same) || value === '□')
+          continue
+
+        data[value] ??= []
+        data[value].push(key)
+      }
+    }
+    else {
+      const res = values.filter(v => same ? true : v !== key)
+      if (res.length) {
+        data[key] ??= []
+        data[key].push(...res)
+      }
+    }
+  }
+  return data
+}
+
+function transformOld(oldWords: string): TransformData {
+  const data: TransformData = {}
+  const words = oldWords.replace(/[\s\n]+$/g, '').split(' ')
+  for (const word of words) {
+    const [key, ...values] = word.split('')
+    data[key] = values
+  }
+
+  return data
+}
+
+function merge(from: TransformData, to: TransformData) {
+  for (const [key, value] of Object.entries(from)) {
+    if (key in to)
+      to[key] = uniq([...to[key], ...value])
+    else
+      to[key] = value
+  }
+}
+
+function uniq(arr: string[]) {
+  return Array.from(new Set(arr))
+}
+
+function resolvePhrases(data: TransformData): [string, string] {
+  let phrases: [string, string][] = []
+
+  for (const [key, values] of Object.entries(data))
+    phrases.push([key, values[0]])
+
+  phrases = phrases.sort((a, b) => {
+    if (a[0].length === b[0].length)
+      return 0
+
+    return a[0].length < b[0].length ? 1 : -1
+  })
+
+  const sources: string[] = []
+  const targets: string[] = []
+
+  for (const [source, target] of phrases) {
+    sources.push(source)
+    targets.push(source === target ? '_' : target)
+  }
+
+  return [sources.join(' '), targets.join(' ')]
+}
+
+async function generate() {
+  const s2t = transform(await read(files.s2t_c))
+  const t2s = transform(await read(files.t2s_c), true)
+  const full = transform(await read(files.full), true)
+  const sp = transform(await read(files.s2t_p), false, true)
+  const tp = transform(await read(files.t2s_p), false, true)
+  const oldData = transformOld(await read(files.old_words))
+
+  const full_content: string[] = ['‘『', '’』', '“「', '”」', '″〞', '〓═']
+
+  // 对照 繁 -> 简 表，合并到 简 -> 繁 表
+  merge(t2s, s2t)
+
+  // 对照 full 表，合并到 简 -> 繁 表
+  for (const [key, values] of Object.entries(full)) {
+    if (!(key in s2t))
+      s2t[key] = values
+  }
+
+  // 合并旧数据
+  merge(oldData, s2t)
+
+  // 序列化内容
+  for (const [key, values] of Object.entries(s2t))
+    full_content.push(`${key}${values.join('')}`)
+
+  await write(files.full_words, `${full_content.join(' ')}`)
+  await write(files.t_phrases, resolvePhrases(tp), ' as readonly [string, string]')
+  await write(files.s_phrases, resolvePhrases(sp), ' as readonly [string, string]')
+}
+
+try {
+  await generate()
+}
+catch (e) {
+  console.error(e)
+}