feat: initial commit
Symlink
+1
@@ -0,0 +1 @@
../../LICENSE
@@ -0,0 +1,72 @@
# `@maigolabs/needle`

Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.

See also the [in-browser demo](https://needle.maigo.dev).

## Install

Dictionaries are installed as dependencies of the package, but if you don't use the indexer, they can be tree-shaken when bundling.

```bash
pnpm install @maigolabs/needle
```

## Usage

### Indexing

NeedLe uses Kuromoji for Japanese tokenization, which loads dictionaries dynamically. You need to create a Kuromoji `TokenizerBuilder` first.

In Node.js you can just load the dictionary from the file system:

```ts
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
const kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
```

In the browser you need to provide a custom loader that fetches the dictionary files with `fetch()`:

```ts
import { TokenizerBuilder } from '@patdx/kuromoji';

// You can load dict files from a CDN (see also the README of https://github.com/patdx/kuromoji.js)
const kuromoji = await new TokenizerBuilder({
  loader: {
    loadArrayBuffer: async (url: string) => {
      url = `https://cdn.jsdelivr.net/npm/@aiktb/kuromoji@1.0.2/dict/${url.replace('.gz', '')}`;
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Failed to fetch ${url}`);
      return await res.arrayBuffer();
    },
  },
}).build();
```

After creating the Kuromoji instance, you can build the inverted index:

```ts
import { buildInvertedIndex } from '@maigolabs/needle/indexer';

const documents = ['你好世界', 'こんにちは'];
const compressedIndex = buildInvertedIndex(documents, { kuromoji });

// The built index can be stored for later use.
const json = JSON.stringify(compressedIndex);
```

### Searching

If you only import the searcher in your frontend code, the indexer and dictionary-related dependencies will be tree-shaken.

```ts
import { loadInvertedIndex, searchInvertedIndex } from '@maigolabs/needle/searcher';

const loadedIndex = loadInvertedIndex(compressedIndex);
const results = searchInvertedIndex(loadedIndex, 'sekai');
for (const result of results) console.log(`${result.documentText} (${(result.matchRatio * 100).toFixed(0)}%)`);
// → 你好世界 (50%)
```

To highlight the matched parts of a search result, see also `highlightSearchResult`.
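A minimal sketch of highlighting, assuming the `results` from the example above: `highlightSearchResult` takes a single search result and returns an array of parts, where plain strings are unmatched text and `{ highlight }` objects are matched spans.

```ts
import { highlightSearchResult } from '@maigolabs/needle/searcher';

const parts = highlightSearchResult(results[0]!);
// Bracket matched spans here for illustration; a real UI would wrap them in e.g. <mark>.
const rendered = parts.map(part => (typeof part === 'string' ? part : `[${part.highlight}]`)).join('');
console.log(rendered); // e.g. 你好[世界]
```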
@@ -0,0 +1,18 @@
import type { Config } from 'jest';

const config: Config = {
  preset: 'ts-jest/presets/default-esm',
  testEnvironment: 'node',
  extensionsToTreatAsEsm: ['.ts'],
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  transform: {
    '^.+\\.tsx?$': ['ts-jest', { useESM: true }],
  },
  testMatch: ['**/*.test.ts'],
  testTimeout: 30000,
};

export default config;
@@ -0,0 +1,84 @@
{
  "name": "@maigolabs/needle",
  "version": "1.0.1",
  "description": "Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.",
  "type": "module",
  "main": "./src/index.ts",
  "scripts": {
    "build": "tsdown",
    "typecheck": "tsc",
    "test": "cross-env NODE_OPTIONS=--experimental-vm-modules jest",
    "prepare": "pnpm run build"
  },
  "license": "AGPL-3.0",
  "homepage": "https://needle.maigo.dev",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/MaigoLabs/needLe.git",
    "directory": "packages/needle"
  },
  "bugs": "https://github.com/MaigoLabs/needLe/issues",
  "keywords": [
    "needle",
    "search",
    "fuzzy",
    "cjk",
    "chinese",
    "japanese",
    "pinyin",
    "romaji"
  ],
  "author": "Menci <mencici@msn.com>",
  "sideEffects": false,
  "exports": {
    ".": "./src/index.ts",
    "./common": "./src/common/index.ts",
    "./indexer": "./src/indexer/index.ts",
    "./searcher": "./src/searcher/index.ts",
    "./package.json": "./package.json"
  },
  "packageManager": "pnpm@10.20.0",
  "dependencies": {
    "@patdx/kuromoji": "^1.0.4",
    "hepburn": "^1.2.2",
    "opencc-js": "^1.0.5",
    "pinyin-pro": "^3.27.0"
  },
  "devDependencies": {
    "@types/hepburn": "^1.2.2",
    "@types/jest": "^30.0.0",
    "@types/opencc-js": "^1.0.3",
    "jest": "^30.2.0",
    "ts-jest": "^29.4.6"
  },
  "files": [
    "README.md",
    "dist",
    "package.json"
  ],
  "publishConfig": {
    "access": "public",
    "main": "./dist/index.mjs",
    "module": "./dist/index.mjs",
    "types": "./dist/index.d.mts",
    "exports": {
      ".": {
        "types": "./dist/index.d.mts",
        "default": "./dist/index.mjs"
      },
      "./common": {
        "types": "./dist/common/index.d.mts",
        "default": "./dist/common/index.mjs"
      },
      "./indexer": {
        "types": "./dist/indexer/index.d.mts",
        "default": "./dist/indexer/index.mjs"
      },
      "./searcher": {
        "types": "./dist/searcher/index.d.mts",
        "default": "./dist/searcher/index.mjs"
      },
      "./package.json": "./package.json"
    }
  }
}
@@ -0,0 +1,4 @@
export * from './types';
export * from './utils';
export * from './normalize';
export * from './trie';
@@ -0,0 +1,60 @@
import { normalizeByCodePoint, toKatakana } from './normalize';

describe('toKatakana', () => {
  it('should convert hiragana to katakana', () => {
    expect(toKatakana('あいうえお')).toBe('アイウエオ');
    expect(toKatakana('かきくけこ')).toBe('カキクケコ');
    expect(toKatakana('さしすせそ')).toBe('サシスセソ');
  });

  it('should keep katakana unchanged', () => {
    expect(toKatakana('アイウエオ')).toBe('アイウエオ');
  });

  it('should keep non-kana characters unchanged', () => {
    expect(toKatakana('abc123')).toBe('abc123');
    expect(toKatakana('漢字')).toBe('漢字');
  });

  it('should handle mixed input', () => {
    expect(toKatakana('あアa漢')).toBe('アアa漢');
  });
});

describe('normalizeByCodePoint', () => {
  it('should convert fullwidth ASCII to halfwidth lowercase', () => {
    expect(normalizeByCodePoint('ＡＢＣ')).toBe('abc');
    expect(normalizeByCodePoint('１２３')).toBe('123');
    expect(normalizeByCodePoint('！＠＃')).toBe('!@#');
  });

  it('should convert fullwidth space to halfwidth space', () => {
    expect(normalizeByCodePoint('　')).toBe(' ');
  });

  it('should convert halfwidth kana to fullwidth kana', () => {
    expect(normalizeByCodePoint('ｱｲｳｴｵ')).toBe('アイウエオ');
    expect(normalizeByCodePoint('ｶｷｸｹｺ')).toBe('カキクケコ');
  });

  it('should normalize voiced/semi-voiced sound marks', () => {
    expect(normalizeByCodePoint('ﾞ')).toBe('\u3099'); // halfwidth voiced -> combining
    expect(normalizeByCodePoint('ﾟ')).toBe('\u309A'); // halfwidth semi-voiced -> combining
    expect(normalizeByCodePoint('゛')).toBe('\u3099'); // fullwidth voiced -> combining
    expect(normalizeByCodePoint('゜')).toBe('\u309A'); // fullwidth semi-voiced -> combining
  });

  it('should convert halfwidth punctuation to fullwidth', () => {
    expect(normalizeByCodePoint('｡')).toBe('。');
    expect(normalizeByCodePoint('｢')).toBe('「');
    expect(normalizeByCodePoint('｣')).toBe('」');
    expect(normalizeByCodePoint('､')).toBe('、');
    expect(normalizeByCodePoint('･')).toBe('・');
  });

  it('should lowercase regular ASCII', () => {
    expect(normalizeByCodePoint('ABC')).toBe('abc');
  });

  // Should keep hiragana unchanged
});
@@ -0,0 +1,42 @@
export const normalizeByCodePoint = (string: string) => [...string].map(normalizeCodePoint).join('');

export const normalizeCodePoint = (char: string) => {
  const codePoint = char.codePointAt(0)!;
  // Fullwidth ASCII -> Halfwidth ASCII
  if (codePoint >= 0xFF01 && codePoint <= 0xFF5E) return String.fromCodePoint(codePoint - 0xFEE0).toLowerCase();
  // Fullwidth space -> Halfwidth space
  else if (codePoint === /* '　' */ 0x3000) return ' ';
  // Halfwidth kana (U+FF66 - U+FF9D) -> Fullwidth kana
  else if (codePoint >= 0xFF66 && codePoint <= 0xFF9D) return HALF_TO_FULL_KANA[char] ?? char;
  else if (codePoint === /* '｡' */ 0xFF61) return '。';
  else if (codePoint === /* '｢' */ 0xFF62) return '「';
  else if (codePoint === /* '｣' */ 0xFF63) return '」';
  else if (codePoint === /* '､' */ 0xFF64) return '、';
  else if (codePoint === /* '･' */ 0xFF65) return '・';
  else if (codePoint === /* 'ﾞ' */ 0xFF9E || codePoint === /* '゛' */ 0x309B) return '\u3099'; // -> COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
  else if (codePoint === /* 'ﾟ' */ 0xFF9F || codePoint === /* '゜' */ 0x309C) return '\u309A'; // -> COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  else return char.toLowerCase();
};

const HALF_TO_FULL_KANA: Record<string, string> = {
  'ｦ': 'ヲ', 'ｧ': 'ァ', 'ｨ': 'ィ', 'ｩ': 'ゥ', 'ｪ': 'ェ', 'ｫ': 'ォ',
  'ｬ': 'ャ', 'ｭ': 'ュ', 'ｮ': 'ョ', 'ｯ': 'ッ',
  'ｰ': 'ー',
  'ｱ': 'ア', 'ｲ': 'イ', 'ｳ': 'ウ', 'ｴ': 'エ', 'ｵ': 'オ',
  'ｶ': 'カ', 'ｷ': 'キ', 'ｸ': 'ク', 'ｹ': 'ケ', 'ｺ': 'コ',
  'ｻ': 'サ', 'ｼ': 'シ', 'ｽ': 'ス', 'ｾ': 'セ', 'ｿ': 'ソ',
  'ﾀ': 'タ', 'ﾁ': 'チ', 'ﾂ': 'ツ', 'ﾃ': 'テ', 'ﾄ': 'ト',
  'ﾅ': 'ナ', 'ﾆ': 'ニ', 'ﾇ': 'ヌ', 'ﾈ': 'ネ', 'ﾉ': 'ノ',
  'ﾊ': 'ハ', 'ﾋ': 'ヒ', 'ﾌ': 'フ', 'ﾍ': 'ヘ', 'ﾎ': 'ホ',
  'ﾏ': 'マ', 'ﾐ': 'ミ', 'ﾑ': 'ム', 'ﾒ': 'メ', 'ﾓ': 'モ',
  'ﾔ': 'ヤ', 'ﾕ': 'ユ', 'ﾖ': 'ヨ',
  'ﾗ': 'ラ', 'ﾘ': 'リ', 'ﾙ': 'ル', 'ﾚ': 'レ', 'ﾛ': 'ロ',
  'ﾜ': 'ワ', 'ﾝ': 'ン',
};

const isHiraganaRange = (charCode: number) => (charCode >= 0x3041 && charCode <= 0x3096) || (charCode >= 0x309D && charCode <= 0x309E);
export const toKatakanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  return isHiraganaRange(code) ? String.fromCharCode(code + 0x60) : char;
};
export const toKatakana = (string: string) => [...string].map(toKatakanaSingle).join('');
@@ -0,0 +1,17 @@
export interface TrieNode {
  parent: TrieNode | undefined;
  children: Map<number, TrieNode>; // Unicode code point -> child node
  tokenIds: number[];
  subTreeTokenIds: number[]; // Empty on root. Will Uint16Array be faster?
}

export const traverseTrieStep = (node: TrieNode | undefined, codePoint: string, ignorableCodePoints?: RegExp) =>
  node?.children.get(codePoint.codePointAt(0)!) ?? (ignorableCodePoints?.test(codePoint) ? node : undefined);
export const traverseTrie = (node: TrieNode | undefined, text: string, ignorableCodePoints?: RegExp) => {
  if (!node) return;
  for (const codePoint of text) {
    node = traverseTrieStep(node, codePoint, ignorableCodePoints);
    if (!node) return;
  }
  return node;
};
@@ -0,0 +1,31 @@
export enum TokenType {
  Raw,
  Kana,
  Romaji,
  Han,
  Pinyin,
}

export interface TokenDefinition {
  id: number;
  type: TokenType;
  text: string;
  codePointLength: number;
}

// [start, end)
export interface OffsetSpan {
  start: number;
  end: number;
}

export type CompressedInvertedIndex = {
  documents: string[];
  tokenTypes: TokenType[];
  // e.g. tokenReferences[3] === [[0, 1, 3, 5, 7]] means token 3 occurs in document 0 at spans [1, 3) and [5, 7)
  tokenReferences: number[][][]; // tokenId -> [documentId, start1, end1, start2, end2, ...][]
  tries: {
    romaji: number[];
    kana: number[];
    other: number[];
  };
};
@@ -0,0 +1,3 @@
import type { OffsetSpan } from './types';

export const getSpanLength = (offset: OffsetSpan) => offset.end - offset.start;
@@ -0,0 +1,73 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { buildInvertedIndex, type KuromojiTokenizer } from '../indexer';
import { highlightSearchResult, loadInvertedIndex, searchInvertedIndex } from '../searcher';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('search', () => {
  const testDocuments = [
    'ミーティア',
    'エンドマークに希望と涙を添えて',
    '宵の鳥',
    '僕の和風本当上手',
  ];

  it('should match with mixed search query', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');

    // Should have at least one result
    expect(results.length).toBeGreaterThan(0);

    // The first result should be "僕の和風本当上手"
    expect(results[0]!.documentText).toBe('僕の和風本当上手');
  });

  it('should highlight search result correctly', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');
    expect(results.length).toBeGreaterThan(0);

    const highlighted = highlightSearchResult(results[0]!);

    // Should be an array of parts
    expect(Array.isArray(highlighted)).toBe(true);
    expect(highlighted.length).toBeGreaterThan(0);

    // Collect highlighted text
    const highlightedTexts = highlighted
      .filter((part): part is { highlight: string } => typeof part !== 'string')
      .map(part => part.highlight);

    expect(highlightedTexts.some(text => text.includes('僕'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('の'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('和'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('風'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('上'))).toBe(true);
  });

  it('should match romaji input to kana documents', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    // Search for "yoi" should match "宵の鳥"
    const results = searchInvertedIndex(invertedIndex, 'yoi');
    const matchedTexts = results.map(r => r.documentText);

    expect(matchedTexts).toContain('宵の鳥');
  });
});
@@ -0,0 +1,111 @@
import { traverseTrie } from '../common';
import { buildTrie, serializeTrie } from '../indexer/trie';
import { deserializeTrie } from '../searcher/trie';

describe('Trie building', () => {
  it('should build a Trie with multiple different tokens', () => {
    const trie = buildTrie([
      [0, 'hello'],
      [1, 'help'],
      [2, 'world'],
      [3, 'word'],
    ]);

    // Traverse to verify structure
    const helloNode = traverseTrie(trie, 'hello');
    const helpNode = traverseTrie(trie, 'help');
    const worldNode = traverseTrie(trie, 'world');
    const wordNode = traverseTrie(trie, 'word');

    expect(helloNode).toBeDefined();
    expect(helpNode).toBeDefined();
    expect(worldNode).toBeDefined();
    expect(wordNode).toBeDefined();

    // Check token IDs
    expect(helloNode!.tokenIds).toContain(0);
    expect(helpNode!.tokenIds).toContain(1);
    expect(worldNode!.tokenIds).toContain(2);
    expect(wordNode!.tokenIds).toContain(3);

    // Check that 'hel' prefix node has both tokens in subTree
    const helNode = traverseTrie(trie, 'hel');
    expect(helNode).toBeDefined();
    expect(helNode!.subTreeTokenIds).toContain(0);
    expect(helNode!.subTreeTokenIds).toContain(1);
  });

  it('should handle Japanese text tokens', () => {
    const trie = buildTrie([
      [0, 'さくら'],
      [1, 'サクラ'],
      [2, '桜'],
    ]);

    expect(traverseTrie(trie, 'さくら')?.tokenIds).toContain(0);
    expect(traverseTrie(trie, 'サクラ')?.tokenIds).toContain(1);
    expect(traverseTrie(trie, '桜')?.tokenIds).toContain(2);
  });
});

describe('Trie serialization and deserialization', () => {
  it('should serialize and deserialize a Trie correctly', () => {
    const originalTrie = buildTrie([
      [0, 'apple'],
      [1, 'app'],
      [2, 'banana'],
    ]);

    // Serialize
    const serialized = serializeTrie(originalTrie);
    expect(Array.isArray(serialized)).toBe(true);
    expect(serialized.length).toBeGreaterThan(0);

    // Deserialize
    const { root: deserializedTrie, tokenCodePoints } = deserializeTrie(serialized);

    // Verify structure is preserved
    const appleNode = traverseTrie(deserializedTrie, 'apple');
    const appNode = traverseTrie(deserializedTrie, 'app');
    const bananaNode = traverseTrie(deserializedTrie, 'banana');

    expect(appleNode).toBeDefined();
    expect(appNode).toBeDefined();
    expect(bananaNode).toBeDefined();

    expect(appleNode!.tokenIds).toContain(0);
    expect(appNode!.tokenIds).toContain(1);
    expect(bananaNode!.tokenIds).toContain(2);

    // Verify tokenCodePoints map
    expect(tokenCodePoints.get(0)?.join('')).toBe('apple');
    expect(tokenCodePoints.get(1)?.join('')).toBe('app');
    expect(tokenCodePoints.get(2)?.join('')).toBe('banana');

    // Verify subTreeTokenIds are reconstructed
    expect(appNode!.subTreeTokenIds).toContain(0);
    expect(appNode!.subTreeTokenIds).toContain(1);
  });

  it('should preserve parent references after deserialization', () => {
    const originalTrie = buildTrie([
      [0, 'test'],
    ]);

    const serialized = serializeTrie(originalTrie);
    const { root } = deserializeTrie(serialized);

    const testNode = traverseTrie(root, 'test');
    expect(testNode).toBeDefined();

    // Walk back to root via parent references
    let node = testNode;
    let depth = 0;
    while (node?.parent) {
      node = node.parent;
      depth++;
    }
    expect(depth).toBe(4); // 't' -> 'e' -> 's' -> 't' -> root
    expect(node).toBe(root);
  });
});
@@ -0,0 +1,3 @@
export * from './common';
export * from './indexer';
export * from './searcher';
@@ -0,0 +1,103 @@
import { getHanVariants, getPinyinCandidates, isHanCharacter, unionFindSet } from './han';

describe('unionFindSet', () => {
  it('should find self as root initially', () => {
    const ufs = unionFindSet<number>();
    expect(ufs.find(1)).toBe(1);
    expect(ufs.find(2)).toBe(2);
  });

  it('should union two elements', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    expect(ufs.find(1)).toBe(ufs.find(2));
  });

  it('should union multiple elements transitively', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    ufs.union(2, 3);
    ufs.union(4, 5);
    expect(ufs.find(1)).toBe(ufs.find(3));
    expect(ufs.find(1)).not.toBe(ufs.find(4));
    ufs.union(3, 4);
    expect(ufs.find(1)).toBe(ufs.find(5));
  });

  it('should iterate all keys', () => {
    const ufs = unionFindSet<string>();
    ufs.union('a', 'b');
    ufs.union('c', 'd');
    const keys = [...ufs.keys()];
    expect(keys).toContain('a');
    expect(keys).toContain('b');
    expect(keys).toContain('c');
    expect(keys).toContain('d');
  });
});

describe('isHanCharacter', () => {
  it('should return true for CJK characters', () => {
    expect(isHanCharacter('中')).toBe(true);
    expect(isHanCharacter('国')).toBe(true);
    expect(isHanCharacter('日')).toBe(true);
    expect(isHanCharacter('本')).toBe(true);
  });

  it('should return false for non-CJK characters', () => {
    expect(isHanCharacter('a')).toBe(false);
    expect(isHanCharacter('あ')).toBe(false);
    expect(isHanCharacter('ア')).toBe(false);
    expect(isHanCharacter('1')).toBe(false);
  });
});

describe('getHanVariants', () => {
  it('should return variants for simplified/traditional characters', () => {
    // 国 (simplified) and 國 (traditional) should be variants of each other
    const variants1 = getHanVariants('国');
    const variants2 = getHanVariants('國');
    expect(variants1).toContain('国');
    expect(variants1).toContain('國');
    expect(variants2).toContain('国');
    expect(variants2).toContain('國');
  });

  it('should return the character itself for characters without variants', () => {
    const variants = getHanVariants('一');
    expect(variants).toContain('一');
  });

  it('should return empty array for non-Han characters', () => {
    expect(getHanVariants('a')).toEqual([]);
    expect(getHanVariants('あ')).toEqual([]);
  });
});

describe('getPinyinCandidates', () => {
  it('should return pinyin for a Han character', () => {
    const candidates = getPinyinCandidates('中');
    expect(candidates).toContain('zhong');
    expect(candidates).toContain('zh'); // initial
    expect(candidates).toContain('z'); // first letter
  });

  it('should return multiple pinyin for polyphonic characters', () => {
    // 行 can be "xing" or "hang"
    const candidates = getPinyinCandidates('行');
    expect(candidates).toContain('xing');
    expect(candidates).toContain('hang');
  });

  it('should include fuzzy pinyin variants', () => {
    // 风 is "feng", should also have fuzzy variant "fen"
    const candidates = getPinyinCandidates('风');
    expect(candidates).toContain('feng');
    expect(candidates).toContain('fen'); // fuzzy: eng -> en
  });

  it('should return empty array for non-Han characters', () => {
    expect(getPinyinCandidates('a')).toEqual([]);
    expect(getPinyinCandidates('あ')).toEqual([]);
  });
});
@@ -0,0 +1,85 @@
// @ts-expect-error No declaration file
import hkVariants from 'opencc-js/dict/HKVariants';
// @ts-expect-error No declaration file
import hkVariantsRev from 'opencc-js/dict/HKVariantsRev';
// @ts-expect-error No declaration file
import jpVariants from 'opencc-js/dict/JPVariants';
// @ts-expect-error No declaration file
import jpVariantsRev from 'opencc-js/dict/JPVariantsRev';
// @ts-expect-error No declaration file
import stCharacters from 'opencc-js/dict/STCharacters';
// @ts-expect-error No declaration file
import tsCharacters from 'opencc-js/dict/TSCharacters';
// @ts-expect-error No declaration file
import twVariants from 'opencc-js/dict/TWVariants';
// @ts-expect-error No declaration file
import twVariantsRev from 'opencc-js/dict/TWVariantsRev';
import { polyphonic } from 'pinyin-pro';

export const unionFindSet = <T>() => {
  const parent = new Map<T, T>();
  const rank = new Map<T, number>();
  const find = (x: T): T => {
    const p = parent.get(x);
    if (p == null) {
      parent.set(x, x);
      return x;
    } else if (p === x) return x;
    else {
      const root = find(p);
      parent.set(x, root);
      return root;
    }
  };
  const union = (x: T, y: T) => {
    x = find(x);
    y = find(y);
    if (x === y) return;
    const rankX = rank.get(x) ?? 0, rankY = rank.get(y) ?? 0;
    if (rankX < rankY) parent.set(x, y);
    else if (rankX > rankY) parent.set(y, x);
    else {
      parent.set(y, x);
      rank.set(x, rankX + 1);
    }
  };
  const keys = () => parent.keys();
  return { find, union, keys };
};

const exchangeMap = (() => {
  const ufs = unionFindSet<string>();
  for (const dict of [hkVariants, hkVariantsRev, jpVariants, jpVariantsRev, stCharacters, tsCharacters, twVariants, twVariantsRev] as string[]) {
    for (const [from, to] of dict.split('|').map(pair => pair.split(' '))) {
      if (!from || !to || [...from].length !== 1 || [...to].length !== 1) continue;
      ufs.union(from, to);
    }
  }
  const map = new Map<string, string[]>();
  for (const key of ufs.keys()) {
    const root = ufs.find(key);
    let list = map.get(root);
    if (!list) map.set(root, list = []);
    if (key !== root) map.set(key, list);
    list.push(key);
  }
  for (const list of map.values()) list.sort();
  return map;
})();

export const isHanCharacter = (phrase: string) => /^[\p{Script=Han}]+$/u.test(phrase);

export const getHanVariants = (character: string) => exchangeMap.get(character) ?? (isHanCharacter(character) ? [character] : []);

const PINYIN_INITIALS: string[] = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w'];
const PINYIN_FINALS_FUZZY_MAP: Record<string, string> = { 'ang': 'an', 'eng': 'en', 'ing': 'in' };
export const getPinyinCandidates = (character: string) => {
  const pinyins = polyphonic(character, { type: 'array', toneType: 'none', removeNonZh: true })[0] ?? [];
  return Array.from(new Set(pinyins.filter(fullPinyin => fullPinyin).flatMap(fullPinyin => {
    const initial = PINYIN_INITIALS.find(initial => fullPinyin.startsWith(initial));
    const initialAlphabet = initial?.[0] ?? fullPinyin[0]!;
    const fuzzySuffix = fullPinyin.slice(-3);
    const fuzzyPinyin = fuzzySuffix in PINYIN_FINALS_FUZZY_MAP ? fullPinyin.slice(0, -3) + PINYIN_FINALS_FUZZY_MAP[fuzzySuffix] : undefined;
    return [fullPinyin, initial, initialAlphabet, fuzzyPinyin].filter((s): s is string => !!s);
  })));
};
@@ -0,0 +1,5 @@
export * from './han';
export * from './japanese';
export * from './tokenizer';
export * from './trie';
export * from './inverted-index';
@@ -0,0 +1,46 @@
import { NORMALIZE_RULES_KANA_DAKUTEN, NORMALIZE_RULES_ROMAJI } from './japanese';
import { createTokenizer, type TokenizerOptions } from './tokenizer';
import { buildTrie, graftTriePaths, serializeTrie } from './trie';
import type { CompressedInvertedIndex, TokenDefinition } from '../common/types';
import { TokenType } from '../common/types';

const buildTypedTrie = (tokens: TokenDefinition[], typePredicate: (tokenType: TokenType) => boolean) =>
  buildTrie(tokens.filter(token => typePredicate(token.type)).map(token => [token.id, token.text]));

export const buildInvertedIndex = (documents: string[], tokenizerOptions: TokenizerOptions) => {
  const tokenizer = createTokenizer(tokenizerOptions);
  const documentTokens = documents.map(document => tokenizer.tokenize(document));

  const tokenDefinitions = [...tokenizer.tokens.values()];
  const romajiRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Romaji);
  const kanaRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Kana);
  const otherRoot = buildTypedTrie(tokenDefinitions, type => type !== TokenType.Romaji && type !== TokenType.Kana);
  graftTriePaths(romajiRoot, NORMALIZE_RULES_ROMAJI);
  graftTriePaths(kanaRoot, NORMALIZE_RULES_KANA_DAKUTEN);

  const invertedIndex: CompressedInvertedIndex = {
    documents,
    tokenTypes: tokenDefinitions.map(token => token.type),
    tokenReferences: Array.from({ length: tokenDefinitions.length }, () => []),
    tries: {
      romaji: serializeTrie(romajiRoot),
      kana: serializeTrie(kanaRoot),
      other: serializeTrie(otherRoot),
    },
  };
  for (const [documentId, tokens] of documentTokens.entries()) {
    const tokenOccurrences = new Map<number, number[]>();
    for (const token of tokens) {
      let occurrences = tokenOccurrences.get(token.id);
      if (!occurrences) {
        occurrences = [];
        tokenOccurrences.set(token.id, occurrences);
      }
      occurrences.push(token.start, token.end);
    }
    for (const [tokenId, occurrences] of tokenOccurrences) {
      invertedIndex.tokenReferences[tokenId]!.push([documentId, ...occurrences]);
    }
  }
  return invertedIndex;
};
@@ -0,0 +1,66 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { getAllKanaReadings, toRomajiStrictly } from './japanese';
import type { KuromojiTokenizer } from './tokenizer';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('toRomajiStrictly', () => {
  it('should convert basic kana to romaji', () => {
    expect(toRomajiStrictly('あ')).toBe('a');
    expect(toRomajiStrictly('か')).toBe('ka');
    expect(toRomajiStrictly('さくら')).toBe('sakura');
  });

  it('should convert katakana to romaji', () => {
    expect(toRomajiStrictly('ア')).toBe('a');
    expect(toRomajiStrictly('カ')).toBe('ka');
    expect(toRomajiStrictly('サクラ')).toBe('sakura');
  });

  it('should handle long vowels', () => {
    expect(toRomajiStrictly('おう')).toBe('ou');
    expect(toRomajiStrictly('おお')).toBe('oo');
  });

  it('should return empty string for invalid first character', () => {
    expect(toRomajiStrictly('ー')).toBe(''); // prolonged sound mark cannot be first
    expect(toRomajiStrictly('ゃ')).toBe(''); // small ya cannot be first
  });

  it('should return empty string for invalid last character', () => {
    expect(toRomajiStrictly('っ')).toBe(''); // small tsu cannot be last
  });

  it('should handle gemination (small tsu)', () => {
    expect(toRomajiStrictly('かった')).toBe('katta');
  });
});

describe('getAllKanaReadings', () => {
  it('should return katakana reading for pure kana input', () => {
    const readings = getAllKanaReadings(kuromoji, 'あ');
    expect(readings).toContain('ア');
  });

  it('should return readings for kanji', () => {
    const readings = getAllKanaReadings(kuromoji, '僕');
    expect(readings.length).toBeGreaterThan(0);
    // 僕 should have reading ボク
    expect(readings).toContain('ボク');
  });

  it('should return readings for compound words', () => {
    const readings = getAllKanaReadings(kuromoji, '和風');
    expect(readings.length).toBeGreaterThan(0);
  });
});
@@ -0,0 +1,158 @@
import { fromKana } from 'hepburn';

import type { KuromojiTokenizer } from './tokenizer';
import { toKatakana } from '../common';

// We have normalized all other sound marks to \u3099 and \u309A (combining kata-hiragana voiced/semi-voiced sound marks)
export const isMaybeJapanese = (phrase: string) => /^[\p{Script=Han}\u3041-\u309F\u30A0-\u30FF\u3005\u3006\u30FC\u3099\u309A]+$/u.test(phrase);

// See also normalize.ts
export const isJapaneseSoundMark = (phrase: string) => /^[\u3099\u309A]+$/.test(phrase);
export const stripJapaneseSoundMarks = (phrase: string) => phrase.replaceAll('\u3099', '').replaceAll('\u309A', '');

export const isKanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  return (code >= 0x3041 && code <= 0x309F) || (code >= 0x30A0 && code <= 0x30FF);
};
export const isKana = (phrase: string) => [...phrase].every(isKanaSingle);

const KANAS_CANNOT_BE_FIRST = [
  'ァ', 'ィ', 'ゥ', 'ェ', 'ォ',
  'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ',
  'ャ', 'ュ', 'ョ',
  'ゃ', 'ゅ', 'ょ',
  'ヮ', 'ゎ',
  'ㇰ', 'ㇱ', 'ㇲ', 'ㇳ', 'ㇴ', 'ㇵ', 'ㇶ', 'ㇷ', 'ㇸ', 'ㇹ', 'ㇺ', 'ㇻ', 'ㇼ', 'ㇽ', 'ㇾ', 'ㇿ',
  'ー',
];
const KANAS_CANNOT_BE_LAST = [
  'ッ', 'っ',
];
export const toRomajiStrictly = (kana: string) => {
  if (KANAS_CANNOT_BE_FIRST.includes(kana[0]!)) return '';
  if (KANAS_CANNOT_BE_LAST.includes(kana[kana.length - 1]!)) return '';
  const romaji = fromKana(kana).toLowerCase()
    .replaceAll('ā', 'aa')
    .replaceAll('ī', 'ii')
    .replaceAll('ū', 'uu')
    .replaceAll('ē', 'ee')
    .replaceAll('ō', 'ou');
  if (!romaji.match(/^[a-z]+$/)) return '';
  return romaji;
};

export const createTranscriptionEnumerator = (
  isValidPhrase: (codePoints: string[], start: number, length: number) => boolean,
  getAllTranscriptions: (phrase: string) => string[],
) => (codePoints: string[]) => {
  const toKey = (start: number, length: number) => `${start}:${length}`;
  const resultMap = new Map<string, { start: number; length: number; transcriptions: string[] }>();
  for (let phraseLength = 1; phraseLength <= codePoints.length; phraseLength++) for (let start = 0; start + phraseLength <= codePoints.length; start++) {
    if (!isValidPhrase(codePoints, start, phraseLength)) continue;
    const phrase = codePoints.slice(start, start + phraseLength).join('');
    const atomicTranscriptions = [...new Set(getAllTranscriptions(phrase))].filter(candidateTranscription => {
      if (!candidateTranscription) return false;
      // Ensure the transcription is atomic (not a combination of multiple shorter transcriptions, separated by any midpoints)
      type State = { phrasePosition: number; transcriptionPosition: number };
      const toStateKey = (state: State) => `${state.phrasePosition}:${state.transcriptionPosition}`;
      const visitedStates = new Set<string>();
      const queue: State[] = [{ phrasePosition: 0, transcriptionPosition: 0 }];
      while (queue.length > 0) {
        const { phrasePosition, transcriptionPosition } = queue.shift()!;
        for (let prefixLength = 1; prefixLength <= phraseLength - phrasePosition; prefixLength++) {
          const prefixResult = resultMap.get(toKey(start + phrasePosition, prefixLength));
          if (!prefixResult) continue;
          for (const transcription of prefixResult.transcriptions) {
            if (candidateTranscription.slice(transcriptionPosition, transcriptionPosition + transcription.length) === transcription) {
              const nextState: State = { phrasePosition: phrasePosition + prefixLength, transcriptionPosition: transcriptionPosition + transcription.length };
              if (nextState.phrasePosition === phraseLength && nextState.transcriptionPosition === candidateTranscription.length) return false; // Found a valid combination
              if (visitedStates.has(toStateKey(nextState))) continue;
              visitedStates.add(toStateKey(nextState));
              queue.push(nextState);
            }
          }
        }
      }
      return true;
    });
    if (atomicTranscriptions.length > 0) resultMap.set(toKey(start, phraseLength), { start, length: phraseLength, transcriptions: atomicTranscriptions });
  }
  return [...resultMap.values()];
};

export const getAllKanaReadings = (kuromoji: KuromojiTokenizer, phrase: string) => Array.from(new Set(
  [
    ...isKana(phrase) ? [toKatakana(phrase)] : [],
    ...isKana(phrase) && [...phrase].length === 1 ? [] : ((kuromoji.token_info_dictionary.target_map[kuromoji.viterbi_builder.trie.lookup(phrase)] ?? [])
      .map(id => kuromoji.formatter.formatEntry(
        id, 0, 'KNOWN',
        kuromoji.token_info_dictionary.getFeatures(id as unknown as string)?.split(',') ?? [],
      ).reading)
      .filter((reading): reading is string => !!reading))
      .map(toKatakana),
  ],
));

const createNormalizer = (rules: Record<string, string>) => (text: string) => {
  while (true) {
    const beforeCurrentIteration = text;
    for (const [from, to] of Object.entries(rules)) text = text.replaceAll(from, to);
    if (text === beforeCurrentIteration) break;
  }
  return text;
};

export const NORMALIZE_RULES_ROMAJI: Record<string, string> = {
  // Remove all long vowels (sa-ba- -> saba)
  '-': '',
  // Collapse consecutive vowels
  'aa': 'a',
  'ii': 'i',
  'uu': 'u',
  'ee': 'e',
  'oo': 'o',
  'ou': 'o',
  // mb/mp/mm -> nb/np/nm (shimbun -> shinbun)
  'mb': 'nb',
  'mp': 'np',
  'mm': 'nm',
  // Others
  'sha': 'sya',
  'tsu': 'tu',
  'chi': 'ti',
  'shi': 'si',
  'ji': 'zi',
};
export const normalizeRomaji = createNormalizer(NORMALIZE_RULES_ROMAJI);

export const NORMALIZE_RULES_KANA_DAKUTEN: Record<string, string> = {
  'う\u3099': 'ゔ',
  'か\u3099': 'が', 'き\u3099': 'ぎ', 'く\u3099': 'ぐ', 'け\u3099': 'げ', 'こ\u3099': 'ご',
  'さ\u3099': 'ざ', 'し\u3099': 'じ', 'す\u3099': 'ず', 'せ\u3099': 'ぜ', 'そ\u3099': 'ぞ',
  'た\u3099': 'だ', 'ち\u3099': 'ぢ', 'つ\u3099': 'づ', 'て\u3099': 'で', 'と\u3099': 'ど',
  'は\u3099': 'ば', 'ひ\u3099': 'び', 'ふ\u3099': 'ぶ', 'へ\u3099': 'べ', 'ほ\u3099': 'ぼ',
  'は\u309A': 'ぱ', 'ひ\u309A': 'ぴ', 'ふ\u309A': 'ぷ', 'へ\u309A': 'ぺ', 'ほ\u309A': 'ぽ',
  'ゝ\u3099': 'ゞ',

  'ウ\u3099': 'ヴ',
  'カ\u3099': 'ガ', 'キ\u3099': 'ギ', 'ク\u3099': 'グ', 'ケ\u3099': 'ゲ', 'コ\u3099': 'ゴ',
  'サ\u3099': 'ザ', 'シ\u3099': 'ジ', 'ス\u3099': 'ズ', 'セ\u3099': 'ゼ', 'ソ\u3099': 'ゾ',
  'タ\u3099': 'ダ', 'チ\u3099': 'ヂ', 'ツ\u3099': 'ヅ', 'テ\u3099': 'デ', 'ト\u3099': 'ド',
  'ハ\u3099': 'バ', 'ヒ\u3099': 'ビ', 'フ\u3099': 'ブ', 'ヘ\u3099': 'ベ', 'ホ\u3099': 'ボ',
  'ハ\u309A': 'パ', 'ヒ\u309A': 'ピ', 'フ\u309A': 'プ', 'ヘ\u309A': 'ペ', 'ホ\u309A': 'ポ',
  'ワ\u3099': 'ヷ', 'ヰ\u3099': 'ヸ', 'ヱ\u3099': 'ヹ', 'ヲ\u3099': 'ヺ',
  'ヽ\u3099': 'ヾ',
};
export const normalizeKanaDakuten = createNormalizer(NORMALIZE_RULES_KANA_DAKUTEN);

const isValidJapanesePhrase = (codePoints: string[], start: number, length: number) =>
  // Skip splittings that cause sound marks to occur in the first position of a phrase
  !isJapaneseSoundMark(codePoints[start]!) && (start + length === codePoints.length || !isJapaneseSoundMark(codePoints[start + length]!));
export const createKanaTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => createTranscriptionEnumerator(
  isValidJapanesePhrase,
  phrase => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase))),
);
export const createRomajiTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => createTranscriptionEnumerator(
  isValidJapanesePhrase,
  phrase => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase))).map(kana => normalizeRomaji(toRomajiStrictly(kana))),
);
@@ -0,0 +1,166 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { createTokenizer, type KuromojiTokenizer } from './tokenizer';
import { TokenType } from '../common/types';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('tokenizer', () => {
  it('should tokenize mixed Japanese text', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('僕の和風本当上手');

    // Get all token definitions
    const tokenDefs = [...tokenizer.tokens.values()];

    // Should have tokens of various types
    const types = new Set(tokenDefs.map(t => t.type));
    expect(types.has(TokenType.Han)).toBe(true);
    expect(types.has(TokenType.Pinyin)).toBe(true);
    expect(types.has(TokenType.Kana)).toBe(true);
    expect(types.has(TokenType.Romaji)).toBe(true);

    const getTokenTextsAt = (pos: number, type: TokenType) => tokens
      .filter(t => t.start <= pos && t.end > pos && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);

    // Position 0: 僕
    expect(getTokenTextsAt(0, TokenType.Han)).toContain('僕');
    expect(getTokenTextsAt(0, TokenType.Pinyin)).toContain('pu');
    expect(getTokenTextsAt(0, TokenType.Kana)).toContain('ボク');
    expect(getTokenTextsAt(0, TokenType.Romaji)).toContain('boku');

    // Position 1: の (hiragana, no Han/Pinyin)
    expect(getTokenTextsAt(1, TokenType.Han)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Pinyin)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Kana)).toContain('ノ');
    expect(getTokenTextsAt(1, TokenType.Romaji)).toContain('no');

    // Position 2: 和
    expect(getTokenTextsAt(2, TokenType.Han)).toContain('和');
    expect(getTokenTextsAt(2, TokenType.Pinyin)).toContain('he');
    expect(getTokenTextsAt(2, TokenType.Kana)).toContain('ワ');
    expect(getTokenTextsAt(2, TokenType.Romaji)).toContain('wa');

    // Position 3: 風
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('風');
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('风'); // simplified variant
    expect(getTokenTextsAt(3, TokenType.Pinyin)).toContain('feng');
    expect(getTokenTextsAt(3, TokenType.Kana)).toContain('フウ');
    expect(getTokenTextsAt(3, TokenType.Romaji)).toContain('fu');

    // Position 4: 本
    expect(getTokenTextsAt(4, TokenType.Han)).toContain('本');
    expect(getTokenTextsAt(4, TokenType.Pinyin)).toContain('ben');
    expect(getTokenTextsAt(4, TokenType.Kana)).toContain('ホン');
    expect(getTokenTextsAt(4, TokenType.Romaji)).toContain('hon');

    // Position 5: 当
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('当');
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('當'); // traditional variant
    expect(getTokenTextsAt(5, TokenType.Pinyin)).toContain('dang');
    expect(getTokenTextsAt(5, TokenType.Kana)).toContain('トウ');
    expect(getTokenTextsAt(5, TokenType.Romaji)).toContain('to'); // normalized: tou -> to

    // Position 6: 上
    expect(getTokenTextsAt(6, TokenType.Han)).toContain('上');
    expect(getTokenTextsAt(6, TokenType.Pinyin)).toContain('shang');
    expect(getTokenTextsAt(6, TokenType.Kana)).toContain('ジョウ');
    expect(getTokenTextsAt(6, TokenType.Romaji)).toContain('jo'); // normalized: jou -> jo

    // Position 7: 手
    expect(getTokenTextsAt(7, TokenType.Han)).toContain('手');
    expect(getTokenTextsAt(7, TokenType.Pinyin)).toContain('shou');
    expect(getTokenTextsAt(7, TokenType.Kana)).toContain('シュ');
    expect(getTokenTextsAt(7, TokenType.Romaji)).toContain('shu');

    // Check that tokens cover the entire input
    expect(tokens.length).toBeGreaterThan(0);

    // Check some specific token definitions exist
    const hanTokenTexts = tokenDefs.filter(t => t.type === TokenType.Han).map(t => t.text);
    expect(hanTokenTexts).toContain('僕');
    expect(hanTokenTexts).toContain('和');
    expect(hanTokenTexts).toContain('風');

    // Check kana readings exist for kanji
    const kanaTokenTexts = tokenDefs.filter(t => t.type === TokenType.Kana).map(t => t.text);
    expect(kanaTokenTexts).toContain('ボク'); // 僕 -> ボク

    // Check romaji readings exist
    const romajiTokenTexts = tokenDefs.filter(t => t.type === TokenType.Romaji).map(t => t.text);
    expect(romajiTokenTexts).toContain('boku'); // 僕 -> boku
  });

  it('should not create duplicate tokens when tokenizing multiple documents', () => {
    const tokenizer = createTokenizer({ kuromoji });

    // Tokenize multiple music names that share some characters
    tokenizer.tokenize('僕の和風本当上手');
    tokenizer.tokenize('僕');
    tokenizer.tokenize('和風');

    // Check that there are no duplicate tokens
    const tokenDefs = [...tokenizer.tokens.values()];
    const tokenKeys = tokenDefs.map(t => `${t.type}:${t.text}`);
    const uniqueKeys = new Set(tokenKeys);

    expect(tokenKeys.length).toBe(uniqueKeys.size);

    // Also check that IDs are unique
    const ids = tokenDefs.map(t => t.id);
    const uniqueIds = new Set(ids);
    expect(ids.length).toBe(uniqueIds.size);
  });

  it('should handle Raw tokens for non-CJK characters', () => {
    const tokenizer = createTokenizer({ kuromoji });
    tokenizer.tokenize('a-b');

    const tokenDefs = [...tokenizer.tokens.values()];
    const rawTokenTexts = tokenDefs.filter(t => t.type === TokenType.Raw).map(t => t.text);

    expect(rawTokenTexts).toContain('a'); // normalized to lowercase
    expect(rawTokenTexts).toContain('-');
    expect(rawTokenTexts).toContain('b');
  });

  it('should tokenize compound word "今日" with both individual and combined readings', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('今日');
    const tokenDefs = [...tokenizer.tokens.values()];

    const getTokensWithSpan = (type: TokenType, start: number, end: number) => tokens
      .filter(t => t.start === start && t.end === end && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);

    // Individual character readings at position 0: 今
    expect(getTokensWithSpan(TokenType.Han, 0, 1)).toContain('今');
    expect(getTokensWithSpan(TokenType.Pinyin, 0, 1)).toContain('jin');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('コン');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('イマ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('kon');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('ima');

    // Individual character readings at position 1: 日
    expect(getTokensWithSpan(TokenType.Han, 1, 2)).toContain('日');
    expect(getTokensWithSpan(TokenType.Pinyin, 1, 2)).toContain('ri');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ニチ');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ヒ');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('niti');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('hi');

    // Combined reading for "今日" [0, 2) - this is an indivisible compound word
    expect(getTokensWithSpan(TokenType.Kana, 0, 2)).toContain('キョウ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 2)).toContain('kyo'); // normalized: kyou -> kyo
  });
});
@@ -0,0 +1,93 @@
import type { TokenizerBuilder } from '@patdx/kuromoji';

import { getHanVariants, getPinyinCandidates } from './han';
import { createKanaTranscriptionEnumerator, createRomajiTranscriptionEnumerator, isMaybeJapanese } from './japanese';
import { normalizeByCodePoint } from '../common/normalize';
import { TokenType, type TokenDefinition } from '../common/types';

export interface Token {
  id: number;
  start: number;
  end: number;
}

export type KuromojiTokenizer = Awaited<ReturnType<TokenizerBuilder['build']>>;
export interface TokenizerOptions {
  kuromoji: KuromojiTokenizer;
}
export const createTokenizer = (options: TokenizerOptions) => {
  const tokens = new Map<string, TokenDefinition>();
  let nextId = 0;
  const ensureToken = (type: TokenType, text: string) => {
    const key = `${type}:${text}`;
    let tokenDefinition = tokens.get(key);
    if (tokenDefinition) return tokenDefinition;
    tokenDefinition = { id: nextId++, type, text, codePointLength: [...text].length };
    tokens.set(key, tokenDefinition);
    return tokenDefinition;
  };

  const enumerateAllKanaCombinations = createKanaTranscriptionEnumerator(options.kuromoji);
  const enumerateAllRomajiCombinations = createRomajiTranscriptionEnumerator(options.kuromoji);
  const tokenize = (text: string) => {
    const results: Token[] = [];
    const emitter = (start: number, end: number) => (type: TokenType, text: string) => results.push({ id: ensureToken(type, text).id, start, end });

    const emitMaybeJapanese = (codePoints: string[], offset: number) => {
      for (const { start, length, transcriptions } of enumerateAllKanaCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Kana, transcription);
      }
      for (const { start, length, transcriptions } of enumerateAllRomajiCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Romaji, transcription);
      }
      for (let i = 0; i < codePoints.length; i++) {
        // Single character may have not only kana readings, but also Chinese pronunciations or Simplified/Traditional/Japanese variants.
        const character = codePoints[i]!;
        const hanAlternates = getHanVariants(character); // All possible variant characters (Simplified/Traditional/Japanese)
        const pinyinAlternates = Array.from(new Set(hanAlternates.flatMap(han => getPinyinCandidates(han)))); // All possible pinyin candidates
        const emit = emitter(offset + i, offset + i + 1);
        for (const han of hanAlternates) emit(TokenType.Han, han);
        for (const pinyin of pinyinAlternates) emit(TokenType.Pinyin, pinyin);
      }
    };
    const emitRaw = (codePoint: string, offset: number) => emitter(offset, offset + 1)(TokenType.Raw, codePoint);

    const codePoints = [...normalizeByCodePoint(text)];
    for (let start = 0; start < codePoints.length;) {
      const codePoint = codePoints[start]!;

      const consequentCharsets = [
        { is: isMaybeJapanese, emit: emitMaybeJapanese },
      ];
      let emitted = false;
      for (const { is, emit } of consequentCharsets) {
        let length = 0;
        while (start + length < codePoints.length && is(codePoints[start + length]!)) length++;
        if (length > 0) {
          emit(codePoints.slice(start, start + length), start);
          start += length;
          emitted = true;
          break;
        }
      }
      if (emitted) continue;

      // Skip whitespaces
      if (/\s/.test(codePoint)) {
        start++;
        continue;
      }

      emitRaw(codePoint, start);
      start++;
    }
    return results;
  };

  return {
    tokens,
    tokenize,
  };
};
@@ -0,0 +1,51 @@
import { traverseTrie } from '../common';
import { buildTrie, graftTriePaths } from './trie';

describe('graftTriePaths', () => {
  it('should graft paths according to normalization rules', () => {
    // Build a trie with tokens containing normalized forms
    const trie = buildTrie([
      [0, 'sya'], // normalized form of "sha"
      [1, 'tu'], // normalized form of "tsu"
    ]);

    // Graft paths so that "sha" -> "sya" and "tsu" -> "tu"
    graftTriePaths(trie, {
      sha: 'sya',
      tsu: 'tu',
    });

    // Now we should be able to traverse using both the original and grafted paths
    const syaNode = traverseTrie(trie, 'sya');
    const shaNode = traverseTrie(trie, 'sha');
    expect(syaNode).toBeDefined();
    expect(shaNode).toBeDefined();
    expect(syaNode).toBe(shaNode); // Both paths should lead to the same node

    const tuNode = traverseTrie(trie, 'tu');
    const tsuNode = traverseTrie(trie, 'tsu');
    expect(tuNode).toBeDefined();
    expect(tsuNode).toBeDefined();
    expect(tuNode).toBe(tsuNode);
  });

  it('should handle chained graft rules', () => {
    const trie = buildTrie([
      [0, 'o'], // normalized vowel
    ]);

    // Chain: "ou" -> "o", "oo" -> "o"
    graftTriePaths(trie, {
      ou: 'o',
      oo: 'o',
    });

    const oNode = traverseTrie(trie, 'o');
    const ouNode = traverseTrie(trie, 'ou');
    const ooNode = traverseTrie(trie, 'oo');

    expect(oNode).toBeDefined();
    expect(ouNode).toBe(oNode);
    expect(ooNode).toBe(oNode);
  });
});
@@ -0,0 +1,115 @@
import { traverseTrie, type TrieNode } from '../common';

const newNode = (parent?: TrieNode): TrieNode => ({ parent, children: new Map(), tokenIds: [], subTreeTokenIds: [] });

// Assume tokens are unique.
export const buildTrie = (tokens: [id: number, text: string][]) => {
  const root = newNode(undefined);
  for (const [id, text] of tokens) {
    let node = root;
    for (const char of text) {
      const codePoint = char.codePointAt(0)!;
      let childNode = node.children.get(codePoint);
      if (!childNode) {
        childNode = newNode(node);
        node.children.set(codePoint, childNode);
      }
      node = childNode;
      node.subTreeTokenIds.push(id);
    }
    node.tokenIds.push(id);
  }
  return root;
};

export const graftTriePaths = (root: TrieNode, rules: Record<string, string>) => {
  for (const [inputPhrase, graftTo] of Object.entries(rules)) {
    if ([...graftTo].length > [...inputPhrase].length) throw new Error(`Graft rule ${inputPhrase} -> ${graftTo} maps to a longer string and may cause an infinite loop`);
  }
  const visitedNodes = new Set<TrieNode>();
  const graftFromNode = (node: TrieNode, recursiveChildren: boolean) => {
    if (visitedNodes.has(node)) return;
    visitedNodes.add(node);
    if (recursiveChildren) for (const [, childNode] of node.children) graftFromNode(childNode, true);
    while (true) {
      const nodesWithNewGraftedChildren = new Map<TrieNode, /* depth from initial node */ number>();
      for (const [inputPhrase, graftTo] of Object.entries(rules)) {
        const targetNode = traverseTrie(node, graftTo);
        if (!targetNode) continue;
        const codePoints = [...inputPhrase];
        const graftedPath = Array.from<TrieNode>({ length: codePoints.length - 1 });
        let isGrafted = false;
        let currentNode = node;
        for (let i = 0; i < codePoints.length; i++) {
          const codePoint = codePoints[i]!.codePointAt(0)!;
          let childNode = currentNode.children.get(codePoint);
          if (i === codePoints.length - 1) {
            if (childNode) {
              if (childNode !== targetNode) throw new Error(`Grafted path ${inputPhrase} conflicts with existing path`);
              // Already grafted
            } else {
              currentNode.children.set(codePoint, childNode = targetNode);
              isGrafted = true;
            }
          } else {
            if (!childNode) {
              childNode = newNode(currentNode);
              childNode.subTreeTokenIds = targetNode.subTreeTokenIds;
              currentNode.children.set(codePoint, childNode);
            } else {
              // Part of another grafted path?
              childNode.subTreeTokenIds = Array.from(new Set([...childNode.subTreeTokenIds, ...targetNode.subTreeTokenIds]));
            }
            graftedPath[i] = currentNode = childNode;
          }
        }
        if (isGrafted) for (const [i, nodeToAdd] of graftedPath.entries()) nodesWithNewGraftedChildren.set(nodeToAdd, i + 1);
      }

      if (nodesWithNewGraftedChildren.size > 0) {
        // Re-check graft rules on the newly grafted path:
        // 1. No need to recurse into other children (not on this path) since their children are not affected
        // 2. No need to consider ancestors of this node since they're handled later (we run in DFS order)
        const sortedNodes = [...nodesWithNewGraftedChildren.entries()].sort((a, b) => b[1] - a[1]);
        for (const [changedNode] of sortedNodes) graftFromNode(changedNode, false);
      } else {
        // No new grafts applied
        break;
      }
    }
  };
  graftFromNode(root, true);
};
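
// Example (mirroring the unit tests; a sketch, not real index data): after
//   const trie = buildTrie([[0, 'sya']]);
//   graftTriePaths(trie, { sha: 'sya' });
// traverseTrie(trie, 'sha') and traverseTrie(trie, 'sya') reach the same node,
// so the token is findable under both spellings.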

export const serializeTrie = (root: TrieNode) => {
  const nodeEntries = new Map<TrieNode, {
    id: number;
    visited: boolean;
    data?: number[];
  }>();
  let currentId = 0;
  const getNodeEntry = (node: TrieNode) => {
    let entry = nodeEntries.get(node);
    if (!entry) {
      entry = { id: ++currentId, visited: false };
      nodeEntries.set(node, entry);
    }
    return entry;
  };
  const serializeNode = (node: TrieNode) => {
    const entry = getNodeEntry(node);
    if (entry.visited) return entry.id;
    entry.visited = true;
    const children = [...node.children.entries()].map(([codePoint, childNode]) => [codePoint, serializeNode(childNode)] as const);
    entry.data = [
      node.parent ? getNodeEntry(node.parent).id : 0,
      ...children.map(child => child[0]), // code points
      ...children.map(child => child[1]), // child node ids
      // End of children list (values <= 0 are neither valid code points nor node IDs)
      ...node.tokenIds.length > 0
        ? node.tokenIds.map(tokenId => -(tokenId + 1)) // Use the negative value of (tokenId + 1)
        : [0], // End of children list, no token IDs (token IDs are encoded as negative values)
    ];
    return entry.id;
  };
  serializeNode(root);
  return [...nodeEntries.values()].sort((a, b) => a.id - b.id).flatMap(node => node.data ?? []);
};
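
// Serialized layout per node: [parentId, ...childCodePoints, ...childNodeIds, ...negativeTokenIds | 0].
// For example (a sketch, not real index data), a trie holding the single token [0, 'ab'] serializes to
//   [0, 97, 2, 0,  // node 1 (root): no parent, child 'a' (97) -> node 2, no token IDs
//    1, 98, 3, 0,  // node 2 ("a"): parent node 1, child 'b' (98) -> node 3, no token IDs
//    2, -1]        // node 3 ("ab"): parent node 2, no children, token ID 0 stored as -(0 + 1)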
@@ -0,0 +1,26 @@
import { getSpanLength, TokenType } from '../common';
import type { SearchResult } from './search';

export type HighlightedTextPart = /* not highlighted */ string | /* highlighted */ { highlight: string };

export const highlightSearchResult = (resultDocument: SearchResult): HighlightedTextPart[] => {
  const highlightResult: HighlightedTextPart[] = [];
  let previousHighlightEnd = 0;
  for (const token of resultDocument.tokens) {
    const notHighlightedText = resultDocument.documentCodePoints.slice(previousHighlightEnd, token.documentOffset.start).join('');
    if (notHighlightedText.length > 0) highlightResult.push(notHighlightedText);
    // For a prefix-matched kana token, only highlight the proportion of the token that was actually typed
    const highlightEnd = token.isTokenPrefixMatching && (token.definition.type === TokenType.Kana)
      ? token.documentOffset.start + Math.max(
        1,
        Math.round(
          getSpanLength(token.documentOffset) *
          Math.min(1, getSpanLength(token.inputOffset) / token.definition.codePointLength),
        ),
      )
      : token.documentOffset.end;
    highlightResult.push({ highlight: resultDocument.documentCodePoints.slice(token.documentOffset.start, highlightEnd).join('') });
    previousHighlightEnd = highlightEnd;
  }
  if (previousHighlightEnd < resultDocument.documentCodePoints.length) highlightResult.push(resultDocument.documentCodePoints.slice(previousHighlightEnd).join(''));
  return highlightResult;
};
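
// Usage sketch (assuming `results` came from searchInvertedIndex):
//   const html = highlightSearchResult(results[0]!)
//     .map(part => typeof part === 'string' ? part : `<mark>${part.highlight}</mark>`)
//     .join('');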
@@ -0,0 +1,4 @@
export * from './trie';
export * from './inverted-index';
export * from './search';
export * from './highlight';
@@ -0,0 +1,59 @@
import { deserializeTrie } from './trie';
import type { TrieNode } from '../common';
import type { CompressedInvertedIndex, OffsetSpan, TokenDefinition } from '../common/types';

export interface TokenDocumentReference {
  documentId: number;
  offsets: OffsetSpan[];
}

interface TokenDefinitionExtended extends TokenDefinition {
  references: TokenDocumentReference[];
}

const mergeMap = <K, V>(...maps: Map<K, V>[]) => {
  const result = new Map<K, V>();
  for (const map of maps) for (const [key, value] of map.entries()) result.set(key, value);
  return result;
};

export interface LoadedInvertedIndex {
  documents: string[];
  documentCodePoints: string[][];
  tokenDefinitions: TokenDefinitionExtended[];
  tries: {
    romaji: TrieNode;
    kana: TrieNode;
    other: TrieNode;
  };
}

export const loadInvertedIndex = (compressed: CompressedInvertedIndex): LoadedInvertedIndex => {
  const documents = compressed.documents;
  const documentCodePoints = documents.map(document => [...document]);

  const romajiTrie = deserializeTrie(compressed.tries.romaji);
  const kanaTrie = deserializeTrie(compressed.tries.kana);
  const otherTrie = deserializeTrie(compressed.tries.other);

  const tokenCodePoints = mergeMap(romajiTrie.tokenCodePoints, kanaTrie.tokenCodePoints, otherTrie.tokenCodePoints);
  const tokenDefinitions = compressed.tokenTypes.map<TokenDefinitionExtended>((type, index) => ({
    id: index, type, text: tokenCodePoints.get(index)!.join(''),
    codePointLength: tokenCodePoints.get(index)!.length,
    references: compressed.tokenReferences[index]!.map<TokenDocumentReference>(([documentId, ...offsets]) => ({
      documentId: documentId!,
      offsets: Array.from({ length: offsets.length / 2 }, (_, i) => ({ start: offsets[i * 2]!, end: offsets[i * 2 + 1]! })),
    })),
  }));
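  // Each entry of `compressed.tokenReferences` decoded above is a flat array:
  // [documentId, start1, end1, start2, end2, ...]. For example (a sketch),
  // [3, 0, 2, 5, 7] references document 3 at code point spans [0, 2) and [5, 7).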

  return {
    documents,
    documentCodePoints,
    tokenDefinitions,
    tries: {
      romaji: romajiTrie.root,
      kana: kanaTrie.root,
      other: otherTrie.root,
    },
  };
};
@@ -0,0 +1,258 @@
import { highlightSearchResult } from './highlight';
import { getTrieNodeTokenIds } from './trie';
import type { TrieNode } from '../common';
import { traverseTrieStep } from '../common';
import type { LoadedInvertedIndex } from './inverted-index';
import { normalizeByCodePoint, toKatakana } from '../common/normalize';
import { type OffsetSpan, type TokenDefinition, TokenType } from '../common/types';
import { getSpanLength } from '../common/utils';

const IGNORABLE_CODE_POINTS = /[\s\u3099\u309A]/u;

enum TokenTypePrefixMatchingPolicy {
  AlwaysAllow,
  NeverAllow,
  AllowOnlyAtInputEnd,
}
const tokenTypePrefixMatchingPolicy: Record<TokenType, TokenTypePrefixMatchingPolicy> = {
  [TokenType.Romaji]: TokenTypePrefixMatchingPolicy.NeverAllow,
  [TokenType.Kana]: TokenTypePrefixMatchingPolicy.AlwaysAllow,
  // These token types are in the "other" trie
  [TokenType.Han]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
  [TokenType.Pinyin]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd,
  [TokenType.Raw]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
};
const shouldAllowPrefixMatching = (tokenType: TokenType, isAtInputEnd: boolean) =>
  tokenTypePrefixMatchingPolicy[tokenType] === TokenTypePrefixMatchingPolicy.AlwaysAllow ||
  (tokenTypePrefixMatchingPolicy[tokenType] !== TokenTypePrefixMatchingPolicy.NeverAllow && isAtInputEnd);
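
// For example (a sketch; token texts are hypothetical): a kana token such as "セカイ" may
// be matched by the partial input "セカ" anywhere in the query, a romaji token must be
// typed in full, and a pinyin token may be prefix-matched only as the query's final fragment.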

export interface SearchResultToken {
  definition: TokenDefinition;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}

interface ComparableStateTraits<T> {
  getRangeCount: (state: T) => number;
  getPrefixMatchCount: (state: T) => number;
  getFirstTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastToken?: (state: T) => SearchResultToken; // Not on intermediate results
  getMatchRatioLevel?: (state: T) => number; // Not on intermediate/candidate results
  getMatchRatio: (state: T) => number;
  // Called when all other comparisons are equal
  nextComparer?: (a: T, b: T) => number; // Not on intermediate/candidate results
}

const getComparerForTraits = <T>(traits: ComparableStateTraits<T>) => (a: T, b: T) => {
  // Prefer matches that do not rely on end-of-input loose matching (full match over prefix match)
  if (traits.getLastToken) {
    const aLastToken = traits.getLastToken(a), bLastToken = traits.getLastToken(b);
    const aDidPrefixMatchByTokenType = aLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[aLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    const bDidPrefixMatchByTokenType = bLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[bLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    if (aDidPrefixMatchByTokenType !== bDidPrefixMatchByTokenType) return aDidPrefixMatchByTokenType ? 1 : -1;
  }

  // Prefer results that matched fewer discontinuous ranges
  const aRangeCount = traits.getRangeCount(a), bRangeCount = traits.getRangeCount(b);
  if (aRangeCount !== bRangeCount) return aRangeCount - bRangeCount;

  // Prefer results whose first token matches earlier in the document
  const aFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(a), bFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(b);
  if (aFirstTokenDocumentOffset.start !== bFirstTokenDocumentOffset.start) return aFirstTokenDocumentOffset.start - bFirstTokenDocumentOffset.start;

  // Prefer results with a higher match ratio (but don't distinguish similar ratios, hence `matchRatioLevel`)
  if (traits.getMatchRatioLevel) {
    const aMatchRatioLevel = traits.getMatchRatioLevel(a), bMatchRatioLevel = traits.getMatchRatioLevel(b);
    if (aMatchRatioLevel !== bMatchRatioLevel) return bMatchRatioLevel - aMatchRatioLevel;
  }

  // Prefer results whose last token starts earlier (and, if tied, ends earlier) in the document
  const aLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(a), bLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(b);
  if (aLastTokenDocumentOffset.start !== bLastTokenDocumentOffset.start) return aLastTokenDocumentOffset.start - bLastTokenDocumentOffset.start;
  if (aLastTokenDocumentOffset.end !== bLastTokenDocumentOffset.end) return aLastTokenDocumentOffset.end - bLastTokenDocumentOffset.end;

  // Prefer results with a higher match ratio (compared precisely this time)
  const aMatchRatio = traits.getMatchRatio(a), bMatchRatio = traits.getMatchRatio(b);
  if (aMatchRatio !== bMatchRatio) return bMatchRatio - aMatchRatio;

  return traits.nextComparer?.(a, b) ?? 0;
};

interface IntermediateResult {
  previousState?: IntermediateResult;
  firstTokenDocumentOffset: OffsetSpan;
  rangeCount: number;
  tokenCount: number;
  prefixMatchCount: number;
  matchedTokenLength: number;
  tokenId: number;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}
const compareIntermediateResult = getComparerForTraits<IntermediateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.firstTokenDocumentOffset,
  getLastTokenDocumentOffset: state => state.documentOffset,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide by document length since intermediate results are for the same document
});

interface CandidateResult {
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  matchedTokenLength: number;
  rangeCount: number;
}
const compareCandidateResult = getComparerForTraits<CandidateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide by document length since candidate results are for the same document
});

export interface SearchResult {
  documentId: number;
  documentText: string;
  documentCodePoints: string[];
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  rangeCount: number;
  matchRatio: number;
  matchRatioLevel: number;
}
const compareFinalResult = getComparerForTraits<SearchResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchRatio,
  getMatchRatioLevel: state => Math.round(state.matchRatio * 5), // Bucket ratios into steps of 0.2
  nextComparer: (a, b) => a.documentText === b.documentText ? 0 : a.documentText < b.documentText ? -1 : 1,
});

const hasNonEmptyCharacters = (documentCodePoints: string[], start: number, end: number) => start !== end && !documentCodePoints.slice(start, end).every(char => /\s/.test(char));

export const searchInvertedIndex = (invertedIndex: LoadedInvertedIndex, text: string): SearchResult[] => {
  const { documents, documentCodePoints, tokenDefinitions, tries } = invertedIndex;

  const codePoints = [...toKatakana(normalizeByCodePoint(text))];
  // dp[r] maps documentId -> (match end offset in document -> best IntermediateResult) for matches
  // covering input code points [0, r]; l === 0 stands in for dp[-1], and final results are read from dp[N - 1]
  const dp = Array.from({ length: codePoints.length }, () => new Map<number, Record<number, IntermediateResult>>());
  for (let l = 0; l < codePoints.length; l++) {
    if (l !== 0 && dp[l - 1]!.size === 0) continue; // No documents match the input from the beginning to this position
    let romajiNode: TrieNode | undefined = tries.romaji;
    let kanaNode: TrieNode | undefined = tries.kana;
    let otherNode: TrieNode | undefined = tries.other;
    for (let r = l; r < codePoints.length && (romajiNode || kanaNode || otherNode); r++) { // [l, r]
      const codePoint = codePoints[r]!;
      romajiNode = traverseTrieStep(romajiNode, codePoint, IGNORABLE_CODE_POINTS);
      kanaNode = traverseTrieStep(kanaNode, codePoint, IGNORABLE_CODE_POINTS);
      otherNode = traverseTrieStep(otherNode, codePoint, IGNORABLE_CODE_POINTS);
      const reachingInputEnd = r === codePoints.length - 1;
      const matchingTokenIds = new Set([
        // Allow prefix matching (a partially typed token) per token type policy; "other" tokens only prefix-match at the end of the input
        ...getTrieNodeTokenIds(romajiNode, shouldAllowPrefixMatching(TokenType.Romaji, reachingInputEnd)),
        ...getTrieNodeTokenIds(kanaNode, shouldAllowPrefixMatching(TokenType.Kana, reachingInputEnd)),
        ...getTrieNodeTokenIds(otherNode, reachingInputEnd),
      ]);
      for (const tokenId of matchingTokenIds) for (const { documentId, offsets } of tokenDefinitions[tokenId]!.references) {
        const isTokenPrefixMatching = !romajiNode?.tokenIds.includes(tokenId) && !kanaNode?.tokenIds.includes(tokenId) && !otherNode?.tokenIds.includes(tokenId);
        const previousMatchesOfDocument = dp[l - 1]?.get(documentId);
        if (l !== 0 && !previousMatchesOfDocument) continue;
        for (const documentOffset of offsets) {
          const { start: currentStart, end: currentEnd } = documentOffset;
          const contributeNextMatchingState = (previousState: IntermediateResult | undefined) => {
            const nextMatchingMap = dp[r]!;
            let nextMatchesOfDocument = nextMatchingMap.get(documentId);
            if (!nextMatchesOfDocument) {
              nextMatchesOfDocument = Object.create(null) as Record<number, IntermediateResult>;
              nextMatchingMap.set(documentId, nextMatchesOfDocument);
            }
            const oldResult = nextMatchesOfDocument[currentEnd];
            const inputOffset = { start: l, end: r + 1 };
            const newResult: IntermediateResult = {
              previousState,
              firstTokenDocumentOffset: previousState?.firstTokenDocumentOffset ?? documentOffset,
              rangeCount: !previousState ? 1
                : (previousState.rangeCount + (hasNonEmptyCharacters(documentCodePoints[documentId]!, previousState.documentOffset.end, currentStart) ? 1 : 0)),
              tokenCount: (previousState?.tokenCount ?? 0) + 1,
              prefixMatchCount: (previousState?.prefixMatchCount ?? 0) + (isTokenPrefixMatching ? 1 : 0),
              matchedTokenLength: (previousState?.matchedTokenLength ?? 0) + getSpanLength(documentOffset) *
                Math.min(isTokenPrefixMatching ? getSpanLength(inputOffset) / tokenDefinitions[tokenId]!.codePointLength : Infinity, 1),
              tokenId,
              documentOffset,
              inputOffset,
              isTokenPrefixMatching,
            };
            nextMatchesOfDocument[currentEnd] = !oldResult || compareIntermediateResult(newResult, oldResult) < 0 ? newResult : oldResult;
          };
          if (l === 0) contributeNextMatchingState(undefined);
          else for (const previousEnd in previousMatchesOfDocument) if (currentStart >= Number(previousEnd))
            contributeNextMatchingState(previousMatchesOfDocument[previousEnd as unknown as number]!);
          // Don't `break` here because the keys of `previousMatchesOfDocument` are not necessarily ordered
        }
      }
    }
  }

  // Build search results and sort documents
  return [...dp[codePoints.length - 1]!.entries()].map<SearchResult>(([documentId, matches]) => {
    const sortedMatches = Object.values(matches).map<CandidateResult>(match => {
      const tokens: SearchResultToken[] = [];
      // Build the token list by backtracking through previous states
      let state: IntermediateResult | undefined = match;
      while (state) {
        tokens.unshift({
          definition: tokenDefinitions[state.tokenId]!,
          documentOffset: state.documentOffset, inputOffset: state.inputOffset,
          isTokenPrefixMatching: state.isTokenPrefixMatching,
        });
        state = state.previousState;
      }
      return { tokens, prefixMatchCount: match.prefixMatchCount, matchedTokenLength: match.matchedTokenLength, rangeCount: match.rangeCount };
    }).sort(compareCandidateResult);
    const bestMatchOfDocument = sortedMatches[0]!;
    const documentText = documents[documentId]!;
    const matchRatio = bestMatchOfDocument.matchedTokenLength / documentCodePoints[documentId]!.length;
    const matchRatioLevel = Math.round(matchRatio * 5);
    return {
      documentId,
      documentText,
      documentCodePoints: documentCodePoints[documentId]!,
      tokens: bestMatchOfDocument.tokens,
      prefixMatchCount: bestMatchOfDocument.prefixMatchCount,
      rangeCount: bestMatchOfDocument.rangeCount,
      matchRatio,
      matchRatioLevel,
    };
  }).sort(compareFinalResult);
};

// For debugging
export const inspectSearchResult = (resultDocument: SearchResult, htmlHighlight: boolean) => {
  const { documentText, tokens, rangeCount, matchRatio, matchRatioLevel } = resultDocument;
  const escapeHtml = (s: string) => s.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
  const escapedText = htmlHighlight ? highlightSearchResult(resultDocument).map(part =>
    typeof part === 'string' ? escapeHtml(part) : `<u><b>${escapeHtml(part.highlight)}</b></u>`).join('') : JSON.stringify(documentText);
  const description = ` (${rangeCount} ranges, ${Math.round(matchRatio * 10000) / 10000} => L${matchRatioLevel})`;
  return [
    escapedText + (htmlHighlight ? `<code>${description}</code>` : description),
    ...tokens.map(token => {
      let escapedTokenText = JSON.stringify(token.definition.text);
      let escapedDocumentText = JSON.stringify([...documentText].slice(token.documentOffset.start, token.documentOffset.end).join(''));
      if (htmlHighlight) {
        escapedTokenText = escapeHtml(escapedTokenText);
        escapedDocumentText = escapeHtml(escapedDocumentText);
      }
      const line = `  ${TokenType[token.definition.type]}: ${escapedTokenText} -> ${escapedDocumentText}${token.isTokenPrefixMatching ? ' (prefix match)' : ''}`;
      return htmlHighlight ? `<code>${line}</code>` : line;
    }),
    '',
  ].join('\n');
};
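
// Usage sketch (assuming `results` came from searchInvertedIndex):
//   console.log(results.map(result => inspectSearchResult(result, false)).join('\n'));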
@@ -0,0 +1,58 @@
import type { TrieNode } from '../common';

export const deserializeTrie = (data: number[]) => {
  const nodes: TrieNode[] = [];
  const getNode = (id: number) => nodes[id - 1] ??= { parent: undefined, children: new Map(), tokenIds: [], subTreeTokenIds: [] };
  let currentId = 0;
  for (let i = 0; i < data.length;) {
    const node = getNode(++currentId);
    const parentId = data[i++]!;
    node.parent = parentId !== 0 ? getNode(parentId) : undefined;

    let endOfChildren = i;
    while (endOfChildren < data.length && data[endOfChildren]! > 0) endOfChildren++;
    const numberOfChildren = (endOfChildren - i) / 2;
    for (let j = i; j < i + numberOfChildren; j++) {
      const codePoint = data[j]!;
      const child = getNode(data[j + numberOfChildren]!);
      node.children.set(codePoint, child);
    }
    i = endOfChildren;

    if (data[i] === 0) i++; // No token IDs
    else while (i < data.length && data[i]! < 0) node.tokenIds.push(-data[i++]! - 1);
  }
  const root = nodes[0]!;

  // DFS to construct code point paths for each token
  const tokenCodePoints = new Map<number, string[]>();
  const currentCodePoints: string[] = [];
  const dfsCodePoints = (node: TrieNode) => {
    for (const tokenId of node.tokenIds) tokenCodePoints.set(tokenId, [...currentCodePoints]);
    for (const [codePoint, child] of node.children.entries()) {
      if (child.parent !== node) continue; // Skip grafted paths as these are not the canonical representation of the tokens
      currentCodePoints.push(String.fromCodePoint(codePoint));
      dfsCodePoints(child);
      currentCodePoints.pop();
    }
  };
  dfsCodePoints(root);

  // DFS to construct subTreeTokenIds for each node
  const visitedNodes = new Set<TrieNode>();
  const dfsSubTreeTokenIds = (node: TrieNode) => {
    if (visitedNodes.has(node)) return node.subTreeTokenIds;
    visitedNodes.add(node);
    node.subTreeTokenIds = [...node.tokenIds, ...new Set([...node.children.values()].flatMap(child => dfsSubTreeTokenIds(child)))];
    return node.subTreeTokenIds;
  };
  dfsSubTreeTokenIds(root);

  return {
    root,
    tokenCodePoints,
  };
};
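
// Note: grafted edges survive the serialize/deserialize round trip because node IDs are
// deduplicated, so a grafted child simply appears in multiple `children` maps while its
// `parent` keeps pointing at the canonical path.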

export const getTrieNodeTokenIds = (node: TrieNode | undefined, includeSubTree: boolean) =>
  (includeSubTree ? node?.subTreeTokenIds : node?.tokenIds) ?? [];
@@ -0,0 +1,23 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "jsx": "preserve",
    "lib": ["DOM", "DOM.Iterable", "ESNext", "WebWorker"],
    "module": "ESNext",
    "moduleResolution": "Bundler",
    "noUncheckedIndexedAccess": true,
    "resolveJsonModule": true,
    "allowJs": true,
    "strict": true,
    "strictNullChecks": true,
    "noEmit": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "isolatedModules": true,
    "skipLibCheck": true,
    "rootDir": ".",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts"],
  "exclude": ["dist", "node_modules"]
}
@@ -0,0 +1,15 @@
import { defineConfig } from 'tsdown';

export default defineConfig({
  entry: [
    './src/index.ts',
    './src/searcher/index.ts',
    './src/indexer/index.ts',
    './src/common/index.ts',
  ],
  dts: true,
  unused: true,
  fixedExtension: true,
  unbundle: true,
  sourcemap: true,
});