feat: initial commit
Symlink
+1
@@ -0,0 +1 @@
../../LICENSE
@@ -0,0 +1,72 @@
# `@maigolabs/needle`

Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.

See also the [in-browser demo](https://needle.maigo.dev).

## Install

Dictionaries are installed as dependencies of the package, but if you don't use the indexer, they can be tree-shaken when bundling.

```bash
pnpm install @maigolabs/needle
```

## Usage

### Indexing

NeedLe uses Kuromoji for Japanese tokenization, which loads dictionaries dynamically. You need to create a Kuromoji `TokenizerBuilder` first.

In Node.js you can just load the dictionary from the file system:

```ts
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
const kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
```

In the browser you need to provide a custom loader that fetches the dictionary files with `fetch()`:

```ts
import { TokenizerBuilder } from '@patdx/kuromoji';

// You can load dict files from a CDN (see also the README of https://github.com/patdx/kuromoji.js)
const kuromoji = await new TokenizerBuilder({
  loader: {
    loadArrayBuffer: async (url: string) => {
      url = `https://cdn.jsdelivr.net/npm/@aiktb/kuromoji@1.0.2/dict/${url.replace('.gz', '')}`;
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Failed to fetch ${url}`);
      return await res.arrayBuffer();
    },
  },
}).build();
```

After creating the Kuromoji instance, you can build the inverted index:

```ts
import { buildInvertedIndex } from '@maigolabs/needle/indexer';

const documents = ['你好世界', 'こんにちは'];
const compressedIndex = buildInvertedIndex(documents, { kuromoji });

// The built index can be stored for later use.
const json = JSON.stringify(compressedIndex);
```

### Searching

If you only import the searcher in your frontend code, the indexer and dictionary-related dependencies will be tree-shaken.

```ts
import { loadInvertedIndex, searchInvertedIndex } from '@maigolabs/needle/searcher';

const loadedIndex = loadInvertedIndex(compressedIndex);
const results = searchInvertedIndex(loadedIndex, 'sekai');
for (const result of results) console.log(`${result.documentText} (${(result.matchRatio * 100).toFixed(0)}%)`);
// → 你好世界 (50%)
```

To highlight the matched parts of a search result, see also `highlightSearchResult`.
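A minimal sketch of highlighting, assuming the `results` from the example above: `highlightSearchResult` takes a single search result and returns an array of parts, where plain strings are unmatched text and `{ highlight }` objects are matched spans.

```ts
import { highlightSearchResult } from '@maigolabs/needle/searcher';

const parts = highlightSearchResult(results[0]!);
// Bracket matched spans here for illustration; a real UI would wrap them in e.g. <mark>.
const rendered = parts.map(part => (typeof part === 'string' ? part : `[${part.highlight}]`)).join('');
console.log(rendered); // e.g. 你好[世界]
```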
@@ -0,0 +1,18 @@
import type { Config } from 'jest';

const config: Config = {
  preset: 'ts-jest/presets/default-esm',
  testEnvironment: 'node',
  extensionsToTreatAsEsm: ['.ts'],
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  transform: {
    '^.+\\.tsx?$': ['ts-jest', { useESM: true }],
  },
  testMatch: ['**/*.test.ts'],
  testTimeout: 30000,
};

export default config;
@@ -0,0 +1,84 @@
{
  "name": "@maigolabs/needle",
  "version": "1.0.1",
  "description": "Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.",
  "type": "module",
  "main": "./src/index.ts",
  "scripts": {
    "build": "tsdown",
    "typecheck": "tsc",
    "test": "cross-env NODE_OPTIONS=--experimental-vm-modules jest",
    "prepare": "pnpm run build"
  },
  "license": "AGPL-3.0",
  "homepage": "https://needle.maigo.dev",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/MaigoLabs/needLe.git",
    "directory": "packages/needle"
  },
  "bugs": "https://github.com/MaigoLabs/needLe/issues",
  "keywords": [
    "needle",
    "search",
    "fuzzy",
    "cjk",
    "chinese",
    "japanese",
    "pinyin",
    "romaji"
  ],
  "author": "Menci <mencici@msn.com>",
  "sideEffects": false,
  "exports": {
    ".": "./src/index.ts",
    "./common": "./src/common/index.ts",
    "./indexer": "./src/indexer/index.ts",
    "./searcher": "./src/searcher/index.ts",
    "./package.json": "./package.json"
  },
  "packageManager": "pnpm@10.20.0",
  "dependencies": {
    "@patdx/kuromoji": "^1.0.4",
    "hepburn": "^1.2.2",
    "opencc-js": "^1.0.5",
    "pinyin-pro": "^3.27.0"
  },
  "devDependencies": {
    "@types/hepburn": "^1.2.2",
    "@types/jest": "^30.0.0",
    "@types/opencc-js": "^1.0.3",
    "jest": "^30.2.0",
    "ts-jest": "^29.4.6"
  },
  "files": [
    "README.md",
    "dist",
    "package.json"
  ],
  "publishConfig": {
    "access": "public",
    "main": "./dist/index.mjs",
    "module": "./dist/index.mjs",
    "types": "./dist/index.d.mts",
    "exports": {
      ".": {
        "types": "./dist/index.d.mts",
        "default": "./dist/index.mjs"
      },
      "./common": {
        "types": "./dist/common/index.d.mts",
        "default": "./dist/common/index.mjs"
      },
      "./indexer": {
        "types": "./dist/indexer/index.d.mts",
        "default": "./dist/indexer/index.mjs"
      },
      "./searcher": {
        "types": "./dist/searcher/index.d.mts",
        "default": "./dist/searcher/index.mjs"
      },
      "./package.json": "./package.json"
    }
  }
}
@@ -0,0 +1,4 @@
export * from './types';
export * from './utils';
export * from './normalize';
export * from './trie';
@@ -0,0 +1,60 @@
import { normalizeByCodePoint, toKatakana } from './normalize';

describe('toKatakana', () => {
  it('should convert hiragana to katakana', () => {
    expect(toKatakana('あいうえお')).toBe('アイウエオ');
    expect(toKatakana('かきくけこ')).toBe('カキクケコ');
    expect(toKatakana('さしすせそ')).toBe('サシスセソ');
  });

  it('should keep katakana unchanged', () => {
    expect(toKatakana('アイウエオ')).toBe('アイウエオ');
  });

  it('should keep non-kana characters unchanged', () => {
    expect(toKatakana('abc123')).toBe('abc123');
    expect(toKatakana('漢字')).toBe('漢字');
  });

  it('should handle mixed input', () => {
    expect(toKatakana('あアa漢')).toBe('アアa漢');
  });
});

describe('normalizeByCodePoint', () => {
  it('should convert fullwidth ASCII to halfwidth lowercase', () => {
    expect(normalizeByCodePoint('ＡＢＣ')).toBe('abc');
    expect(normalizeByCodePoint('１２３')).toBe('123');
    expect(normalizeByCodePoint('！＠＃')).toBe('!@#');
  });

  it('should convert fullwidth space to halfwidth space', () => {
    expect(normalizeByCodePoint('　')).toBe(' ');
  });

  it('should convert halfwidth kana to fullwidth kana', () => {
    expect(normalizeByCodePoint('ｱｲｳｴｵ')).toBe('アイウエオ');
    expect(normalizeByCodePoint('ｶｷｸｹｺ')).toBe('カキクケコ');
  });

  it('should normalize voiced/semi-voiced sound marks', () => {
    expect(normalizeByCodePoint('ﾞ')).toBe('\u3099'); // halfwidth voiced -> combining
    expect(normalizeByCodePoint('ﾟ')).toBe('\u309A'); // halfwidth semi-voiced -> combining
    expect(normalizeByCodePoint('゛')).toBe('\u3099'); // fullwidth voiced -> combining
    expect(normalizeByCodePoint('゜')).toBe('\u309A'); // fullwidth semi-voiced -> combining
  });

  it('should convert halfwidth punctuation to fullwidth', () => {
    expect(normalizeByCodePoint('｡')).toBe('。');
    expect(normalizeByCodePoint('｢')).toBe('「');
    expect(normalizeByCodePoint('｣')).toBe('」');
    expect(normalizeByCodePoint('､')).toBe('、');
    expect(normalizeByCodePoint('･')).toBe('・');
  });

  it('should lowercase regular ASCII', () => {
    expect(normalizeByCodePoint('ABC')).toBe('abc');
  });

  // Should keep hiragana unchanged
});
@@ -0,0 +1,42 @@
export const normalizeByCodePoint = (string: string) => [...string].map(normalizeCodePoint).join('');

export const normalizeCodePoint = (char: string) => {
  const codePoint = char.codePointAt(0)!;
  // Fullwidth ASCII -> Halfwidth ASCII
  if (codePoint >= 0xFF01 && codePoint <= 0xFF5E) return String.fromCodePoint(codePoint - 0xFEE0).toLowerCase();
  // Fullwidth space -> Halfwidth space
  else if (codePoint === /* '　' */ 0x3000) return ' ';
  // Halfwidth kana (U+FF66 - U+FF9D) -> Fullwidth kana
  else if (codePoint >= 0xFF66 && codePoint <= 0xFF9D) return HALF_TO_FULL_KANA[char] ?? char;
  else if (codePoint === /* '｡' */ 0xFF61) return '。';
  else if (codePoint === /* '｢' */ 0xFF62) return '「';
  else if (codePoint === /* '｣' */ 0xFF63) return '」';
  else if (codePoint === /* '､' */ 0xFF64) return '、';
  else if (codePoint === /* '･' */ 0xFF65) return '・';
  else if (codePoint === /* 'ﾞ' */ 0xFF9E || codePoint === /* '゛' */ 0x309B) return '\u3099'; // -> COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
  else if (codePoint === /* 'ﾟ' */ 0xFF9F || codePoint === /* '゜' */ 0x309C) return '\u309A'; // -> COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
  else return char.toLowerCase();
};

const HALF_TO_FULL_KANA: Record<string, string> = {
  'ｦ': 'ヲ', 'ｧ': 'ァ', 'ｨ': 'ィ', 'ｩ': 'ゥ', 'ｪ': 'ェ', 'ｫ': 'ォ',
  'ｬ': 'ャ', 'ｭ': 'ュ', 'ｮ': 'ョ', 'ｯ': 'ッ',
  'ｰ': 'ー',
  'ｱ': 'ア', 'ｲ': 'イ', 'ｳ': 'ウ', 'ｴ': 'エ', 'ｵ': 'オ',
  'ｶ': 'カ', 'ｷ': 'キ', 'ｸ': 'ク', 'ｹ': 'ケ', 'ｺ': 'コ',
  'ｻ': 'サ', 'ｼ': 'シ', 'ｽ': 'ス', 'ｾ': 'セ', 'ｿ': 'ソ',
  'ﾀ': 'タ', 'ﾁ': 'チ', 'ﾂ': 'ツ', 'ﾃ': 'テ', 'ﾄ': 'ト',
  'ﾅ': 'ナ', 'ﾆ': 'ニ', 'ﾇ': 'ヌ', 'ﾈ': 'ネ', 'ﾉ': 'ノ',
  'ﾊ': 'ハ', 'ﾋ': 'ヒ', 'ﾌ': 'フ', 'ﾍ': 'ヘ', 'ﾎ': 'ホ',
  'ﾏ': 'マ', 'ﾐ': 'ミ', 'ﾑ': 'ム', 'ﾒ': 'メ', 'ﾓ': 'モ',
  'ﾔ': 'ヤ', 'ﾕ': 'ユ', 'ﾖ': 'ヨ',
  'ﾗ': 'ラ', 'ﾘ': 'リ', 'ﾙ': 'ル', 'ﾚ': 'レ', 'ﾛ': 'ロ',
  'ﾜ': 'ワ', 'ﾝ': 'ン',
};

const isHiraganaRange = (charCode: number) => (charCode >= 0x3041 && charCode <= 0x3096) || (charCode >= 0x309D && charCode <= 0x309E);
export const toKatakanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  return isHiraganaRange(code) ? String.fromCharCode(code + 0x60) : char;
};
export const toKatakana = (string: string) => [...string].map(toKatakanaSingle).join('');
@@ -0,0 +1,17 @@
export interface TrieNode {
  parent: TrieNode | undefined;
  children: Map<number, TrieNode>; // Unicode code point -> child node
  tokenIds: number[];
  subTreeTokenIds: number[]; // Empty on root. Will Uint16Array be faster?
}

export const traverseTrieStep = (node: TrieNode | undefined, codePoint: string, ignorableCodePoints?: RegExp) =>
  node?.children.get(codePoint.codePointAt(0)!) ?? (ignorableCodePoints?.test(codePoint) ? node : undefined);
export const traverseTrie = (node: TrieNode | undefined, text: string, ignorableCodePoints?: RegExp) => {
  if (!node) return;
  for (const codePoint of text) {
    node = traverseTrieStep(node, codePoint, ignorableCodePoints);
    if (!node) return;
  }
  return node;
};
@@ -0,0 +1,31 @@
export enum TokenType {
  Raw,
  Kana,
  Romaji,
  Han,
  Pinyin,
}

export interface TokenDefinition {
  id: number;
  type: TokenType;
  text: string;
  codePointLength: number;
}

// [start, end)
export interface OffsetSpan {
  start: number;
  end: number;
}

export type CompressedInvertedIndex = {
  documents: string[];
  tokenTypes: TokenType[];
  // e.g. tokenReferences[3] === [[0, 1, 3, 5, 7]] means token 3 occurs in document 0 at spans [1, 3) and [5, 7)
  tokenReferences: number[][][]; // tokenId -> [documentId, start1, end1, start2, end2, ...][]
  tries: {
    romaji: number[];
    kana: number[];
    other: number[];
  };
};
@@ -0,0 +1,3 @@
import type { OffsetSpan } from './types';

export const getSpanLength = (offset: OffsetSpan) => offset.end - offset.start;
@@ -0,0 +1,73 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { buildInvertedIndex, type KuromojiTokenizer } from '../indexer';
import { highlightSearchResult, loadInvertedIndex, searchInvertedIndex } from '../searcher';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('search', () => {
  const testDocuments = [
    'ミーティア',
    'エンドマークに希望と涙を添えて',
    '宵の鳥',
    '僕の和風本当上手',
  ];

  it('should match with mixed search query', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');

    // Should have at least one result
    expect(results.length).toBeGreaterThan(0);

    // The first result should be "僕の和風本当上手"
    expect(results[0]!.documentText).toBe('僕の和風本当上手');
  });

  it('should highlight search result correctly', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');
    expect(results.length).toBeGreaterThan(0);

    const highlighted = highlightSearchResult(results[0]!);

    // Should be an array of parts
    expect(Array.isArray(highlighted)).toBe(true);
    expect(highlighted.length).toBeGreaterThan(0);

    // Collect highlighted text
    const highlightedTexts = highlighted
      .filter((part): part is { highlight: string } => typeof part !== 'string')
      .map(part => part.highlight);

    expect(highlightedTexts.some(text => text.includes('僕'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('の'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('和'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('風'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('上'))).toBe(true);
  });

  it('should match romaji input to kana documents', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);

    // Search for "yoi" should match "宵の鳥"
    const results = searchInvertedIndex(invertedIndex, 'yoi');
    const matchedTexts = results.map(r => r.documentText);

    expect(matchedTexts).toContain('宵の鳥');
  });
});
@@ -0,0 +1,111 @@
import { traverseTrie } from '../common';
import { buildTrie, serializeTrie } from '../indexer/trie';
import { deserializeTrie } from '../searcher/trie';

describe('Trie building', () => {
  it('should build a Trie with multiple different tokens', () => {
    const trie = buildTrie([
      [0, 'hello'],
      [1, 'help'],
      [2, 'world'],
      [3, 'word'],
    ]);

    // Traverse to verify structure
    const helloNode = traverseTrie(trie, 'hello');
    const helpNode = traverseTrie(trie, 'help');
    const worldNode = traverseTrie(trie, 'world');
    const wordNode = traverseTrie(trie, 'word');

    expect(helloNode).toBeDefined();
    expect(helpNode).toBeDefined();
    expect(worldNode).toBeDefined();
    expect(wordNode).toBeDefined();

    // Check token IDs
    expect(helloNode!.tokenIds).toContain(0);
    expect(helpNode!.tokenIds).toContain(1);
    expect(worldNode!.tokenIds).toContain(2);
    expect(wordNode!.tokenIds).toContain(3);

    // Check that 'hel' prefix node has both tokens in subTree
    const helNode = traverseTrie(trie, 'hel');
    expect(helNode).toBeDefined();
    expect(helNode!.subTreeTokenIds).toContain(0);
    expect(helNode!.subTreeTokenIds).toContain(1);
  });

  it('should handle Japanese text tokens', () => {
    const trie = buildTrie([
      [0, 'さくら'],
      [1, 'サクラ'],
      [2, '桜'],
    ]);

    expect(traverseTrie(trie, 'さくら')?.tokenIds).toContain(0);
    expect(traverseTrie(trie, 'サクラ')?.tokenIds).toContain(1);
    expect(traverseTrie(trie, '桜')?.tokenIds).toContain(2);
  });
});

describe('Trie serialization and deserialization', () => {
  it('should serialize and deserialize a Trie correctly', () => {
    const originalTrie = buildTrie([
      [0, 'apple'],
      [1, 'app'],
      [2, 'banana'],
    ]);

    // Serialize
    const serialized = serializeTrie(originalTrie);
    expect(Array.isArray(serialized)).toBe(true);
    expect(serialized.length).toBeGreaterThan(0);

    // Deserialize
    const { root: deserializedTrie, tokenCodePoints } = deserializeTrie(serialized);

    // Verify structure is preserved
    const appleNode = traverseTrie(deserializedTrie, 'apple');
    const appNode = traverseTrie(deserializedTrie, 'app');
    const bananaNode = traverseTrie(deserializedTrie, 'banana');

    expect(appleNode).toBeDefined();
    expect(appNode).toBeDefined();
    expect(bananaNode).toBeDefined();

    expect(appleNode!.tokenIds).toContain(0);
    expect(appNode!.tokenIds).toContain(1);
    expect(bananaNode!.tokenIds).toContain(2);

    // Verify tokenCodePoints map
    expect(tokenCodePoints.get(0)?.join('')).toBe('apple');
    expect(tokenCodePoints.get(1)?.join('')).toBe('app');
    expect(tokenCodePoints.get(2)?.join('')).toBe('banana');

    // Verify subTreeTokenIds are reconstructed
    expect(appNode!.subTreeTokenIds).toContain(0);
    expect(appNode!.subTreeTokenIds).toContain(1);
  });

  it('should preserve parent references after deserialization', () => {
    const originalTrie = buildTrie([
      [0, 'test'],
    ]);

    const serialized = serializeTrie(originalTrie);
    const { root } = deserializeTrie(serialized);

    const testNode = traverseTrie(root, 'test');
    expect(testNode).toBeDefined();

    // Walk back to root via parent references
    let node = testNode;
    let depth = 0;
    while (node?.parent) {
      node = node.parent;
      depth++;
    }
    expect(depth).toBe(4); // 't' -> 'e' -> 's' -> 't' -> root
    expect(node).toBe(root);
  });
});
@@ -0,0 +1,3 @@
export * from './common';
export * from './indexer';
export * from './searcher';
@@ -0,0 +1,103 @@
import { getHanVariants, getPinyinCandidates, isHanCharacter, unionFindSet } from './han';

describe('unionFindSet', () => {
  it('should find self as root initially', () => {
    const ufs = unionFindSet<number>();
    expect(ufs.find(1)).toBe(1);
    expect(ufs.find(2)).toBe(2);
  });

  it('should union two elements', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    expect(ufs.find(1)).toBe(ufs.find(2));
  });

  it('should union multiple elements transitively', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    ufs.union(2, 3);
    ufs.union(4, 5);
    expect(ufs.find(1)).toBe(ufs.find(3));
    expect(ufs.find(1)).not.toBe(ufs.find(4));
    ufs.union(3, 4);
    expect(ufs.find(1)).toBe(ufs.find(5));
  });

  it('should iterate all keys', () => {
    const ufs = unionFindSet<string>();
    ufs.union('a', 'b');
    ufs.union('c', 'd');
    const keys = [...ufs.keys()];
    expect(keys).toContain('a');
    expect(keys).toContain('b');
    expect(keys).toContain('c');
    expect(keys).toContain('d');
  });
});

describe('isHanCharacter', () => {
  it('should return true for CJK characters', () => {
    expect(isHanCharacter('中')).toBe(true);
    expect(isHanCharacter('国')).toBe(true);
    expect(isHanCharacter('日')).toBe(true);
    expect(isHanCharacter('本')).toBe(true);
  });

  it('should return false for non-CJK characters', () => {
    expect(isHanCharacter('a')).toBe(false);
    expect(isHanCharacter('あ')).toBe(false);
    expect(isHanCharacter('ア')).toBe(false);
    expect(isHanCharacter('1')).toBe(false);
  });
});

describe('getHanVariants', () => {
  it('should return variants for simplified/traditional characters', () => {
    // 国 (simplified) and 國 (traditional) should be variants of each other
    const variants1 = getHanVariants('国');
    const variants2 = getHanVariants('國');
    expect(variants1).toContain('国');
    expect(variants1).toContain('國');
    expect(variants2).toContain('国');
    expect(variants2).toContain('國');
  });

  it('should return the character itself for characters without variants', () => {
    const variants = getHanVariants('一');
    expect(variants).toContain('一');
  });

  it('should return empty array for non-Han characters', () => {
    expect(getHanVariants('a')).toEqual([]);
    expect(getHanVariants('あ')).toEqual([]);
  });
});

describe('getPinyinCandidates', () => {
  it('should return pinyin for a Han character', () => {
    const candidates = getPinyinCandidates('中');
    expect(candidates).toContain('zhong');
    expect(candidates).toContain('zh'); // initial
    expect(candidates).toContain('z'); // first letter
  });

  it('should return multiple pinyin for polyphonic characters', () => {
    // 行 can be "xing" or "hang"
    const candidates = getPinyinCandidates('行');
    expect(candidates).toContain('xing');
    expect(candidates).toContain('hang');
  });

  it('should include fuzzy pinyin variants', () => {
    // 风 is "feng", should also have fuzzy variant "fen"
    const candidates = getPinyinCandidates('风');
    expect(candidates).toContain('feng');
    expect(candidates).toContain('fen'); // fuzzy: eng -> en
  });

  it('should return empty array for non-Han characters', () => {
    expect(getPinyinCandidates('a')).toEqual([]);
    expect(getPinyinCandidates('あ')).toEqual([]);
  });
});
@@ -0,0 +1,85 @@
// @ts-expect-error No declaration file
import hkVariants from 'opencc-js/dict/HKVariants';
// @ts-expect-error No declaration file
import hkVariantsRev from 'opencc-js/dict/HKVariantsRev';
// @ts-expect-error No declaration file
import jpVariants from 'opencc-js/dict/JPVariants';
// @ts-expect-error No declaration file
import jpVariantsRev from 'opencc-js/dict/JPVariantsRev';
// @ts-expect-error No declaration file
import stCharacters from 'opencc-js/dict/STCharacters';
// @ts-expect-error No declaration file
import tsCharacters from 'opencc-js/dict/TSCharacters';
// @ts-expect-error No declaration file
import twVariants from 'opencc-js/dict/TWVariants';
// @ts-expect-error No declaration file
import twVariantsRev from 'opencc-js/dict/TWVariantsRev';
import { polyphonic } from 'pinyin-pro';

export const unionFindSet = <T>() => {
  const parent = new Map<T, T>();
  const rank = new Map<T, number>();
  const find = (x: T): T => {
    const p = parent.get(x);
    if (p == null) {
      parent.set(x, x);
      return x;
    } else if (p === x) return x;
    else {
      const root = find(p);
      parent.set(x, root);
      return root;
    }
  };
  const union = (x: T, y: T) => {
    x = find(x);
    y = find(y);
    if (x === y) return;
    const rankX = rank.get(x) ?? 0, rankY = rank.get(y) ?? 0;
    if (rankX < rankY) parent.set(x, y);
    else if (rankX > rankY) parent.set(y, x);
    else {
      parent.set(y, x);
      rank.set(x, rankX + 1);
    }
  };
  const keys = () => parent.keys();
  return { find, union, keys };
};

const exchangeMap = (() => {
  const ufs = unionFindSet<string>();
  for (const dict of [hkVariants, hkVariantsRev, jpVariants, jpVariantsRev, stCharacters, tsCharacters, twVariants, twVariantsRev] as string[]) {
    for (const [from, to] of dict.split('|').map(pair => pair.split(' '))) {
      if (!from || !to || [...from].length !== 1 || [...to].length !== 1) continue;
      ufs.union(from, to);
    }
  }
  const map = new Map<string, string[]>();
  for (const key of ufs.keys()) {
    const root = ufs.find(key);
    let list = map.get(root);
    if (!list) map.set(root, list = []);
    if (key !== root) map.set(key, list);
    list.push(key);
  }
  for (const list of map.values()) list.sort();
  return map;
})();

export const isHanCharacter = (phrase: string) => /^[\p{Script=Han}]+$/u.test(phrase);

export const getHanVariants = (character: string) => exchangeMap.get(character) ?? (isHanCharacter(character) ? [character] : []);

const PINYIN_INITIALS: string[] = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w'];
const PINYIN_FINALS_FUZZY_MAP: Record<string, string> = { 'ang': 'an', 'eng': 'en', 'ing': 'in' };
export const getPinyinCandidates = (character: string) => {
  const pinyins = polyphonic(character, { type: 'array', toneType: 'none', removeNonZh: true })[0] ?? [];
  return Array.from(new Set(pinyins.filter(fullPinyin => fullPinyin).flatMap(fullPinyin => {
    const initial = PINYIN_INITIALS.find(initial => fullPinyin.startsWith(initial));
    const initialAlphabet = initial?.[0] ?? fullPinyin[0]!;
    const fuzzySuffix = fullPinyin.slice(-3);
    const fuzzyPinyin = fuzzySuffix in PINYIN_FINALS_FUZZY_MAP ? fullPinyin.slice(0, -3) + PINYIN_FINALS_FUZZY_MAP[fuzzySuffix] : undefined;
    return [fullPinyin, initial, initialAlphabet, fuzzyPinyin].filter((s): s is string => !!s);
  })));
};
@@ -0,0 +1,5 @@
export * from './han';
export * from './japanese';
export * from './tokenizer';
export * from './trie';
export * from './inverted-index';
@@ -0,0 +1,46 @@
import { NORMALIZE_RULES_KANA_DAKUTEN, NORMALIZE_RULES_ROMAJI } from './japanese';
import { createTokenizer, type TokenizerOptions } from './tokenizer';
import { buildTrie, graftTriePaths, serializeTrie } from './trie';
import type { CompressedInvertedIndex, TokenDefinition } from '../common/types';
import { TokenType } from '../common/types';

const buildTypedTrie = (tokens: TokenDefinition[], typePredicate: (tokenType: TokenType) => boolean) =>
  buildTrie(tokens.filter(token => typePredicate(token.type)).map(token => [token.id, token.text]));

export const buildInvertedIndex = (documents: string[], tokenizerOptions: TokenizerOptions) => {
  const tokenizer = createTokenizer(tokenizerOptions);
  const documentTokens = documents.map(document => tokenizer.tokenize(document));

  const tokenDefinitions = [...tokenizer.tokens.values()];
  const romajiRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Romaji);
  const kanaRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Kana);
  const otherRoot = buildTypedTrie(tokenDefinitions, type => type !== TokenType.Romaji && type !== TokenType.Kana);
  graftTriePaths(romajiRoot, NORMALIZE_RULES_ROMAJI);
  graftTriePaths(kanaRoot, NORMALIZE_RULES_KANA_DAKUTEN);

  const invertedIndex: CompressedInvertedIndex = {
    documents,
    tokenTypes: tokenDefinitions.map(token => token.type),
    tokenReferences: Array.from({ length: tokenDefinitions.length }, () => []),
    tries: {
      romaji: serializeTrie(romajiRoot),
      kana: serializeTrie(kanaRoot),
      other: serializeTrie(otherRoot),
    },
  };
  for (const [documentId, tokens] of documentTokens.entries()) {
    const tokenOccurrences = new Map<number, number[]>();
    for (const token of tokens) {
      let occurrences = tokenOccurrences.get(token.id);
      if (!occurrences) {
        occurrences = [];
        tokenOccurrences.set(token.id, occurrences);
      }
      occurrences.push(token.start, token.end);
    }
    for (const [tokenId, occurrences] of tokenOccurrences) {
      invertedIndex.tokenReferences[tokenId]!.push([documentId, ...occurrences]);
    }
  }
  return invertedIndex;
};
@@ -0,0 +1,66 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { getAllKanaReadings, toRomajiStrictly } from './japanese';
import type { KuromojiTokenizer } from './tokenizer';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('toRomajiStrictly', () => {
  it('should convert basic kana to romaji', () => {
    expect(toRomajiStrictly('あ')).toBe('a');
    expect(toRomajiStrictly('か')).toBe('ka');
    expect(toRomajiStrictly('さくら')).toBe('sakura');
  });

  it('should convert katakana to romaji', () => {
    expect(toRomajiStrictly('ア')).toBe('a');
    expect(toRomajiStrictly('カ')).toBe('ka');
    expect(toRomajiStrictly('サクラ')).toBe('sakura');
  });

  it('should handle long vowels', () => {
    expect(toRomajiStrictly('おう')).toBe('ou');
    expect(toRomajiStrictly('おお')).toBe('oo');
  });

  it('should return empty string for invalid first character', () => {
    expect(toRomajiStrictly('ー')).toBe(''); // prolonged sound mark cannot be first
    expect(toRomajiStrictly('ゃ')).toBe(''); // small ya cannot be first
  });

  it('should return empty string for invalid last character', () => {
    expect(toRomajiStrictly('っ')).toBe(''); // small tsu cannot be last
  });

  it('should handle gemination (small tsu)', () => {
    expect(toRomajiStrictly('かった')).toBe('katta');
  });
});

describe('getAllKanaReadings', () => {
  it('should return katakana reading for pure kana input', () => {
    const readings = getAllKanaReadings(kuromoji, 'あ');
    expect(readings).toContain('ア');
  });

  it('should return readings for kanji', () => {
    const readings = getAllKanaReadings(kuromoji, '僕');
    expect(readings.length).toBeGreaterThan(0);
    // 僕 should have reading ボク
    expect(readings).toContain('ボク');
  });

  it('should return readings for compound words', () => {
    const readings = getAllKanaReadings(kuromoji, '和風');
    expect(readings.length).toBeGreaterThan(0);
  });
});
@@ -0,0 +1,158 @@
import { fromKana } from 'hepburn';

import type { KuromojiTokenizer } from './tokenizer';
import { toKatakana } from '../common';

// We have normalized all other sound marks to \u3099 and \u309A (combining kata-hiragana voiced/semi-voiced sound marks)
export const isMaybeJapanese = (phrase: string) => /^[\p{Script=Han}\u3041-\u309F\u30A0-\u30FF\u3005\u3006\u30FC\u3099\u309A]+$/u.test(phrase);

// See also normalize.ts
export const isJapaneseSoundMark = (phrase: string) => /^[\u3099\u309A]+$/.test(phrase);
export const stripJapaneseSoundMarks = (phrase: string) => phrase.replaceAll('\u3099', '').replaceAll('\u309A', '');

export const isKanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  return (code >= 0x3041 && code <= 0x309F) || (code >= 0x30A0 && code <= 0x30FF);
};
export const isKana = (phrase: string) => [...phrase].every(isKanaSingle);

const KANAS_CANNOT_BE_FIRST = [
  'ァ', 'ィ', 'ゥ', 'ェ', 'ォ',
  'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ',
  'ャ', 'ュ', 'ョ',
  'ゃ', 'ゅ', 'ょ',
  'ヮ', 'ゎ',
  'ㇰ', 'ㇱ', 'ㇲ', 'ㇳ', 'ㇴ', 'ㇵ', 'ㇶ', 'ㇷ', 'ㇸ', 'ㇹ', 'ㇺ', 'ㇻ', 'ㇼ', 'ㇽ', 'ㇾ', 'ㇿ',
  'ー',
];
const KANAS_CANNOT_BE_LAST = [
  'ッ', 'っ',
];
export const toRomajiStrictly = (kana: string) => {
  if (KANAS_CANNOT_BE_FIRST.includes(kana[0]!)) return '';
  if (KANAS_CANNOT_BE_LAST.includes(kana[kana.length - 1]!)) return '';
  const romaji = fromKana(kana).toLowerCase()
    .replaceAll('ā', 'aa')
    .replaceAll('ī', 'ii')
    .replaceAll('ū', 'uu')
    .replaceAll('ē', 'ee')
    .replaceAll('ō', 'ou');
  if (!romaji.match(/^[a-z]+$/)) return '';
  return romaji;
};

export const createTranscriptionEnumerator = (
  isValidPhrase: (codePoints: string[], start: number, length: number) => boolean,
  getAllTranscriptions: (phrase: string) => string[],
) => (codePoints: string[]) => {
  const toKey = (start: number, length: number) => `${start}:${length}`;
  const resultMap = new Map<string, { start: number; length: number; transcriptions: string[] }>();
  for (let phraseLength = 1; phraseLength <= codePoints.length; phraseLength++) for (let start = 0; start + phraseLength <= codePoints.length; start++) {
    if (!isValidPhrase(codePoints, start, phraseLength)) continue;
    const phrase = codePoints.slice(start, start + phraseLength).join('');
    const atomicTranscriptions = [...new Set(getAllTranscriptions(phrase))].filter(candidateTranscription => {
      if (!candidateTranscription) return false;
      // Ensure the transcription is atomic (not a combination of multiple shorter transcriptions, separated by any midpoints)
      type State = { phrasePosition: number; transcriptionPosition: number };
      const toStateKey = (state: State) => `${state.phrasePosition}:${state.transcriptionPosition}`;
      const visitedStates = new Set<string>();
      const queue: State[] = [{ phrasePosition: 0, transcriptionPosition: 0 }];
      while (queue.length > 0) {
        const { phrasePosition, transcriptionPosition } = queue.shift()!;
        for (let prefixLength = 1; prefixLength <= phraseLength - phrasePosition; prefixLength++) {
          const prefixResult = resultMap.get(toKey(start + phrasePosition, prefixLength));
          if (!prefixResult) continue;
          for (const transcription of prefixResult.transcriptions) {
            if (candidateTranscription.slice(transcriptionPosition, transcriptionPosition + transcription.length) === transcription) {
              const nextState: State = { phrasePosition: phrasePosition + prefixLength, transcriptionPosition: transcriptionPosition + transcription.length };
              if (nextState.phrasePosition === phraseLength && nextState.transcriptionPosition === candidateTranscription.length) return false; // Found a valid combination
              if (visitedStates.has(toStateKey(nextState))) continue;
              visitedStates.add(toStateKey(nextState));
              queue.push(nextState);
            }
          }
        }
      }
      return true;
    });
    if (atomicTranscriptions.length > 0) resultMap.set(toKey(start, phraseLength), { start, length: phraseLength, transcriptions: atomicTranscriptions });
  }
  return [...resultMap.values()];
};

export const getAllKanaReadings = (kuromoji: KuromojiTokenizer, phrase: string) => Array.from(new Set(
  [
    ...isKana(phrase) ? [toKatakana(phrase)] : [],
    ...isKana(phrase) && [...phrase].length === 1 ? [] : ((kuromoji.token_info_dictionary.target_map[kuromoji.viterbi_builder.trie.lookup(phrase)] ?? [])
      .map(id => kuromoji.formatter.formatEntry(
        id, 0, 'KNOWN',
        kuromoji.token_info_dictionary.getFeatures(id as unknown as string)?.split(',') ?? [],
      ).reading)
      .filter((reading): reading is string => !!reading))
      .map(toKatakana),
  ],
));

const createNormalizer = (rules: Record<string, string>) => (text: string) => {
  while (true) {
    const beforeCurrentIteration = text;
    for (const [from, to] of Object.entries(rules)) text = text.replaceAll(from, to);
    if (text === beforeCurrentIteration) break;
  }
  return text;
};

export const NORMALIZE_RULES_ROMAJI: Record<string, string> = {
  // Remove all long vowels (sa-ba- -> saba)
  '-': '',
  // Collapse consecutive vowels
  'aa': 'a',
  'ii': 'i',
  'uu': 'u',
  'ee': 'e',
  'oo': 'o',
  'ou': 'o',
  // mb/mp/mm -> nb/np/nm (shimbun -> shinbun)
  'mb': 'nb',
  'mp': 'np',
  'mm': 'nm',
  // Others
  'sha': 'sya',
  'tsu': 'tu',
  'chi': 'ti',
  'shi': 'si',
  'ji': 'zi',
};
export const normalizeRomaji = createNormalizer(NORMALIZE_RULES_ROMAJI);

export const NORMALIZE_RULES_KANA_DAKUTEN: Record<string, string> = {
  'う\u3099': 'ゔ',
  'か\u3099': 'が', 'き\u3099': 'ぎ', 'く\u3099': 'ぐ', 'け\u3099': 'げ', 'こ\u3099': 'ご',
  'さ\u3099': 'ざ', 'し\u3099': 'じ', 'す\u3099': 'ず', 'せ\u3099': 'ぜ', 'そ\u3099': 'ぞ',
  'た\u3099': 'だ', 'ち\u3099': 'ぢ', 'つ\u3099': 'づ', 'て\u3099': 'で', 'と\u3099': 'ど',
  'は\u3099': 'ば', 'ひ\u3099': 'び', 'ふ\u3099': 'ぶ', 'へ\u3099': 'べ', 'ほ\u3099': 'ぼ',
  'は\u309A': 'ぱ', 'ひ\u309A': 'ぴ', 'ふ\u309A': 'ぷ', 'へ\u309A': 'ぺ', 'ほ\u309A': 'ぽ',
  'ゝ\u3099': 'ゞ',

  'ウ\u3099': 'ヴ',
  'カ\u3099': 'ガ', 'キ\u3099': 'ギ', 'ク\u3099': 'グ', 'ケ\u3099': 'ゲ', 'コ\u3099': 'ゴ',
  'サ\u3099': 'ザ', 'シ\u3099': 'ジ', 'ス\u3099': 'ズ', 'セ\u3099': 'ゼ', 'ソ\u3099': 'ゾ',
  'タ\u3099': 'ダ', 'チ\u3099': 'ヂ', 'ツ\u3099': 'ヅ', 'テ\u3099': 'デ', 'ト\u3099': 'ド',
  'ハ\u3099': 'バ', 'ヒ\u3099': 'ビ', 'フ\u3099': 'ブ', 'ヘ\u3099': 'ベ', 'ホ\u3099': 'ボ',
  'ハ\u309A': 'パ', 'ヒ\u309A': 'ピ', 'フ\u309A': 'プ', 'ヘ\u309A': 'ペ', 'ホ\u309A': 'ポ',
  'ワ\u3099': 'ヷ', 'ヰ\u3099': 'ヸ', 'ヱ\u3099': 'ヹ', 'ヲ\u3099': 'ヺ',
  'ヽ\u3099': 'ヾ',
};
export const normalizeKanaDakuten = createNormalizer(NORMALIZE_RULES_KANA_DAKUTEN);

const isValidJapanesePhrase = (codePoints: string[], start: number, length: number) =>
  // Skip splittings that cause sound marks to occur in the first position of a phrase
  !isJapaneseSoundMark(codePoints[start]!) && (start + length === codePoints.length || !isJapaneseSoundMark(codePoints[start + length]!));
export const createKanaTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => createTranscriptionEnumerator(
  isValidJapanesePhrase,
  phrase => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase))),
);
export const createRomajiTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => createTranscriptionEnumerator(
  isValidJapanesePhrase,
  phrase => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase))).map(kana => normalizeRomaji(toRomajiStrictly(kana))),
);
@@ -0,0 +1,166 @@
import path from 'node:path';
import url from 'node:url';

import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';

import { createTokenizer, type KuromojiTokenizer } from './tokenizer';
import { TokenType } from '../common/types';

let kuromoji: KuromojiTokenizer;

beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});

describe('tokenizer', () => {
  it('should tokenize mixed Japanese text', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('僕の和風本当上手');

    // Get all token definitions
    const tokenDefs = [...tokenizer.tokens.values()];

    // Should have tokens of various types
    const types = new Set(tokenDefs.map(t => t.type));
    expect(types.has(TokenType.Han)).toBe(true);
    expect(types.has(TokenType.Pinyin)).toBe(true);
    expect(types.has(TokenType.Kana)).toBe(true);
    expect(types.has(TokenType.Romaji)).toBe(true);

    const getTokenTextsAt = (pos: number, type: TokenType) => tokens
      .filter(t => t.start <= pos && t.end > pos && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);

    // Position 0: 僕
    expect(getTokenTextsAt(0, TokenType.Han)).toContain('僕');
    expect(getTokenTextsAt(0, TokenType.Pinyin)).toContain('pu');
    expect(getTokenTextsAt(0, TokenType.Kana)).toContain('ボク');
    expect(getTokenTextsAt(0, TokenType.Romaji)).toContain('boku');

    // Position 1: の (hiragana, no Han/Pinyin)
    expect(getTokenTextsAt(1, TokenType.Han)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Pinyin)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Kana)).toContain('ノ');
    expect(getTokenTextsAt(1, TokenType.Romaji)).toContain('no');

    // Position 2: 和
    expect(getTokenTextsAt(2, TokenType.Han)).toContain('和');
    expect(getTokenTextsAt(2, TokenType.Pinyin)).toContain('he');
    expect(getTokenTextsAt(2, TokenType.Kana)).toContain('ワ');
    expect(getTokenTextsAt(2, TokenType.Romaji)).toContain('wa');

    // Position 3: 風
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('風');
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('风'); // simplified variant
    expect(getTokenTextsAt(3, TokenType.Pinyin)).toContain('feng');
    expect(getTokenTextsAt(3, TokenType.Kana)).toContain('フウ');
    expect(getTokenTextsAt(3, TokenType.Romaji)).toContain('fu');

    // Position 4: 本
    expect(getTokenTextsAt(4, TokenType.Han)).toContain('本');
    expect(getTokenTextsAt(4, TokenType.Pinyin)).toContain('ben');
    expect(getTokenTextsAt(4, TokenType.Kana)).toContain('ホン');
    expect(getTokenTextsAt(4, TokenType.Romaji)).toContain('hon');

    // Position 5: 当
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('当');
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('當'); // traditional variant
    expect(getTokenTextsAt(5, TokenType.Pinyin)).toContain('dang');
    expect(getTokenTextsAt(5, TokenType.Kana)).toContain('トウ');
    expect(getTokenTextsAt(5, TokenType.Romaji)).toContain('to'); // normalized: tou -> to

    // Position 6: 上
    expect(getTokenTextsAt(6, TokenType.Han)).toContain('上');
    expect(getTokenTextsAt(6, TokenType.Pinyin)).toContain('shang');
    expect(getTokenTextsAt(6, TokenType.Kana)).toContain('ジョウ');
    expect(getTokenTextsAt(6, TokenType.Romaji)).toContain('jo'); // normalized: jou -> jo

    // Position 7: 手
    expect(getTokenTextsAt(7, TokenType.Han)).toContain('手');
    expect(getTokenTextsAt(7, TokenType.Pinyin)).toContain('shou');
    expect(getTokenTextsAt(7, TokenType.Kana)).toContain('シュ');
    expect(getTokenTextsAt(7, TokenType.Romaji)).toContain('shu');

    // Check that tokens cover the entire input
    expect(tokens.length).toBeGreaterThan(0);

    // Check some specific token definitions exist
    const hanTokenTexts = tokenDefs.filter(t => t.type === TokenType.Han).map(t => t.text);
    expect(hanTokenTexts).toContain('僕');
    expect(hanTokenTexts).toContain('和');
    expect(hanTokenTexts).toContain('風');

    // Check kana readings exist for kanji
    const kanaTokenTexts = tokenDefs.filter(t => t.type === TokenType.Kana).map(t => t.text);
    expect(kanaTokenTexts).toContain('ボク'); // 僕 -> ボク

    // Check romaji readings exist
    const romajiTokenTexts = tokenDefs.filter(t => t.type === TokenType.Romaji).map(t => t.text);
    expect(romajiTokenTexts).toContain('boku'); // 僕 -> boku
  });

  it('should not create duplicate tokens when tokenizing multiple documents', () => {
    const tokenizer = createTokenizer({ kuromoji });

    // Tokenize multiple music names that share some characters
    tokenizer.tokenize('僕の和風本当上手');
    tokenizer.tokenize('僕');
    tokenizer.tokenize('和風');

    // Check that there are no duplicate tokens
    const tokenDefs = [...tokenizer.tokens.values()];
    const tokenKeys = tokenDefs.map(t => `${t.type}:${t.text}`);
    const uniqueKeys = new Set(tokenKeys);

    expect(tokenKeys.length).toBe(uniqueKeys.size);

    // Also check that IDs are unique
    const ids = tokenDefs.map(t => t.id);
    const uniqueIds = new Set(ids);
    expect(ids.length).toBe(uniqueIds.size);
  });

  it('should handle Raw tokens for non-CJK characters', () => {
    const tokenizer = createTokenizer({ kuromoji });
    tokenizer.tokenize('a-b');

    const tokenDefs = [...tokenizer.tokens.values()];
    const rawTokenTexts = tokenDefs.filter(t => t.type === TokenType.Raw).map(t => t.text);

    expect(rawTokenTexts).toContain('a'); // normalized to lowercase
    expect(rawTokenTexts).toContain('-');
    expect(rawTokenTexts).toContain('b');
  });

  it('should tokenize compound word "今日" with both individual and combined readings', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('今日');
    const tokenDefs = [...tokenizer.tokens.values()];

    const getTokensWithSpan = (type: TokenType, start: number, end: number) => tokens
      .filter(t => t.start === start && t.end === end && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);

    // Individual character readings at position 0: 今
    expect(getTokensWithSpan(TokenType.Han, 0, 1)).toContain('今');
    expect(getTokensWithSpan(TokenType.Pinyin, 0, 1)).toContain('jin');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('コン');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('イマ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('kon');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('ima');

    // Individual character readings at position 1: 日
    expect(getTokensWithSpan(TokenType.Han, 1, 2)).toContain('日');
    expect(getTokensWithSpan(TokenType.Pinyin, 1, 2)).toContain('ri');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ニチ');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ヒ');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('niti');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('hi');

    // Combined reading for "今日" [0, 2) - this is an indivisible compound word
    expect(getTokensWithSpan(TokenType.Kana, 0, 2)).toContain('キョウ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 2)).toContain('kyo'); // normalized: kyou -> kyo
  });
});
@@ -0,0 +1,93 @@
import type { TokenizerBuilder } from '@patdx/kuromoji';

import { getHanVariants, getPinyinCandidates } from './han';
import { createKanaTranscriptionEnumerator, createRomajiTranscriptionEnumerator, isMaybeJapanese } from './japanese';
import { normalizeByCodePoint } from '../common/normalize';
import { TokenType, type TokenDefinition } from '../common/types';

export interface Token {
  id: number;
  start: number;
  end: number;
}

export type KuromojiTokenizer = Awaited<ReturnType<TokenizerBuilder['build']>>;
export interface TokenizerOptions {
  kuromoji: KuromojiTokenizer;
}
export const createTokenizer = (options: TokenizerOptions) => {
  const tokens = new Map<string, TokenDefinition>();
  let nextId = 0;
  const ensureToken = (type: TokenType, text: string) => {
    const key = `${type}:${text}`;
    let tokenDefinition = tokens.get(key);
    if (tokenDefinition) return tokenDefinition;
    tokenDefinition = { id: nextId++, type, text, codePointLength: [...text].length };
    tokens.set(key, tokenDefinition);
    return tokenDefinition;
  };

  const enumerateAllKanaCombinations = createKanaTranscriptionEnumerator(options.kuromoji);
  const enumerateAllRomajiCombinations = createRomajiTranscriptionEnumerator(options.kuromoji);
  const tokenize = (text: string) => {
    const results: Token[] = [];
    const emitter = (start: number, end: number) => (type: TokenType, text: string) => results.push({ id: ensureToken(type, text).id, start, end });

    const emitMaybeJapanese = (codePoints: string[], offset: number) => {
      for (const { start, length, transcriptions } of enumerateAllKanaCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Kana, transcription);
      }
      for (const { start, length, transcriptions } of enumerateAllRomajiCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Romaji, transcription);
      }
      for (let i = 0; i < codePoints.length; i++) {
        // Single character may have not only kana readings, but also Chinese pronunciations or Simplified/Traditional/Japanese variants.
        const character = codePoints[i]!;
        const hanAlternates = getHanVariants(character); // All possible variant characters (Simplified/Traditional/Japanese)
        const pinyinAlternates = Array.from(new Set(hanAlternates.flatMap(han => getPinyinCandidates(han)))); // All possible pinyin candidates
        const emit = emitter(offset + i, offset + i + 1);
        for (const han of hanAlternates) emit(TokenType.Han, han);
        for (const pinyin of pinyinAlternates) emit(TokenType.Pinyin, pinyin);
      }
    };
    const emitRaw = (codePoint: string, offset: number) => emitter(offset, offset + 1)(TokenType.Raw, codePoint);

    const codePoints = [...normalizeByCodePoint(text)];
    for (let start = 0; start < codePoints.length;) {
      const codePoint = codePoints[start]!;

      const consequentCharsets = [
        { is: isMaybeJapanese, emit: emitMaybeJapanese },
      ];
      let emitted = false;
      for (const { is, emit } of consequentCharsets) {
        let length = 0;
        while (start + length < codePoints.length && is(codePoints[start + length]!)) length++;
        if (length > 0) {
          emit(codePoints.slice(start, start + length), start);
          start += length;
          emitted = true;
          break;
        }
      }
      if (emitted) continue;

      // Skip whitespaces
      if (/\s/.test(codePoint)) {
        start++;
        continue;
      }

      emitRaw(codePoint, start);
      start++;
    }
    return results;
  };

  return {
    tokens,
    tokenize,
  };
};
@@ -0,0 +1,51 @@
import { traverseTrie } from '../common';
import { buildTrie, graftTriePaths } from './trie';

describe('graftTriePaths', () => {
  it('should graft paths according to normalization rules', () => {
    // Build a trie with tokens containing normalized forms
    const trie = buildTrie([
      [0, 'sya'], // normalized form of "sha"
      [1, 'tu'], // normalized form of "tsu"
    ]);

    // Graft paths so that "sha" -> "sya" and "tsu" -> "tu"
    graftTriePaths(trie, {
      sha: 'sya',
      tsu: 'tu',
    });

    // Now we should be able to traverse using both the original and grafted paths
    const syaNode = traverseTrie(trie, 'sya');
    const shaNode = traverseTrie(trie, 'sha');
    expect(syaNode).toBeDefined();
    expect(shaNode).toBeDefined();
    expect(syaNode).toBe(shaNode); // Both paths should lead to the same node

    const tuNode = traverseTrie(trie, 'tu');
    const tsuNode = traverseTrie(trie, 'tsu');
    expect(tuNode).toBeDefined();
    expect(tsuNode).toBeDefined();
    expect(tuNode).toBe(tsuNode);
  });

  it('should handle chained graft rules', () => {
    const trie = buildTrie([
      [0, 'o'], // normalized vowel
    ]);

    // Chain: "ou" -> "o", "oo" -> "o"
    graftTriePaths(trie, {
      ou: 'o',
      oo: 'o',
    });

    const oNode = traverseTrie(trie, 'o');
    const ouNode = traverseTrie(trie, 'ou');
    const ooNode = traverseTrie(trie, 'oo');

    expect(oNode).toBeDefined();
    expect(ouNode).toBe(oNode);
    expect(ooNode).toBe(oNode);
  });
});
@@ -0,0 +1,115 @@
import { traverseTrie, type TrieNode } from '../common';

const newNode = (parent?: TrieNode): TrieNode => ({ parent, children: new Map(), tokenIds: [], subTreeTokenIds: [] });

// Assume tokens are unique.
export const buildTrie = (tokens: [id: number, text: string][]) => {
  const root = newNode(undefined);
  for (const [id, text] of tokens) {
    let node = root;
    for (const char of text) {
      const codePoint = char.codePointAt(0)!;
      let childNode = node.children.get(codePoint);
      if (!childNode) {
        childNode = newNode(node);
        node.children.set(codePoint, childNode);
      }
      node = childNode;
      node.subTreeTokenIds.push(id);
    }
    node.tokenIds.push(id);
  }
  return root;
};

export const graftTriePaths = (root: TrieNode, rules: Record<string, string>) => {
  for (const [inputPhrase, graftTo] of Object.entries(rules)) {
    if ([...graftTo].length > [...inputPhrase].length) throw new Error(`Graft rule ${inputPhrase} -> ${graftTo} maps to a longer string and may cause an infinite loop`);
  }
  const visitedNodes = new Set<TrieNode>();
  const graftFromNode = (node: TrieNode, recursiveChildren: boolean) => {
    if (visitedNodes.has(node)) return;
    visitedNodes.add(node);
    if (recursiveChildren) for (const [, childNode] of node.children) graftFromNode(childNode, true);
    while (true) {
      const nodesWithNewGraftedChildren = new Map<TrieNode, /* depth from initial node */ number>();
      for (const [inputPhrase, graftTo] of Object.entries(rules)) {
        const targetNode = traverseTrie(node, graftTo);
        if (!targetNode) continue;
        const codePoints = [...inputPhrase];
        const graftedPath = Array.from<TrieNode>({ length: codePoints.length - 1 });
        let isGrafted = false;
        let currentNode = node;
        for (let i = 0; i < codePoints.length; i++) {
          const codePoint = codePoints[i]!.codePointAt(0)!;
          let childNode = currentNode.children.get(codePoint);
          if (i === codePoints.length - 1) {
            if (childNode) {
              if (childNode !== targetNode) throw new Error(`Grafted path ${inputPhrase} conflicts with existing path`);
              // Already grafted
            } else {
              currentNode.children.set(codePoint, childNode = targetNode);
              isGrafted = true;
            }
          } else {
            if (!childNode) {
              childNode = newNode(currentNode);
              childNode.subTreeTokenIds = targetNode.subTreeTokenIds;
              currentNode.children.set(codePoint, childNode);
            } else {
              // Part of another grafted path?
              childNode.subTreeTokenIds = Array.from(new Set([...childNode.subTreeTokenIds, ...targetNode.subTreeTokenIds]));
            }
            graftedPath[i] = currentNode = childNode;
          }
        }
        if (isGrafted) for (const [i, nodeToAdd] of graftedPath.entries()) nodesWithNewGraftedChildren.set(nodeToAdd, i + 1);
      }

      if (nodesWithNewGraftedChildren.size > 0) {
        // Re-check graft rules on the newly grafted path:
        // 1. No need to recurse into other children (not on this path) since their children are not affected
        // 2. No need to consider ancestors of this node since they're handled later (we run in DFS order)
        const sortedNodes = [...nodesWithNewGraftedChildren.entries()].sort((a, b) => b[1] - a[1]);
        for (const [changedNode] of sortedNodes) graftFromNode(changedNode, false);
      } else {
        // No new grafts applied
        break;
      }
    }
  };
  graftFromNode(root, true);
};
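
// Example (mirroring the unit tests; a sketch, not real index data): after
//   const trie = buildTrie([[0, 'sya']]);
//   graftTriePaths(trie, { sha: 'sya' });
// traverseTrie(trie, 'sha') and traverseTrie(trie, 'sya') reach the same node,
// so the token is findable under both spellings.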

export const serializeTrie = (root: TrieNode) => {
  const nodeEntries = new Map<TrieNode, {
    id: number;
    visited: boolean;
    data?: number[];
  }>();
  let currentId = 0;
  const getNodeEntry = (node: TrieNode) => {
    let entry = nodeEntries.get(node);
    if (!entry) {
      entry = { id: ++currentId, visited: false };
      nodeEntries.set(node, entry);
    }
    return entry;
  };
  const serializeNode = (node: TrieNode) => {
    const entry = getNodeEntry(node);
    if (entry.visited) return entry.id;
    entry.visited = true;
    const children = [...node.children.entries()].map(([codePoint, childNode]) => [codePoint, serializeNode(childNode)] as const);
    entry.data = [
      node.parent ? getNodeEntry(node.parent).id : 0,
      ...children.map(child => child[0]), // code points
      ...children.map(child => child[1]), // child node ids
      // End of children list (values <= 0 are neither valid code points nor node IDs)
      ...node.tokenIds.length > 0
        ? node.tokenIds.map(tokenId => -(tokenId + 1)) // Use the negative value of (tokenId + 1)
        : [0], // End of children list, no token IDs (token IDs are encoded as negative values)
    ];
    return entry.id;
  };
  serializeNode(root);
  return [...nodeEntries.values()].sort((a, b) => a.id - b.id).flatMap(node => node.data ?? []);
};
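
// Serialized layout per node: [parentId, ...childCodePoints, ...childNodeIds, ...negativeTokenIds | 0].
// For example (a sketch, not real index data), a trie holding the single token [0, 'ab'] serializes to
//   [0, 97, 2, 0,  // node 1 (root): no parent, child 'a' (97) -> node 2, no token IDs
//    1, 98, 3, 0,  // node 2 ("a"): parent node 1, child 'b' (98) -> node 3, no token IDs
//    2, -1]        // node 3 ("ab"): parent node 2, no children, token ID 0 stored as -(0 + 1)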
@@ -0,0 +1,26 @@
import { getSpanLength, TokenType } from '../common';
import type { SearchResult } from './search';

export type HighlightedTextPart = /* not highlighted */ string | /* highlighted */ { highlight: string };

export const highlightSearchResult = (resultDocument: SearchResult): HighlightedTextPart[] => {
  const highlightResult: HighlightedTextPart[] = [];
  let previousHighlightEnd = 0;
  for (const token of resultDocument.tokens) {
    const notHighlightedText = resultDocument.documentCodePoints.slice(previousHighlightEnd, token.documentOffset.start).join('');
    if (notHighlightedText.length > 0) highlightResult.push(notHighlightedText);
    // For a prefix-matched kana token, only highlight the proportion of the token that was actually typed
    const highlightEnd = token.isTokenPrefixMatching && (token.definition.type === TokenType.Kana)
      ? token.documentOffset.start + Math.max(
        1,
        Math.round(
          getSpanLength(token.documentOffset) *
          Math.min(1, getSpanLength(token.inputOffset) / token.definition.codePointLength),
        ),
      )
      : token.documentOffset.end;
    highlightResult.push({ highlight: resultDocument.documentCodePoints.slice(token.documentOffset.start, highlightEnd).join('') });
    previousHighlightEnd = highlightEnd;
  }
  if (previousHighlightEnd < resultDocument.documentCodePoints.length) highlightResult.push(resultDocument.documentCodePoints.slice(previousHighlightEnd).join(''));
  return highlightResult;
};
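
// Usage sketch (assuming `results` came from searchInvertedIndex):
//   const html = highlightSearchResult(results[0]!)
//     .map(part => typeof part === 'string' ? part : `<mark>${part.highlight}</mark>`)
//     .join('');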
@@ -0,0 +1,4 @@
export * from './trie';
export * from './inverted-index';
export * from './search';
export * from './highlight';
@@ -0,0 +1,59 @@
import { deserializeTrie } from './trie';
import type { TrieNode } from '../common';
import type { CompressedInvertedIndex, OffsetSpan, TokenDefinition } from '../common/types';

export interface TokenDocumentReference {
  documentId: number;
  offsets: OffsetSpan[];
}

interface TokenDefinitionExtended extends TokenDefinition {
  references: TokenDocumentReference[];
}

const mergeMap = <K, V>(...maps: Map<K, V>[]) => {
  const result = new Map<K, V>();
  for (const map of maps) for (const [key, value] of map.entries()) result.set(key, value);
  return result;
};

export interface LoadedInvertedIndex {
  documents: string[];
  documentCodePoints: string[][];
  tokenDefinitions: TokenDefinitionExtended[];
  tries: {
    romaji: TrieNode;
    kana: TrieNode;
    other: TrieNode;
  };
}

export const loadInvertedIndex = (compressed: CompressedInvertedIndex): LoadedInvertedIndex => {
  const documents = compressed.documents;
  const documentCodePoints = documents.map(document => [...document]);

  const romajiTrie = deserializeTrie(compressed.tries.romaji);
  const kanaTrie = deserializeTrie(compressed.tries.kana);
  const otherTrie = deserializeTrie(compressed.tries.other);

  const tokenCodePoints = mergeMap(romajiTrie.tokenCodePoints, kanaTrie.tokenCodePoints, otherTrie.tokenCodePoints);
  const tokenDefinitions = compressed.tokenTypes.map<TokenDefinitionExtended>((type, index) => ({
    id: index, type, text: tokenCodePoints.get(index)!.join(''),
    codePointLength: tokenCodePoints.get(index)!.length,
    references: compressed.tokenReferences[index]!.map<TokenDocumentReference>(([documentId, ...offsets]) => ({
      documentId: documentId!,
      offsets: Array.from({ length: offsets.length / 2 }, (_, i) => ({ start: offsets[i * 2]!, end: offsets[i * 2 + 1]! })),
    })),
  }));
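  // Each entry of `compressed.tokenReferences` decoded above is a flat array:
  // [documentId, start1, end1, start2, end2, ...]. For example (a sketch),
  // [3, 0, 2, 5, 7] references document 3 at code point spans [0, 2) and [5, 7).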

  return {
    documents,
    documentCodePoints,
    tokenDefinitions,
    tries: {
      romaji: romajiTrie.root,
      kana: kanaTrie.root,
      other: otherTrie.root,
    },
  };
};
@@ -0,0 +1,258 @@
import { highlightSearchResult } from './highlight';
import { getTrieNodeTokenIds } from './trie';
import type { TrieNode } from '../common';
import { traverseTrieStep } from '../common';
import type { LoadedInvertedIndex } from './inverted-index';
import { normalizeByCodePoint, toKatakana } from '../common/normalize';
import { type OffsetSpan, type TokenDefinition, TokenType } from '../common/types';
import { getSpanLength } from '../common/utils';

const IGNORABLE_CODE_POINTS = /[\s\u3099\u309A]/u;

enum TokenTypePrefixMatchingPolicy {
  AlwaysAllow,
  NeverAllow,
  AllowOnlyAtInputEnd,
}
const tokenTypePrefixMatchingPolicy: Record<TokenType, TokenTypePrefixMatchingPolicy> = {
  [TokenType.Romaji]: TokenTypePrefixMatchingPolicy.NeverAllow,
  [TokenType.Kana]: TokenTypePrefixMatchingPolicy.AlwaysAllow,
  // These token types are in the "other" trie
  [TokenType.Han]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
  [TokenType.Pinyin]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd,
  [TokenType.Raw]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
};
const shouldAllowPrefixMatching = (tokenType: TokenType, isAtInputEnd: boolean) =>
  tokenTypePrefixMatchingPolicy[tokenType] === TokenTypePrefixMatchingPolicy.AlwaysAllow ||
  (tokenTypePrefixMatchingPolicy[tokenType] !== TokenTypePrefixMatchingPolicy.NeverAllow && isAtInputEnd);
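
// For example (a sketch; token texts are hypothetical): a kana token such as "セカイ" may
// be matched by the partial input "セカ" anywhere in the query, a romaji token must be
// typed in full, and a pinyin token may be prefix-matched only as the query's final fragment.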

export interface SearchResultToken {
  definition: TokenDefinition;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}

interface ComparableStateTraits<T> {
  getRangeCount: (state: T) => number;
  getPrefixMatchCount: (state: T) => number;
  getFirstTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastToken?: (state: T) => SearchResultToken; // Not on intermediate results
  getMatchRatioLevel?: (state: T) => number; // Not on intermediate/candidate results
  getMatchRatio: (state: T) => number;
  // Called when all other comparisons are equal
  nextComparer?: (a: T, b: T) => number; // Not on intermediate/candidate results
}

const getComparerForTraits = <T>(traits: ComparableStateTraits<T>) => (a: T, b: T) => {
  // Prefer matches that do not rely on end-of-input loose matching (full match over prefix match)
  if (traits.getLastToken) {
    const aLastToken = traits.getLastToken(a), bLastToken = traits.getLastToken(b);
    const aDidPrefixMatchByTokenType = aLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[aLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    const bDidPrefixMatchByTokenType = bLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[bLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    if (aDidPrefixMatchByTokenType !== bDidPrefixMatchByTokenType) return aDidPrefixMatchByTokenType ? 1 : -1;
  }

  // Prefer results that matched fewer discontinuous ranges
  const aRangeCount = traits.getRangeCount(a), bRangeCount = traits.getRangeCount(b);
  if (aRangeCount !== bRangeCount) return aRangeCount - bRangeCount;

  // Prefer results whose first token matches earlier in the document
  const aFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(a), bFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(b);
  if (aFirstTokenDocumentOffset.start !== bFirstTokenDocumentOffset.start) return aFirstTokenDocumentOffset.start - bFirstTokenDocumentOffset.start;

  // Prefer results with a higher match ratio (but don't distinguish similar ratios, hence `matchRatioLevel`)
  if (traits.getMatchRatioLevel) {
    const aMatchRatioLevel = traits.getMatchRatioLevel(a), bMatchRatioLevel = traits.getMatchRatioLevel(b);
    if (aMatchRatioLevel !== bMatchRatioLevel) return bMatchRatioLevel - aMatchRatioLevel;
  }

  // Prefer results whose last token starts earlier (and, if tied, ends earlier) in the document
  const aLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(a), bLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(b);
  if (aLastTokenDocumentOffset.start !== bLastTokenDocumentOffset.start) return aLastTokenDocumentOffset.start - bLastTokenDocumentOffset.start;
  if (aLastTokenDocumentOffset.end !== bLastTokenDocumentOffset.end) return aLastTokenDocumentOffset.end - bLastTokenDocumentOffset.end;

  // Prefer results with a higher match ratio (compared precisely this time)
  const aMatchRatio = traits.getMatchRatio(a), bMatchRatio = traits.getMatchRatio(b);
  if (aMatchRatio !== bMatchRatio) return bMatchRatio - aMatchRatio;

  return traits.nextComparer?.(a, b) ?? 0;
};

interface IntermediateResult {
  previousState?: IntermediateResult;
  firstTokenDocumentOffset: OffsetSpan;
  rangeCount: number;
  tokenCount: number;
  prefixMatchCount: number;
  matchedTokenLength: number;
  tokenId: number;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}
const compareIntermediateResult = getComparerForTraits<IntermediateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.firstTokenDocumentOffset,
  getLastTokenDocumentOffset: state => state.documentOffset,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide by document length since intermediate results are for the same document
});

interface CandidateResult {
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  matchedTokenLength: number;
  rangeCount: number;
}
const compareCandidateResult = getComparerForTraits<CandidateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide by document length since candidate results are for the same document
});

export interface SearchResult {
  documentId: number;
  documentText: string;
  documentCodePoints: string[];
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  rangeCount: number;
  matchRatio: number;
  matchRatioLevel: number;
}
const compareFinalResult = getComparerForTraits<SearchResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchRatio,
  getMatchRatioLevel: state => Math.round(state.matchRatio * 5), // Bucket ratios into steps of 0.2
  nextComparer: (a, b) => a.documentText === b.documentText ? 0 : a.documentText < b.documentText ? -1 : 1,
});

const hasNonEmptyCharacters = (documentCodePoints: string[], start: number, end: number) => start !== end && !documentCodePoints.slice(start, end).every(char => /\s/.test(char));

export const searchInvertedIndex = (invertedIndex: LoadedInvertedIndex, text: string): SearchResult[] => {
  const { documents, documentCodePoints, tokenDefinitions, tries } = invertedIndex;

  const codePoints = [...toKatakana(normalizeByCodePoint(text))];
  // dp[r] maps documentId -> (match end offset in document -> best IntermediateResult) for matches
  // covering input code points [0, r]; l === 0 stands in for dp[-1], and final results are read from dp[N - 1]
  const dp = Array.from({ length: codePoints.length }, () => new Map<number, Record<number, IntermediateResult>>());
  for (let l = 0; l < codePoints.length; l++) {
    if (l !== 0 && dp[l - 1]!.size === 0) continue; // No documents match the input from the beginning to this position
    let romajiNode: TrieNode | undefined = tries.romaji;
    let kanaNode: TrieNode | undefined = tries.kana;
    let otherNode: TrieNode | undefined = tries.other;
    for (let r = l; r < codePoints.length && (romajiNode || kanaNode || otherNode); r++) { // [l, r]
      const codePoint = codePoints[r]!;
      romajiNode = traverseTrieStep(romajiNode, codePoint, IGNORABLE_CODE_POINTS);
      kanaNode = traverseTrieStep(kanaNode, codePoint, IGNORABLE_CODE_POINTS);
      otherNode = traverseTrieStep(otherNode, codePoint, IGNORABLE_CODE_POINTS);
      const reachingInputEnd = r === codePoints.length - 1;
      const matchingTokenIds = new Set([
        // Allow prefix matching (a partially typed token) per token type policy; "other" tokens only prefix-match at the end of the input
        ...getTrieNodeTokenIds(romajiNode, shouldAllowPrefixMatching(TokenType.Romaji, reachingInputEnd)),
        ...getTrieNodeTokenIds(kanaNode, shouldAllowPrefixMatching(TokenType.Kana, reachingInputEnd)),
        ...getTrieNodeTokenIds(otherNode, reachingInputEnd),
      ]);
      for (const tokenId of matchingTokenIds) for (const { documentId, offsets } of tokenDefinitions[tokenId]!.references) {
        const isTokenPrefixMatching = !romajiNode?.tokenIds.includes(tokenId) && !kanaNode?.tokenIds.includes(tokenId) && !otherNode?.tokenIds.includes(tokenId);
        const previousMatchesOfDocument = dp[l - 1]?.get(documentId);
        if (l !== 0 && !previousMatchesOfDocument) continue;
        for (const documentOffset of offsets) {
          const { start: currentStart, end: currentEnd } = documentOffset;
          const contributeNextMatchingState = (previousState: IntermediateResult | undefined) => {
            const nextMatchingMap = dp[r]!;
            let nextMatchesOfDocument = nextMatchingMap.get(documentId);
            if (!nextMatchesOfDocument) {
              nextMatchesOfDocument = Object.create(null) as Record<number, IntermediateResult>;
              nextMatchingMap.set(documentId, nextMatchesOfDocument);
            }
            const oldResult = nextMatchesOfDocument[currentEnd];
            const inputOffset = { start: l, end: r + 1 };
            const newResult: IntermediateResult = {
              previousState,
              firstTokenDocumentOffset: previousState?.firstTokenDocumentOffset ?? documentOffset,
              rangeCount: !previousState ? 1
                : (previousState.rangeCount + (hasNonEmptyCharacters(documentCodePoints[documentId]!, previousState.documentOffset.end, currentStart) ? 1 : 0)),
              tokenCount: (previousState?.tokenCount ?? 0) + 1,
              prefixMatchCount: (previousState?.prefixMatchCount ?? 0) + (isTokenPrefixMatching ? 1 : 0),
              matchedTokenLength: (previousState?.matchedTokenLength ?? 0) + getSpanLength(documentOffset) *
                Math.min(isTokenPrefixMatching ? getSpanLength(inputOffset) / tokenDefinitions[tokenId]!.codePointLength : Infinity, 1),
              tokenId,
              documentOffset,
              inputOffset,
              isTokenPrefixMatching,
            };
            nextMatchesOfDocument[currentEnd] = !oldResult || compareIntermediateResult(newResult, oldResult) < 0 ? newResult : oldResult;
          };
          if (l === 0) contributeNextMatchingState(undefined);
          else for (const previousEnd in previousMatchesOfDocument) if (currentStart >= Number(previousEnd))
            contributeNextMatchingState(previousMatchesOfDocument[previousEnd as unknown as number]!);
          // Don't `break` here because the keys of `previousMatchesOfDocument` are not necessarily ordered
        }
      }
    }
  }

  // Build search results and sort documents
  return [...dp[codePoints.length - 1]!.entries()].map<SearchResult>(([documentId, matches]) => {
    const sortedMatches = Object.values(matches).map<CandidateResult>(match => {
      const tokens: SearchResultToken[] = [];
      // Build the token list by backtracking through previous states
      let state: IntermediateResult | undefined = match;
      while (state) {
        tokens.unshift({
          definition: tokenDefinitions[state.tokenId]!,
          documentOffset: state.documentOffset, inputOffset: state.inputOffset,
          isTokenPrefixMatching: state.isTokenPrefixMatching,
        });
        state = state.previousState;
      }
      return { tokens, prefixMatchCount: match.prefixMatchCount, matchedTokenLength: match.matchedTokenLength, rangeCount: match.rangeCount };
    }).sort(compareCandidateResult);
    const bestMatchOfDocument = sortedMatches[0]!;
    const documentText = documents[documentId]!;
    const matchRatio = bestMatchOfDocument.matchedTokenLength / documentCodePoints[documentId]!.length;
    const matchRatioLevel = Math.round(matchRatio * 5);
    return {
      documentId,
      documentText,
      documentCodePoints: documentCodePoints[documentId]!,
      tokens: bestMatchOfDocument.tokens,
      prefixMatchCount: bestMatchOfDocument.prefixMatchCount,
      rangeCount: bestMatchOfDocument.rangeCount,
      matchRatio,
      matchRatioLevel,
    };
  }).sort(compareFinalResult);
};

// For debugging
export const inspectSearchResult = (resultDocument: SearchResult, htmlHighlight: boolean) => {
  const { documentText, tokens, rangeCount, matchRatio, matchRatioLevel } = resultDocument;
  const escapeHtml = (s: string) => s.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
  const escapedText = htmlHighlight ? highlightSearchResult(resultDocument).map(part =>
    typeof part === 'string' ? escapeHtml(part) : `<u><b>${escapeHtml(part.highlight)}</b></u>`).join('') : JSON.stringify(documentText);
  const description = ` (${rangeCount} ranges, ${Math.round(matchRatio * 10000) / 10000} => L${matchRatioLevel})`;
  return [
    escapedText + (htmlHighlight ? `<code>${description}</code>` : description),
    ...tokens.map(token => {
      let escapedTokenText = JSON.stringify(token.definition.text);
      let escapedDocumentText = JSON.stringify([...documentText].slice(token.documentOffset.start, token.documentOffset.end).join(''));
      if (htmlHighlight) {
        escapedTokenText = escapeHtml(escapedTokenText);
        escapedDocumentText = escapeHtml(escapedDocumentText);
      }
      const line = `  ${TokenType[token.definition.type]}: ${escapedTokenText} -> ${escapedDocumentText}${token.isTokenPrefixMatching ? ' (prefix match)' : ''}`;
      return htmlHighlight ? `<code>${line}</code>` : line;
    }),
    '',
  ].join('\n');
};
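
// Usage sketch (assuming `results` came from searchInvertedIndex):
//   console.log(results.map(result => inspectSearchResult(result, false)).join('\n'));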
@@ -0,0 +1,58 @@
import type { TrieNode } from '../common';

export const deserializeTrie = (data: number[]) => {
  const nodes: TrieNode[] = [];
  const getNode = (id: number) => nodes[id - 1] ??= { parent: undefined, children: new Map(), tokenIds: [], subTreeTokenIds: [] };
  let currentId = 0;
  for (let i = 0; i < data.length;) {
    const node = getNode(++currentId);
    const parentId = data[i++]!;
    node.parent = parentId !== 0 ? getNode(parentId) : undefined;

    let endOfChildren = i;
    while (endOfChildren < data.length && data[endOfChildren]! > 0) endOfChildren++;
    const numberOfChildren = (endOfChildren - i) / 2;
    for (let j = i; j < i + numberOfChildren; j++) {
      const codePoint = data[j]!;
      const child = getNode(data[j + numberOfChildren]!);
      node.children.set(codePoint, child);
    }
    i = endOfChildren;

    if (data[i] === 0) i++; // No token IDs
    else while (i < data.length && data[i]! < 0) node.tokenIds.push(-data[i++]! - 1);
  }
  const root = nodes[0]!;

  // DFS to construct code point paths for each token
  const tokenCodePoints = new Map<number, string[]>();
  const currentCodePoints: string[] = [];
  const dfsCodePoints = (node: TrieNode) => {
    for (const tokenId of node.tokenIds) tokenCodePoints.set(tokenId, [...currentCodePoints]);
    for (const [codePoint, child] of node.children.entries()) {
      if (child.parent !== node) continue; // Skip grafted paths as these are not the canonical representation of the tokens
      currentCodePoints.push(String.fromCodePoint(codePoint));
      dfsCodePoints(child);
      currentCodePoints.pop();
    }
  };
  dfsCodePoints(root);

  // DFS to construct subTreeTokenIds for each node
  const visitedNodes = new Set<TrieNode>();
  const dfsSubTreeTokenIds = (node: TrieNode) => {
    if (visitedNodes.has(node)) return node.subTreeTokenIds;
    visitedNodes.add(node);
    node.subTreeTokenIds = [...node.tokenIds, ...new Set([...node.children.values()].flatMap(child => dfsSubTreeTokenIds(child)))];
    return node.subTreeTokenIds;
  };
  dfsSubTreeTokenIds(root);

  return {
    root,
    tokenCodePoints,
  };
};
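
// Note: grafted edges survive the serialize/deserialize round trip because node IDs are
// deduplicated, so a grafted child simply appears in multiple `children` maps while its
// `parent` keeps pointing at the canonical path.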

export const getTrieNodeTokenIds = (node: TrieNode | undefined, includeSubTree: boolean) =>
  (includeSubTree ? node?.subTreeTokenIds : node?.tokenIds) ?? [];
@@ -0,0 +1,23 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "jsx": "preserve",
    "lib": ["DOM", "DOM.Iterable", "ESNext", "WebWorker"],
    "module": "ESNext",
    "moduleResolution": "Bundler",
    "noUncheckedIndexedAccess": true,
    "resolveJsonModule": true,
    "allowJs": true,
    "strict": true,
    "strictNullChecks": true,
    "noEmit": true,
    "esModuleInterop": true,
    "forceConsistentCasingInFileNames": true,
    "isolatedModules": true,
    "skipLibCheck": true,
    "rootDir": ".",
    "outDir": "dist"
  },
  "include": ["src/**/*.ts"],
  "exclude": ["dist", "node_modules"]
}
@@ -0,0 +1,15 @@
import { defineConfig } from 'tsdown';

export default defineConfig({
  entry: [
    './src/index.ts',
    './src/searcher/index.ts',
    './src/indexer/index.ts',
    './src/common/index.ts',
  ],
  dts: true,
  unused: true,
  fixedExtension: true,
  unbundle: true,
  sourcemap: true,
});