feat: initial commit

This commit is contained in:
Menci
2026-01-01 03:40:41 +08:00
commit 631f8ed771
98 changed files with 14776 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
../../LICENSE
+72
View File
@@ -0,0 +1,72 @@
# `@maigolabs/needle`
Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.
See also [in-browser demo](https://needle.maigo.dev).
## Install
Dictionaries are installed as dependencies of the package, but if you don't use the indexer, they can be tree-shaken away when bundling.
```bash
pnpm install @maigolabs/needle
```
## Usage
### Indexing
NeedLe uses Kuromoji for Japanese tokenization, which loads dictionaries dynamically. You need to create a Kuromoji `TokenizerBuilder` first:
```ts
// In Node.js you can just load the dictionary from the file system.
import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';
const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
const kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
// In browser you need to provide a custom loader to load the dictionary files with fetch().
import { TokenizerBuilder } from '@patdx/kuromoji';
// You can load dict files from CDN (See also the README of https://github.com/patdx/kuromoji.js)
const kuromoji = await new TokenizerBuilder({
loader: {
loadArrayBuffer: async (url: string) => {
url = `https://cdn.jsdelivr.net/npm/@aiktb/kuromoji@1.0.2/dict/${url.replace('.gz', '')}`;
const res = await fetch(url);
if (!res.ok) throw new Error(`Failed to fetch ${url}`);
return await res.arrayBuffer();
},
},
}).build();
```
After creating the Kuromoji instance, you can build the inverted index:
```ts
import { buildInvertedIndex } from '@maigolabs/needle/indexer';
const documents = ['你好世界', 'こんにちは'];
const compressedIndex = buildInvertedIndex(documents, { kuromoji });
// The built index could be stored for later use.
const json = JSON.stringify(compressedIndex);
```
### Searching
If you only import the searcher in your frontend code, indexer and dictionary-related dependencies will be tree-shaken.
```ts
import { loadInvertedIndex, searchInvertedIndex } from '@maigolabs/needle/searcher';
const loadedIndex = loadInvertedIndex(compressedIndex);
const results = searchInvertedIndex(loadedIndex, 'sekai');
for (const result of results) console.log(`${result.documentText} (${(result.matchRatio * 100).toFixed(0)}%)`);
// → 你好世界 (50%)
```
To highlight the search result, see also `highlightSearchResult`.
+18
View File
@@ -0,0 +1,18 @@
import type { Config } from 'jest';
// Jest configuration: run *.test.ts files as native ESM through ts-jest.
const config: Config = {
  preset: 'ts-jest/presets/default-esm',
  testEnvironment: 'node',
  extensionsToTreatAsEsm: ['.ts'],
  moduleNameMapper: {
    // Strip the '.js' extension from relative imports so ts-jest resolves the '.ts' source.
    '^(\\.{1,2}/.*)\\.js$': '$1',
  },
  transform: {
    '^.+\\.tsx?$': ['ts-jest', { useESM: true }],
  },
  testMatch: ['**/*.test.ts'],
  // Tests load kuromoji dictionaries in beforeAll, which can be slow.
  testTimeout: 30000,
};
export default config;
+84
View File
@@ -0,0 +1,84 @@
{
"name": "@maigolabs/needle",
"version": "1.0.1",
"description": "Fuzzy search engine for small text pieces, with Chinese/Japanese pronunciation support.",
"type": "module",
"main": "./src/index.ts",
"scripts": {
"build": "tsdown",
"typecheck": "tsc",
"test": "cross-env NODE_OPTIONS=--experimental-vm-modules jest",
"prepare": "pnpm run build"
},
"license": "AGPL-3.0",
"homepage": "https://needle.maigo.dev",
"repository": {
"type": "git",
"url": "git+https://github.com/MaigoLabs/needLe.git",
"directory": "packages/needle"
},
"bugs": "https://github.com/MaigoLabs/needLe/issues",
"keywords": [
"needle",
"search",
"fuzzy",
"cjk",
"chinese",
"japanese",
"pinyin",
"romaji"
],
"author": "Menci <mencici@msn.com>",
"sideEffects": false,
"exports": {
".": "./src/index.ts",
"./common": "./src/common/index.ts",
"./indexer": "./src/indexer/index.ts",
"./searcher": "./src/searcher/index.ts",
"./package.json": "./package.json"
},
"packageManager": "pnpm@10.20.0",
"dependencies": {
"@patdx/kuromoji": "^1.0.4",
"hepburn": "^1.2.2",
"opencc-js": "^1.0.5",
"pinyin-pro": "^3.27.0"
},
"devDependencies": {
"@types/hepburn": "^1.2.2",
"@types/jest": "^30.0.0",
"@types/opencc-js": "^1.0.3",
"jest": "^30.2.0",
"ts-jest": "^29.4.6"
},
"files": [
"README.md",
"dist",
"package.json"
],
"publishConfig": {
"access": "public",
"main": "./dist/index.mjs",
"module": "./dist/index.mjs",
"types": "./dist/index.d.mts",
"exports": {
".": {
"types": "./dist/index.d.mts",
"default": "./dist/index.mjs"
},
"./common": {
"types": "./dist/common/index.d.mts",
"default": "./dist/common/index.mjs"
},
"./indexer": {
"types": "./dist/indexer/index.d.mts",
"default": "./dist/indexer/index.mjs"
},
"./searcher": {
"types": "./dist/searcher/index.d.mts",
"default": "./dist/searcher/index.mjs"
},
"./package.json": "./package.json"
}
}
}
+4
View File
@@ -0,0 +1,4 @@
// Barrel for shared code used by both the indexer and the searcher.
export * from './types';
export * from './utils';
export * from './normalize';
export * from './trie';
@@ -0,0 +1,60 @@
import { normalizeByCodePoint, toKatakana } from './normalize';
// Unit tests for the code-point-level normalization helpers in normalize.ts.
describe('toKatakana', () => {
  it('should convert hiragana to katakana', () => {
    expect(toKatakana('あいうえお')).toBe('アイウエオ');
    expect(toKatakana('かきくけこ')).toBe('カキクケコ');
    expect(toKatakana('さしすせそ')).toBe('サシスセソ');
  });
  it('should keep katakana unchanged', () => {
    expect(toKatakana('アイウエオ')).toBe('アイウエオ');
  });
  it('should keep non-kana characters unchanged', () => {
    expect(toKatakana('abc123')).toBe('abc123');
    expect(toKatakana('漢字')).toBe('漢字');
  });
  it('should handle mixed input', () => {
    expect(toKatakana('あアa漢')).toBe('アアa漢');
  });
});
describe('normalizeByCodePoint', () => {
  it('should convert fullwidth ASCII to halfwidth lowercase', () => {
    expect(normalizeByCodePoint('ABC')).toBe('abc');
    expect(normalizeByCodePoint('123')).toBe('123');
    expect(normalizeByCodePoint('!@#')).toBe('!@#');
  });
  it('should convert fullwidth space to halfwidth space', () => {
    expect(normalizeByCodePoint(' ')).toBe(' ');
  });
  it('should convert halfwidth kana to fullwidth kana', () => {
    expect(normalizeByCodePoint('アイウエオ')).toBe('アイウエオ');
    expect(normalizeByCodePoint('カキクケコ')).toBe('カキクケコ');
  });
  it('should normalize voiced/semi-voiced sound marks', () => {
    expect(normalizeByCodePoint('゙')).toBe('\u3099'); // halfwidth voiced -> combining
    expect(normalizeByCodePoint('゚')).toBe('\u309A'); // halfwidth semi-voiced -> combining
    expect(normalizeByCodePoint('゛')).toBe('\u3099'); // fullwidth voiced -> combining
    expect(normalizeByCodePoint('゜')).toBe('\u309A'); // fullwidth semi-voiced -> combining
  });
  it('should convert halfwidth punctuation to fullwidth', () => {
    expect(normalizeByCodePoint('。')).toBe('。');
    expect(normalizeByCodePoint('「')).toBe('「');
    expect(normalizeByCodePoint('」')).toBe('」');
    expect(normalizeByCodePoint('、')).toBe('、');
    expect(normalizeByCodePoint('・')).toBe('・');
  });
  it('should lowercase regular ASCII', () => {
    expect(normalizeByCodePoint('ABC')).toBe('abc');
  });
  // Should keep hiragana unchanged
});
+42
View File
@@ -0,0 +1,42 @@
/** Normalize a whole string by normalizing each Unicode code point independently. */
export const normalizeByCodePoint = (string: string) => [...string].map(normalizeCodePoint).join('');
/**
 * Normalize one Unicode code point (passed as a single-character string):
 * fullwidth ASCII -> halfwidth lowercase, halfwidth kana/punctuation -> fullwidth,
 * (semi-)voiced sound marks -> their combining forms; anything else is lowercased.
 */
export const normalizeCodePoint = (char: string) => {
  const codePoint = char.codePointAt(0)!;
  // Fullwidth ASCII (U+FF01-U+FF5E) -> halfwidth ASCII, lowercased
  if (codePoint >= 0xFF01 && codePoint <= 0xFF5E) return String.fromCodePoint(codePoint - 0xFEE0).toLowerCase();
  // Ideographic space (U+3000) -> ASCII space
  if (codePoint === 0x3000) return ' ';
  // Halfwidth kana (U+FF66-U+FF9D) -> fullwidth kana
  if (codePoint >= 0xFF66 && codePoint <= 0xFF9D) return HALF_TO_FULL_KANA[char] ?? char;
  switch (codePoint) {
    // Halfwidth CJK punctuation -> fullwidth
    case 0xFF61: return '。';
    case 0xFF62: return '「';
    case 0xFF63: return '」';
    case 0xFF64: return '、';
    case 0xFF65: return '・';
    // Voiced sound marks -> COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
    case 0xFF9E:
    case 0x309B: return '\u3099';
    // Semi-voiced sound marks -> COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
    case 0xFF9F:
    case 0x309C: return '\u309A';
    default: return char.toLowerCase();
  }
};
// Halfwidth katakana (U+FF66-U+FF9D) -> fullwidth katakana. Keys MUST be the
// halfwidth forms: normalizeCodePoint only consults this map for code points in
// that range. One entry per code point in the range, so every halfwidth kana
// has a mapping. (The previous table used fullwidth keys — dead entries — and
// mapped ﾉ to the empty string, which would have deleted the character.)
const HALF_TO_FULL_KANA: Record<string, string> = {
  'ｦ': 'ヲ', 'ｧ': 'ァ', 'ｨ': 'ィ', 'ｩ': 'ゥ', 'ｪ': 'ェ', 'ｫ': 'ォ',
  'ｬ': 'ャ', 'ｭ': 'ュ', 'ｮ': 'ョ', 'ｯ': 'ッ',
  'ｰ': 'ー',
  'ｱ': 'ア', 'ｲ': 'イ', 'ｳ': 'ウ', 'ｴ': 'エ', 'ｵ': 'オ',
  'ｶ': 'カ', 'ｷ': 'キ', 'ｸ': 'ク', 'ｹ': 'ケ', 'ｺ': 'コ',
  'ｻ': 'サ', 'ｼ': 'シ', 'ｽ': 'ス', 'ｾ': 'セ', 'ｿ': 'ソ',
  'ﾀ': 'タ', 'ﾁ': 'チ', 'ﾂ': 'ツ', 'ﾃ': 'テ', 'ﾄ': 'ト',
  'ﾅ': 'ナ', 'ﾆ': 'ニ', 'ﾇ': 'ヌ', 'ﾈ': 'ネ', 'ﾉ': 'ノ',
  'ﾊ': 'ハ', 'ﾋ': 'ヒ', 'ﾌ': 'フ', 'ﾍ': 'ヘ', 'ﾎ': 'ホ',
  'ﾏ': 'マ', 'ﾐ': 'ミ', 'ﾑ': 'ム', 'ﾒ': 'メ', 'ﾓ': 'モ',
  'ﾔ': 'ヤ', 'ﾕ': 'ユ', 'ﾖ': 'ヨ',
  'ﾗ': 'ラ', 'ﾘ': 'リ', 'ﾙ': 'ル', 'ﾚ': 'レ', 'ﾛ': 'ロ',
  'ﾜ': 'ワ', 'ﾝ': 'ン',
};
// Hiragana letters (U+3041-U+3096) and iteration marks (U+309D-U+309E); each
// has a katakana counterpart exactly 0x60 code units higher.
const isHiraganaRange = (charCode: number) =>
  (charCode >= 0x3041 && charCode <= 0x3096) || (charCode >= 0x309D && charCode <= 0x309E);
/** Convert a single hiragana character to katakana; other characters pass through unchanged. */
export const toKatakanaSingle = (char: string) => {
  const charCode = char.charCodeAt(0);
  if (!isHiraganaRange(charCode)) return char;
  return String.fromCharCode(charCode + 0x60);
};
/** Convert every hiragana character in `string` to katakana. */
export const toKatakana = (string: string) => {
  const converted: string[] = [];
  for (const char of string) converted.push(toKatakanaSingle(char));
  return converted.join('');
};
+17
View File
@@ -0,0 +1,17 @@
/** A node of a code-point trie over token texts. */
export interface TrieNode {
  parent: TrieNode | undefined;
  children: Map<number, TrieNode>; // Unicode code point -> child node
  tokenIds: number[];
  subTreeTokenIds: number[]; // Empty on root. Will Uint16Array be faster?
}
/**
 * Advance one step down the trie for a single code point. Code points matching
 * `ignorableCodePoints` that have no edge keep the traversal at the same node.
 */
export const traverseTrieStep = (node: TrieNode | undefined, codePoint: string, ignorableCodePoints?: RegExp) => {
  const child = node?.children.get(codePoint.codePointAt(0)!);
  if (child) return child;
  return ignorableCodePoints?.test(codePoint) ? node : undefined;
};
/** Walk `text` down the trie from `node`; undefined when the path does not exist. */
export const traverseTrie = (node: TrieNode | undefined, text: string, ignorableCodePoints?: RegExp) => {
  let current = node;
  for (const codePoint of text) {
    if (!current) return undefined;
    current = traverseTrieStep(current, codePoint, ignorableCodePoints);
  }
  return current;
};
+31
View File
@@ -0,0 +1,31 @@
/** Kind of an indexed token (raw document text, kana/romaji readings, Han variants, pinyin). */
export enum TokenType {
  Raw,
  Kana,
  Romaji,
  Han,
  Pinyin,
}
/** A unique token known to the tokenizer. */
export interface TokenDefinition {
  id: number; // dense id; used to index tokenTypes/tokenReferences below
  type: TokenType;
  text: string;
  codePointLength: number; // length of `text` in Unicode code points
}
/** Half-open offset span: [start, end) */
export interface OffsetSpan {
  start: number;
  end: number;
}
/** JSON-serializable inverted index produced by the indexer and loaded by the searcher. */
export type CompressedInvertedIndex = {
  documents: string[];
  tokenTypes: TokenType[]; // indexed by token id
  tokenReferences: number[][][]; // tokenId -> [documentId, start1, end1, start2, end2, ...][]
  // Flat serialized tries (see serializeTrie), one per token family.
  tries: {
    romaji: number[];
    kana: number[];
    other: number[];
  };
};
+3
View File
@@ -0,0 +1,3 @@
import type { OffsetSpan } from './types';
/** Length of a half-open [start, end) offset span. */
export const getSpanLength = (offset: OffsetSpan) => {
  const { start, end } = offset;
  return end - start;
};
+73
View File
@@ -0,0 +1,73 @@
import path from 'node:path';
import url from 'node:url';
import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';
import { buildInvertedIndex, type KuromojiTokenizer } from '../indexer';
import { highlightSearchResult, loadInvertedIndex, searchInvertedIndex } from '../searcher';
// End-to-end tests: build an index over a small document set with a
// kuromoji-backed tokenizer, then search it with mixed romaji/kana/Han input.
let kuromoji: KuromojiTokenizer;
beforeAll(async () => {
  // Resolve the dictionary directory shipped inside the @patdx/kuromoji package.
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});
describe('search', () => {
  const testDocuments = [
    'ミーティア',
    'エンドマークに希望と涙を添えて',
    '宵の鳥',
    '僕の和風本当上手',
  ];
  it('should match with mixed search query', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);
    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');
    // Should have at least one result
    expect(results.length).toBeGreaterThan(0);
    // The first result should be "僕の和風本当上手"
    expect(results[0]!.documentText).toBe('僕の和風本当上手');
  });
  it('should highlight search result correctly', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);
    const results = searchInvertedIndex(invertedIndex, 'bokunoh风じょう');
    expect(results.length).toBeGreaterThan(0);
    const highlighted = highlightSearchResult(results[0]!);
    // Should be an array of parts
    expect(Array.isArray(highlighted)).toBe(true);
    expect(highlighted.length).toBeGreaterThan(0);
    // Collect highlighted text
    const highlightedTexts = highlighted
      .filter((part): part is { highlight: string } => typeof part !== 'string')
      .map(part => part.highlight);
    expect(highlightedTexts.some(text => text.includes('僕'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('の'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('和'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('風'))).toBe(true);
    expect(highlightedTexts.some(text => text.includes('上'))).toBe(true);
  });
  it('should match romaji input to kana documents', () => {
    const compressed = buildInvertedIndex(testDocuments, { kuromoji });
    const invertedIndex = loadInvertedIndex(compressed);
    // Search for "yoi" should match "宵の鳥"
    const results = searchInvertedIndex(invertedIndex, 'yoi');
    const matchedTexts = results.map(r => r.documentText);
    expect(matchedTexts).toContain('宵の鳥');
  });
});
+111
View File
@@ -0,0 +1,111 @@
import { traverseTrie } from '../common';
import { buildTrie, serializeTrie } from '../indexer/trie';
import { deserializeTrie } from '../searcher/trie';
// Tests for trie construction (indexer) and round-tripping through the flat
// serialized form (searcher-side deserialization).
describe('Trie building', () => {
  it('should build a Trie with multiple different tokens', () => {
    const trie = buildTrie([
      [0, 'hello'],
      [1, 'help'],
      [2, 'world'],
      [3, 'word'],
    ]);
    // Traverse to verify structure
    const helloNode = traverseTrie(trie, 'hello');
    const helpNode = traverseTrie(trie, 'help');
    const worldNode = traverseTrie(trie, 'world');
    const wordNode = traverseTrie(trie, 'word');
    expect(helloNode).toBeDefined();
    expect(helpNode).toBeDefined();
    expect(worldNode).toBeDefined();
    expect(wordNode).toBeDefined();
    // Check token IDs
    expect(helloNode!.tokenIds).toContain(0);
    expect(helpNode!.tokenIds).toContain(1);
    expect(worldNode!.tokenIds).toContain(2);
    expect(wordNode!.tokenIds).toContain(3);
    // Check that 'hel' prefix node has both tokens in subTree
    const helNode = traverseTrie(trie, 'hel');
    expect(helNode).toBeDefined();
    expect(helNode!.subTreeTokenIds).toContain(0);
    expect(helNode!.subTreeTokenIds).toContain(1);
  });
  it('should handle Japanese text tokens', () => {
    const trie = buildTrie([
      [0, 'さくら'],
      [1, 'サクラ'],
      [2, '桜'],
    ]);
    expect(traverseTrie(trie, 'さくら')?.tokenIds).toContain(0);
    expect(traverseTrie(trie, 'サクラ')?.tokenIds).toContain(1);
    expect(traverseTrie(trie, '桜')?.tokenIds).toContain(2);
  });
});
describe('Trie serialization and deserialization', () => {
  it('should serialize and deserialize a Trie correctly', () => {
    const originalTrie = buildTrie([
      [0, 'apple'],
      [1, 'app'],
      [2, 'banana'],
    ]);
    // Serialize
    const serialized = serializeTrie(originalTrie);
    expect(Array.isArray(serialized)).toBe(true);
    expect(serialized.length).toBeGreaterThan(0);
    // Deserialize
    const { root: deserializedTrie, tokenCodePoints } = deserializeTrie(serialized);
    // Verify structure is preserved
    const appleNode = traverseTrie(deserializedTrie, 'apple');
    const appNode = traverseTrie(deserializedTrie, 'app');
    const bananaNode = traverseTrie(deserializedTrie, 'banana');
    expect(appleNode).toBeDefined();
    expect(appNode).toBeDefined();
    expect(bananaNode).toBeDefined();
    expect(appleNode!.tokenIds).toContain(0);
    expect(appNode!.tokenIds).toContain(1);
    expect(bananaNode!.tokenIds).toContain(2);
    // Verify tokenCodePoints map
    expect(tokenCodePoints.get(0)?.join('')).toBe('apple');
    expect(tokenCodePoints.get(1)?.join('')).toBe('app');
    expect(tokenCodePoints.get(2)?.join('')).toBe('banana');
    // Verify subTreeTokenIds are reconstructed
    expect(appNode!.subTreeTokenIds).toContain(0);
    expect(appNode!.subTreeTokenIds).toContain(1);
  });
  it('should preserve parent references after deserialization', () => {
    const originalTrie = buildTrie([
      [0, 'test'],
    ]);
    const serialized = serializeTrie(originalTrie);
    const { root } = deserializeTrie(serialized);
    const testNode = traverseTrie(root, 'test');
    expect(testNode).toBeDefined();
    // Walk back to root via parent references
    let node = testNode;
    let depth = 0;
    while (node?.parent) {
      node = node.parent;
      depth++;
    }
    expect(depth).toBe(4); // 't' -> 'e' -> 's' -> 't' -> root
    expect(node).toBe(root);
  });
});
+3
View File
@@ -0,0 +1,3 @@
// Package entry point: re-export the shared, indexing, and searching APIs.
export * from './common';
export * from './indexer';
export * from './searcher';
+103
View File
@@ -0,0 +1,103 @@
import { getHanVariants, getPinyinCandidates, isHanCharacter, unionFindSet } from './han';
// Tests for the Han-character helpers: union-find, Han detection,
// simplified/traditional variant lookup, and pinyin candidate generation.
describe('unionFindSet', () => {
  it('should find self as root initially', () => {
    const ufs = unionFindSet<number>();
    expect(ufs.find(1)).toBe(1);
    expect(ufs.find(2)).toBe(2);
  });
  it('should union two elements', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    expect(ufs.find(1)).toBe(ufs.find(2));
  });
  it('should union multiple elements transitively', () => {
    const ufs = unionFindSet<number>();
    ufs.union(1, 2);
    ufs.union(2, 3);
    ufs.union(4, 5);
    expect(ufs.find(1)).toBe(ufs.find(3));
    expect(ufs.find(1)).not.toBe(ufs.find(4));
    ufs.union(3, 4);
    expect(ufs.find(1)).toBe(ufs.find(5));
  });
  it('should iterate all keys', () => {
    const ufs = unionFindSet<string>();
    ufs.union('a', 'b');
    ufs.union('c', 'd');
    const keys = [...ufs.keys()];
    expect(keys).toContain('a');
    expect(keys).toContain('b');
    expect(keys).toContain('c');
    expect(keys).toContain('d');
  });
});
describe('isHanCharacter', () => {
  it('should return true for CJK characters', () => {
    expect(isHanCharacter('中')).toBe(true);
    expect(isHanCharacter('国')).toBe(true);
    expect(isHanCharacter('日')).toBe(true);
    expect(isHanCharacter('本')).toBe(true);
  });
  it('should return false for non-CJK characters', () => {
    expect(isHanCharacter('a')).toBe(false);
    expect(isHanCharacter('あ')).toBe(false);
    expect(isHanCharacter('ア')).toBe(false);
    expect(isHanCharacter('1')).toBe(false);
  });
});
describe('getHanVariants', () => {
  it('should return variants for simplified/traditional characters', () => {
    // 国 (simplified) and 國 (traditional) should be variants of each other
    const variants1 = getHanVariants('国');
    const variants2 = getHanVariants('國');
    expect(variants1).toContain('国');
    expect(variants1).toContain('國');
    expect(variants2).toContain('国');
    expect(variants2).toContain('國');
  });
  it('should return the character itself for characters without variants', () => {
    const variants = getHanVariants('一');
    expect(variants).toContain('一');
  });
  it('should return empty array for non-Han characters', () => {
    expect(getHanVariants('a')).toEqual([]);
    expect(getHanVariants('あ')).toEqual([]);
  });
});
describe('getPinyinCandidates', () => {
  it('should return pinyin for a Han character', () => {
    const candidates = getPinyinCandidates('中');
    expect(candidates).toContain('zhong');
    expect(candidates).toContain('zh'); // initial
    expect(candidates).toContain('z'); // first letter
  });
  it('should return multiple pinyin for polyphonic characters', () => {
    // 行 can be "xing" or "hang"
    const candidates = getPinyinCandidates('行');
    expect(candidates).toContain('xing');
    expect(candidates).toContain('hang');
  });
  it('should include fuzzy pinyin variants', () => {
    // 风 is "feng", should also have fuzzy variant "fen"
    const candidates = getPinyinCandidates('风');
    expect(candidates).toContain('feng');
    expect(candidates).toContain('fen'); // fuzzy: eng -> en
  });
  it('should return empty array for non-Han characters', () => {
    expect(getPinyinCandidates('a')).toEqual([]);
    expect(getPinyinCandidates('あ')).toEqual([]);
  });
});
+85
View File
@@ -0,0 +1,85 @@
// @ts-expect-error No declaration file
import hkVariants from 'opencc-js/dict/HKVariants';
// @ts-expect-error No declaration file
import hkVariantsRev from 'opencc-js/dict/HKVariantsRev';
// @ts-expect-error No declaration file
import jpVariants from 'opencc-js/dict/JPVariants';
// @ts-expect-error No declaration file
import jpVariantsRev from 'opencc-js/dict/JPVariantsRev';
// @ts-expect-error No declaration file
import stCharacters from 'opencc-js/dict/STCharacters';
// @ts-expect-error No declaration file
import tsCharacters from 'opencc-js/dict/TSCharacters';
// @ts-expect-error No declaration file
import twVariants from 'opencc-js/dict/TWVariants';
// @ts-expect-error No declaration file
import twVariantsRev from 'opencc-js/dict/TWVariantsRev';
import { polyphonic } from 'pinyin-pro';
/**
 * Disjoint-set (union-find) over arbitrary keys, with path compression and
 * union by rank. Elements are registered lazily on first `find`/`union`.
 */
export const unionFindSet = <T>() => {
  const parent = new Map<T, T>();
  const rank = new Map<T, number>();
  /** Root representative of `x`'s set; registers `x` if unseen. */
  const find = (x: T): T => {
    if (!parent.has(x)) {
      parent.set(x, x);
      return x;
    }
    // Locate the root, then compress the path behind us.
    let root = x;
    while (parent.get(root)! !== root) root = parent.get(root)!;
    let node = x;
    while (node !== root) {
      const next = parent.get(node)!;
      parent.set(node, root);
      node = next;
    }
    return root;
  };
  /** Merge the sets containing `x` and `y` (no-op when already joined). */
  const union = (x: T, y: T) => {
    const rootX = find(x);
    const rootY = find(y);
    if (rootX === rootY) return;
    const rankX = rank.get(rootX) ?? 0;
    const rankY = rank.get(rootY) ?? 0;
    if (rankX < rankY) parent.set(rootX, rootY);
    else {
      parent.set(rootY, rootX);
      if (rankX === rankY) rank.set(rootX, rankX + 1);
    }
  };
  /** All keys ever registered. */
  const keys = () => parent.keys();
  return { find, union, keys };
};
// Equivalence classes of Han character variants, built once at module load
// from the OpenCC dictionaries. Each dictionary is a '|'-separated list of
// space-separated "from to" entries; multi-character phrase entries are
// skipped (and only the first target of an entry is used), so only
// single-character variant pairs are united.
const exchangeMap = (() => {
  const ufs = unionFindSet<string>();
  for (const dict of [hkVariants, hkVariantsRev, jpVariants, jpVariantsRev, stCharacters, tsCharacters, twVariants, twVariantsRev] as string[]) {
    for (const [from, to] of dict.split('|').map(pair => pair.split(' '))) {
      if (!from || !to || [...from].length !== 1 || [...to].length !== 1) continue;
      ufs.union(from, to);
    }
  }
  // Map every character to the (shared, sorted) list of all variants in its class.
  // Non-root members point at the same array instance as their root.
  const map = new Map<string, string[]>();
  for (const key of ufs.keys()) {
    const root = ufs.find(key);
    let list = map.get(root);
    if (!list) map.set(root, list = []);
    if (key !== root) map.set(key, list);
    list.push(key);
  }
  for (const list of map.values()) list.sort();
  return map;
})();
/** True when the phrase consists solely of Han (CJK ideograph) characters. */
export const isHanCharacter = (phrase: string) => /^[\p{Script=Han}]+$/u.test(phrase);
/** All known variants of a Han character (including itself); [] for non-Han input. */
export const getHanVariants = (character: string) => exchangeMap.get(character) ?? (isHanCharacter(character) ? [character] : []);
// Pinyin initials; the two-letter initials ('zh'/'ch'/'sh') appear before their
// single-letter prefixes ('z'/'c'/'s'), so a linear prefix search matches the
// longer form first.
const PINYIN_INITIALS: string[] = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w'];
// Fuzzy-tone equivalences for nasal finals commonly typed without the trailing 'g'.
const PINYIN_FINALS_FUZZY_MAP: Record<string, string> = { 'ang': 'an', 'eng': 'en', 'ing': 'in' };
/**
 * All searchable pinyin forms of one Han character: each full reading, its
 * initial (e.g. 'zh'), its first letter ('z'), and a fuzzy variant with the
 * nasal final relaxed ('feng' -> 'fen'). Non-Han input yields [].
 */
export const getPinyinCandidates = (character: string) => {
  const readings = (polyphonic(character, { type: 'array', toneType: 'none', removeNonZh: true })[0] ?? []).filter(reading => !!reading);
  const candidates = new Set<string>();
  for (const reading of readings) {
    candidates.add(reading);
    const initial = PINYIN_INITIALS.find(prefix => reading.startsWith(prefix));
    if (initial) candidates.add(initial);
    candidates.add(initial?.[0] ?? reading[0]!);
    const tail = reading.slice(-3);
    if (tail in PINYIN_FINALS_FUZZY_MAP) candidates.add(reading.slice(0, -3) + PINYIN_FINALS_FUZZY_MAP[tail]!);
  }
  return [...candidates];
};
+5
View File
@@ -0,0 +1,5 @@
// Barrel for the index-building side of the package.
export * from './han';
export * from './japanese';
export * from './tokenizer';
export * from './trie';
export * from './inverted-index';
@@ -0,0 +1,46 @@
import { NORMALIZE_RULES_KANA_DAKUTEN, NORMALIZE_RULES_ROMAJI } from './japanese';
import { createTokenizer, type TokenizerOptions } from './tokenizer';
import { buildTrie, graftTriePaths, serializeTrie } from './trie';
import type { CompressedInvertedIndex, TokenDefinition } from '../common/types';
import { TokenType } from '../common/types';
// Build a trie over the texts of all tokens whose type matches the predicate.
const buildTypedTrie = (tokens: TokenDefinition[], typePredicate: (tokenType: TokenType) => boolean) =>
  buildTrie(tokens.filter(token => typePredicate(token.type)).map(token => [token.id, token.text]));
/**
 * Tokenize `documents` and build the serialized inverted index consumed by the
 * searcher (see CompressedInvertedIndex). Tokens are split into three tries
 * (romaji / kana / other); the romaji and kana tries additionally get paths
 * grafted from the respective normalization rules.
 */
export const buildInvertedIndex = (documents: string[], tokenizerOptions: TokenizerOptions) => {
  const tokenizer = createTokenizer(tokenizerOptions);
  const documentTokens = documents.map(document => tokenizer.tokenize(document));
  const tokenDefinitions = [...tokenizer.tokens.values()];
  const romajiRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Romaji);
  const kanaRoot = buildTypedTrie(tokenDefinitions, type => type === TokenType.Kana);
  const otherRoot = buildTypedTrie(tokenDefinitions, type => type !== TokenType.Romaji && type !== TokenType.Kana);
  graftTriePaths(romajiRoot, NORMALIZE_RULES_ROMAJI);
  graftTriePaths(kanaRoot, NORMALIZE_RULES_KANA_DAKUTEN);
  const invertedIndex: CompressedInvertedIndex = {
    documents,
    // tokenTypes/tokenReferences are indexed by token id (ids are dense).
    tokenTypes: tokenDefinitions.map(token => token.type),
    tokenReferences: Array.from({ length: tokenDefinitions.length }, () => []),
    tries: {
      romaji: serializeTrie(romajiRoot),
      kana: serializeTrie(kanaRoot),
      other: serializeTrie(otherRoot),
    },
  };
  // For every document, group token occurrences by token id, then append one
  // flat [documentId, start1, end1, start2, end2, ...] record per token.
  for (const [documentId, tokens] of documentTokens.entries()) {
    const tokenOccurrences = new Map<number, number[]>();
    for (const token of tokens) {
      let occurrences = tokenOccurrences.get(token.id);
      if (!occurrences) {
        occurrences = [];
        tokenOccurrences.set(token.id, occurrences);
      }
      occurrences.push(token.start, token.end);
    }
    for (const [tokenId, occurrences] of tokenOccurrences) {
      invertedIndex.tokenReferences[tokenId]!.push([documentId, ...occurrences]);
    }
  }
  return invertedIndex;
};
@@ -0,0 +1,66 @@
import path from 'node:path';
import url from 'node:url';
import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';
import { getAllKanaReadings, toRomajiStrictly } from './japanese';
import type { KuromojiTokenizer } from './tokenizer';
// Tests for the Japanese transcription helpers; kuromoji is loaded once from
// the dictionary files shipped inside the @patdx/kuromoji package.
let kuromoji: KuromojiTokenizer;
beforeAll(async () => {
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});
describe('toRomajiStrictly', () => {
  it('should convert basic kana to romaji', () => {
    expect(toRomajiStrictly('あ')).toBe('a');
    expect(toRomajiStrictly('か')).toBe('ka');
    expect(toRomajiStrictly('さくら')).toBe('sakura');
  });
  it('should convert katakana to romaji', () => {
    expect(toRomajiStrictly('ア')).toBe('a');
    expect(toRomajiStrictly('カ')).toBe('ka');
    expect(toRomajiStrictly('サクラ')).toBe('sakura');
  });
  it('should handle long vowels', () => {
    expect(toRomajiStrictly('おう')).toBe('ou');
    expect(toRomajiStrictly('おお')).toBe('oo');
  });
  it('should return empty string for invalid first character', () => {
    expect(toRomajiStrictly('ー')).toBe(''); // prolonged sound mark cannot be first
    expect(toRomajiStrictly('ゃ')).toBe(''); // small ya cannot be first
  });
  it('should return empty string for invalid last character', () => {
    expect(toRomajiStrictly('っ')).toBe(''); // small tsu cannot be last
  });
  it('should handle gemination (small tsu)', () => {
    expect(toRomajiStrictly('かった')).toBe('katta');
  });
});
describe('getAllKanaReadings', () => {
  it('should return katakana reading for pure kana input', () => {
    const readings = getAllKanaReadings(kuromoji, 'あ');
    expect(readings).toContain('ア');
  });
  it('should return readings for kanji', () => {
    const readings = getAllKanaReadings(kuromoji, '僕');
    expect(readings.length).toBeGreaterThan(0);
    // 僕 should have reading ボク
    expect(readings).toContain('ボク');
  });
  it('should return readings for compound words', () => {
    const readings = getAllKanaReadings(kuromoji, '和風');
    expect(readings.length).toBeGreaterThan(0);
  });
});
+158
View File
@@ -0,0 +1,158 @@
import { fromKana } from 'hepburn';
import type { KuromojiTokenizer } from './tokenizer';
import { toKatakana } from '../common';
// We have normalized all other sound marks to \u3099 and \u309A (combining kata-hiragana voiced/semi-voiced sound marks)
const MAYBE_JAPANESE_REGEX = /^[\p{Script=Han}\u3041-\u309F\u30A0-\u30FF\u3005\u3006\u30FC\u3099\u309A]+$/u;
/** True when the phrase is made only of Han/kana characters and related marks. */
export const isMaybeJapanese = (phrase: string) => MAYBE_JAPANESE_REGEX.test(phrase);
// See also normalize.ts
const SOUND_MARK_REGEX = /^[\u3099\u309A]+$/;
/** True when the phrase consists only of combining (semi-)voiced sound marks. */
export const isJapaneseSoundMark = (phrase: string) => SOUND_MARK_REGEX.test(phrase);
/** Remove every combining (semi-)voiced sound mark from the phrase. */
export const stripJapaneseSoundMarks = (phrase: string) =>
  phrase.replaceAll('\u3099', '').replaceAll('\u309A', '');
/** True when `char`'s first UTF-16 unit lies in the hiragana or katakana block. */
export const isKanaSingle = (char: string) => {
  const code = char.charCodeAt(0);
  const inHiraganaBlock = code >= 0x3041 && code <= 0x309F;
  const inKatakanaBlock = code >= 0x30A0 && code <= 0x30FF;
  return inHiraganaBlock || inKatakanaBlock;
};
/** True when every character of the phrase is kana. */
export const isKana = (phrase: string) => [...phrase].every(isKanaSingle);
// Kana that cannot legally begin a standalone reading: small vowels,
// small ya/yu/yo, small wa, the small (Ainu) kana block, and the
// prolonged sound mark.
const KANAS_CANNOT_BE_FIRST = [
  'ァ', 'ィ', 'ゥ', 'ェ', 'ォ',
  'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ',
  'ャ', 'ュ', 'ョ',
  'ゃ', 'ゅ', 'ょ',
  'ヮ', 'ゎ',
  'ㇰ', 'ㇱ', 'ㇲ', 'ㇳ', 'ㇴ', 'ㇵ', 'ㇶ', 'ㇷ', 'ㇸ', 'ㇹ', 'ㇺ', 'ㇻ', 'ㇼ', 'ㇽ', 'ㇾ', 'ㇿ',
  'ー',
];
// Sokuon (small tsu) geminates the following consonant, so it cannot end one.
const KANAS_CANNOT_BE_LAST = [
  'ッ', 'っ',
];
/**
 * Convert kana to lowercase Hepburn romaji. Returns '' when the input cannot
 * form a standalone reading (leading small kana / prolonged sound mark, or a
 * trailing small tsu) or when the conversion leaves any non-[a-z] character.
 */
export const toRomajiStrictly = (kana: string) => {
  if (KANAS_CANNOT_BE_FIRST.includes(kana[0]!)) return '';
  if (KANAS_CANNOT_BE_LAST.includes(kana[kana.length - 1]!)) return '';
  // Expand macron long vowels so the result stays in plain ASCII.
  const macronExpansions: Array<[string, string]> = [
    ['ā', 'aa'],
    ['ī', 'ii'],
    ['ū', 'uu'],
    ['ē', 'ee'],
    ['ō', 'ou'],
  ];
  let romaji = fromKana(kana).toLowerCase();
  for (const [macron, expanded] of macronExpansions) romaji = romaji.replaceAll(macron, expanded);
  return /^[a-z]+$/.test(romaji) ? romaji : '';
};
/**
 * Build an enumerator that, for a text given as an array of code points, finds
 * for every valid sub-phrase its set of "atomic" transcriptions — candidates
 * that cannot be reproduced by concatenating transcriptions of shorter
 * sub-phrases. Spans are scanned shortest-first so that resultMap already
 * holds every shorter span when a longer span is checked.
 *
 * @param isValidPhrase decides whether codePoints[start, start+length) may form a phrase
 * @param getAllTranscriptions returns every candidate transcription of a phrase
 */
export const createTranscriptionEnumerator = (
  isValidPhrase: (codePoints: string[], start: number, length: number) => boolean,
  getAllTranscriptions: (phrase: string) => string[],
) => (codePoints: string[]) => {
  const toKey = (start: number, length: number) => `${start}:${length}`;
  const resultMap = new Map<string, { start: number; length: number; transcriptions: string[] }>();
  for (let phraseLength = 1; phraseLength <= codePoints.length; phraseLength++) for (let start = 0; start + phraseLength <= codePoints.length; start++) {
    if (!isValidPhrase(codePoints, start, phraseLength)) continue;
    const phrase = codePoints.slice(start, start + phraseLength).join('');
    const atomicTranscriptions = [...new Set(getAllTranscriptions(phrase))].filter(candidateTranscription => {
      if (!candidateTranscription) return false;
      // Ensure the transcription is atomic (not a combination of multiple shorter transcriptions, separated by any midpoints)
      // BFS over (phrase position, transcription position) states: if any chain of
      // shorter-span transcriptions spells out the whole candidate, reject it.
      type State = { phrasePosition: number; transcriptionPosition: number };
      const toStateKey = (state: State) => `${state.phrasePosition}:${state.transcriptionPosition}`;
      const visitedStates = new Set<string>();
      const queue: State[] = [{ phrasePosition: 0, transcriptionPosition: 0 }];
      while (queue.length > 0) {
        const { phrasePosition, transcriptionPosition } = queue.shift()!;
        // Try every shorter sub-span starting at the current phrase position.
        for (let prefixLength = 1; prefixLength <= phraseLength - phrasePosition; prefixLength++) {
          const prefixResult = resultMap.get(toKey(start + phrasePosition, prefixLength));
          if (!prefixResult) continue;
          for (const transcription of prefixResult.transcriptions) {
            if (candidateTranscription.slice(transcriptionPosition, transcriptionPosition + transcription.length) === transcription) {
              const nextState: State = { phrasePosition: phrasePosition + prefixLength, transcriptionPosition: transcriptionPosition + transcription.length };
              if (nextState.phrasePosition === phraseLength && nextState.transcriptionPosition === candidateTranscription.length) return false; // Found a valid combination
              if (visitedStates.has(toStateKey(nextState))) continue;
              visitedStates.add(toStateKey(nextState));
              queue.push(nextState);
            }
          }
        }
      }
      return true;
    });
    if (atomicTranscriptions.length > 0) resultMap.set(toKey(start, phraseLength), { start, length: phraseLength, transcriptions: atomicTranscriptions });
  }
  return [...resultMap.values()];
};
/**
 * All distinct katakana readings of `phrase`: the phrase itself (katakana-ized)
 * when it is pure kana, plus dictionary readings looked up directly from
 * kuromoji's internal token-info dictionary.
 */
export const getAllKanaReadings = (kuromoji: KuromojiTokenizer, phrase: string) => Array.from(new Set(
  [
    // A pure-kana phrase reads as itself, in katakana form.
    ...isKana(phrase) ? [toKatakana(phrase)] : [],
    // Skip the dictionary lookup for single-kana phrases; otherwise query the
    // kuromoji trie for entries matching the phrase and collect their readings.
    // NOTE(review): `id as unknown as string` works around getFeatures' typing —
    // presumably it accepts numeric ids at runtime; confirm against @patdx/kuromoji.
    ...isKana(phrase) && [...phrase].length === 1 ? [] : ((kuromoji.token_info_dictionary.target_map[kuromoji.viterbi_builder.trie.lookup(phrase)] ?? [])
      .map(id => kuromoji.formatter.formatEntry(
        id, 0, 'KNOWN',
        kuromoji.token_info_dictionary.getFeatures(id as unknown as string)?.split(',') ?? [],
      ).reading)
      .filter((reading): reading is string => !!reading))
      .map(toKatakana),
  ],
));
/**
 * Build a normalizer that applies every `from -> to` rule with replaceAll,
 * repeating full passes until the text reaches a fixed point.
 */
const createNormalizer = (rules: Record<string, string>) => (text: string) => {
  let current = text;
  for (;;) {
    const previous = current;
    for (const [pattern, replacement] of Object.entries(rules)) current = current.replaceAll(pattern, replacement);
    if (current === previous) return current;
  }
};
// Rewrite rules applied to a fixed point to fold romaji spelling variants onto
// one canonical form (e.g. 'shimbun' and 'shinbun' normalize identically).
export const NORMALIZE_RULES_ROMAJI: Record<string, string> = {
  // Remove all long vowels (sa-ba- -> saba)
  '-': '',
  // Collapse consecutive vowels
  'aa': 'a',
  'ii': 'i',
  'uu': 'u',
  'ee': 'e',
  'oo': 'o',
  'ou': 'o',
  // mb/mp/mm -> nb/np/nm (shimbun -> shinbun)
  'mb': 'nb',
  'mp': 'np',
  'mm': 'nm',
  // Others
  'sha': 'sya',
  'tsu': 'tu',
  'chi': 'ti',
  'shi': 'si',
  'ji': 'zi',
};
/** Apply NORMALIZE_RULES_ROMAJI to a fixed point. */
export const normalizeRomaji = createNormalizer(NORMALIZE_RULES_ROMAJI);
/**
 * Maps kana followed by a combining (han)dakuten mark (U+3099 voiced,
 * U+309A semi-voiced) onto the precomposed code point, e.g. は + U+3099 -> ば.
 * This folds NFD-style decomposed input into the NFC forms the dictionaries use.
 */
export const NORMALIZE_RULES_KANA_DAKUTEN: Record<string, string> = {
  // Hiragana + voiced mark
  'う\u3099': 'ゔ',
  'か\u3099': 'が', 'き\u3099': 'ぎ', 'く\u3099': 'ぐ', 'け\u3099': 'げ', 'こ\u3099': 'ご',
  'さ\u3099': 'ざ', 'し\u3099': 'じ', 'す\u3099': 'ず', 'せ\u3099': 'ぜ', 'そ\u3099': 'ぞ',
  'た\u3099': 'だ', 'ち\u3099': 'ぢ', 'つ\u3099': 'づ', 'て\u3099': 'で', 'と\u3099': 'ど',
  'は\u3099': 'ば', 'ひ\u3099': 'び', 'ふ\u3099': 'ぶ', 'へ\u3099': 'べ', 'ほ\u3099': 'ぼ',
  // Hiragana + semi-voiced mark
  'は\u309A': 'ぱ', 'ひ\u309A': 'ぴ', 'ふ\u309A': 'ぷ', 'へ\u309A': 'ぺ', 'ほ\u309A': 'ぽ',
  // Iteration mark
  'ゝ\u3099': 'ゞ',
  // Katakana + voiced mark
  'ウ\u3099': 'ヴ',
  'カ\u3099': 'ガ', 'キ\u3099': 'ギ', 'ク\u3099': 'グ', 'ケ\u3099': 'ゲ', 'コ\u3099': 'ゴ',
  'サ\u3099': 'ザ', 'シ\u3099': 'ジ', 'ス\u3099': 'ズ', 'セ\u3099': 'ゼ', 'ソ\u3099': 'ゾ',
  'タ\u3099': 'ダ', 'チ\u3099': 'ヂ', 'ツ\u3099': 'ヅ', 'テ\u3099': 'デ', 'ト\u3099': 'ド',
  'ハ\u3099': 'バ', 'ヒ\u3099': 'ビ', 'フ\u3099': 'ブ', 'ヘ\u3099': 'ベ', 'ホ\u3099': 'ボ',
  // Katakana + semi-voiced mark
  'ハ\u309A': 'パ', 'ヒ\u309A': 'ピ', 'フ\u309A': 'プ', 'ヘ\u309A': 'ペ', 'ホ\u309A': 'ポ',
  'ワ\u3099': 'ヷ', 'ヰ\u3099': 'ヸ', 'ヱ\u3099': 'ヹ', 'ヲ\u3099': 'ヺ',
  // Iteration mark
  'ヽ\u3099': 'ヾ',
};
// Folds combining (han)dakuten sequences into precomposed kana.
export const normalizeKanaDakuten = createNormalizer(NORMALIZE_RULES_KANA_DAKUTEN);
// Rejects sub-phrase splits where a Japanese sound mark would land in the first
// position of the phrase itself, or of the remainder immediately after it.
const isValidJapanesePhrase = (codePoints: string[], start: number, length: number) => {
  if (isJapaneseSoundMark(codePoints[start]!)) return false;
  const end = start + length;
  if (end === codePoints.length) return true;
  return !isJapaneseSoundMark(codePoints[end]!);
};
/**
 * Creates an enumerator that yields every katakana transcription of each valid
 * sub-phrase of a Japanese code-point run.
 */
export const createKanaTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => {
  // Normalize decomposed dakuten, drop sound marks, then collect dictionary readings.
  const getReadings = (phrase: string) => getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase)));
  return createTranscriptionEnumerator(isValidJapanesePhrase, getReadings);
};
/**
 * Creates an enumerator that yields every normalized romaji transcription of
 * each valid sub-phrase of a Japanese code-point run.
 */
export const createRomajiTranscriptionEnumerator = (kuromoji: KuromojiTokenizer) => {
  // Kana readings first, then each reading converted to canonical romaji.
  const getReadings = (phrase: string) =>
    getAllKanaReadings(kuromoji, stripJapaneseSoundMarks(normalizeKanaDakuten(phrase)))
      .map(kana => normalizeRomaji(toRomajiStrictly(kana)));
  return createTranscriptionEnumerator(isValidJapanesePhrase, getReadings);
};
@@ -0,0 +1,166 @@
import path from 'node:path';
import url from 'node:url';
import { TokenizerBuilder } from '@patdx/kuromoji';
import NodeDictionaryLoader from '@patdx/kuromoji/node';
import { createTokenizer, type KuromojiTokenizer } from './tokenizer';
import { TokenType } from '../common/types';
// Shared Kuromoji tokenizer for the whole suite; building it loads the bundled
// dictionary from disk, so it is done once in beforeAll.
let kuromoji: KuromojiTokenizer;
beforeAll(async () => {
  // Resolve <@patdx/kuromoji package root>/dict relative to the package entry point.
  const kuromojiDictPath = path.resolve(url.fileURLToPath(import.meta.resolve('@patdx/kuromoji')), '..', '..', 'dict');
  kuromoji = await new TokenizerBuilder({ loader: new NodeDictionaryLoader({ dic_path: kuromojiDictPath }) }).build();
});
describe('tokenizer', () => {
  it('should tokenize mixed Japanese text', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('僕の和風本当上手');
    // Get all token definitions
    const tokenDefs = [...tokenizer.tokens.values()];
    // Should have tokens of various types
    const types = new Set(tokenDefs.map(t => t.type));
    expect(types.has(TokenType.Han)).toBe(true);
    expect(types.has(TokenType.Pinyin)).toBe(true);
    expect(types.has(TokenType.Kana)).toBe(true);
    expect(types.has(TokenType.Romaji)).toBe(true);
    // Texts of every token of `type` whose [start, end) span covers code point `pos`.
    const getTokenTextsAt = (pos: number, type: TokenType) => tokens
      .filter(t => t.start <= pos && t.end > pos && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);
    // Position 0: 僕
    expect(getTokenTextsAt(0, TokenType.Han)).toContain('僕');
    expect(getTokenTextsAt(0, TokenType.Pinyin)).toContain('pu');
    expect(getTokenTextsAt(0, TokenType.Kana)).toContain('ボク');
    expect(getTokenTextsAt(0, TokenType.Romaji)).toContain('boku');
    // Position 1: の (hiragana, no Han/Pinyin)
    expect(getTokenTextsAt(1, TokenType.Han)).toEqual([]);
    expect(getTokenTextsAt(1, TokenType.Pinyin)).toEqual([]);
    // NOTE(review): expecting '' here looks suspicious — the katakana reading of
    // 'の' would be 'ノ', and the sibling Romaji assertion expects 'no'. Confirm
    // this empty-string expectation is intentional.
    expect(getTokenTextsAt(1, TokenType.Kana)).toContain('');
    expect(getTokenTextsAt(1, TokenType.Romaji)).toContain('no');
    // Position 2: 和
    expect(getTokenTextsAt(2, TokenType.Han)).toContain('和');
    expect(getTokenTextsAt(2, TokenType.Pinyin)).toContain('he');
    expect(getTokenTextsAt(2, TokenType.Kana)).toContain('ワ');
    expect(getTokenTextsAt(2, TokenType.Romaji)).toContain('wa');
    // Position 3: 風
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('風');
    expect(getTokenTextsAt(3, TokenType.Han)).toContain('风'); // simplified variant
    expect(getTokenTextsAt(3, TokenType.Pinyin)).toContain('feng');
    expect(getTokenTextsAt(3, TokenType.Kana)).toContain('フウ');
    expect(getTokenTextsAt(3, TokenType.Romaji)).toContain('fu'); // normalized: fuu -> fu
    // Position 4: 本
    expect(getTokenTextsAt(4, TokenType.Han)).toContain('本');
    expect(getTokenTextsAt(4, TokenType.Pinyin)).toContain('ben');
    expect(getTokenTextsAt(4, TokenType.Kana)).toContain('ホン');
    expect(getTokenTextsAt(4, TokenType.Romaji)).toContain('hon');
    // Position 5: 当
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('当');
    expect(getTokenTextsAt(5, TokenType.Han)).toContain('當'); // traditional variant
    expect(getTokenTextsAt(5, TokenType.Pinyin)).toContain('dang');
    expect(getTokenTextsAt(5, TokenType.Kana)).toContain('トウ');
    expect(getTokenTextsAt(5, TokenType.Romaji)).toContain('to'); // normalized: tou -> to
    // Position 6: 上
    expect(getTokenTextsAt(6, TokenType.Han)).toContain('上');
    expect(getTokenTextsAt(6, TokenType.Pinyin)).toContain('shang');
    expect(getTokenTextsAt(6, TokenType.Kana)).toContain('ジョウ');
    expect(getTokenTextsAt(6, TokenType.Romaji)).toContain('jo'); // normalized: jou -> jo
    // Position 7: 手
    expect(getTokenTextsAt(7, TokenType.Han)).toContain('手');
    expect(getTokenTextsAt(7, TokenType.Pinyin)).toContain('shou');
    expect(getTokenTextsAt(7, TokenType.Kana)).toContain('シュ');
    expect(getTokenTextsAt(7, TokenType.Romaji)).toContain('shu');
    // Check that tokens cover the entire input
    expect(tokens.length).toBeGreaterThan(0);
    // Check some specific token definitions exist
    const hanTokenTexts = tokenDefs.filter(t => t.type === TokenType.Han).map(t => t.text);
    expect(hanTokenTexts).toContain('僕');
    expect(hanTokenTexts).toContain('和');
    expect(hanTokenTexts).toContain('風');
    // Check kana readings exist for kanji
    const kanaTokenTexts = tokenDefs.filter(t => t.type === TokenType.Kana).map(t => t.text);
    expect(kanaTokenTexts).toContain('ボク'); // 僕 -> ボク
    // Check romaji readings exist
    const romajiTokenTexts = tokenDefs.filter(t => t.type === TokenType.Romaji).map(t => t.text);
    expect(romajiTokenTexts).toContain('boku'); // 僕 -> boku
  });
  it('should not create duplicate tokens when tokenizing multiple documents', () => {
    const tokenizer = createTokenizer({ kuromoji });
    // Tokenize multiple music names that share some characters
    tokenizer.tokenize('僕の和風本当上手');
    tokenizer.tokenize('僕');
    tokenizer.tokenize('和風');
    // Check that there are no duplicate tokens — definitions are deduplicated by (type, text)
    const tokenDefs = [...tokenizer.tokens.values()];
    const tokenKeys = tokenDefs.map(t => `${t.type}:${t.text}`);
    const uniqueKeys = new Set(tokenKeys);
    expect(tokenKeys.length).toBe(uniqueKeys.size);
    // Also check that IDs are unique
    const ids = tokenDefs.map(t => t.id);
    const uniqueIds = new Set(ids);
    expect(ids.length).toBe(uniqueIds.size);
  });
  it('should handle Raw tokens for non-CJK characters', () => {
    const tokenizer = createTokenizer({ kuromoji });
    tokenizer.tokenize('a-b');
    const tokenDefs = [...tokenizer.tokens.values()];
    const rawTokenTexts = tokenDefs.filter(t => t.type === TokenType.Raw).map(t => t.text);
    expect(rawTokenTexts).toContain('a'); // normalized to lowercase
    expect(rawTokenTexts).toContain('-');
    expect(rawTokenTexts).toContain('b');
  });
  it('should tokenize compound word "今日" with both individual and combined readings', () => {
    const tokenizer = createTokenizer({ kuromoji });
    const tokens = tokenizer.tokenize('今日');
    const tokenDefs = [...tokenizer.tokens.values()];
    // Texts of every token of `type` whose span is exactly [start, end).
    const getTokensWithSpan = (type: TokenType, start: number, end: number) => tokens
      .filter(t => t.start === start && t.end === end && tokenDefs.find(d => d.id === t.id)?.type === type)
      .map(t => tokenDefs.find(d => d.id === t.id)!.text);
    // Individual character readings at position 0: 今
    expect(getTokensWithSpan(TokenType.Han, 0, 1)).toContain('今');
    expect(getTokensWithSpan(TokenType.Pinyin, 0, 1)).toContain('jin');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('コン');
    expect(getTokensWithSpan(TokenType.Kana, 0, 1)).toContain('イマ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('kon');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 1)).toContain('ima');
    // Individual character readings at position 1: 日
    expect(getTokensWithSpan(TokenType.Han, 1, 2)).toContain('日');
    expect(getTokensWithSpan(TokenType.Pinyin, 1, 2)).toContain('ri');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ニチ');
    expect(getTokensWithSpan(TokenType.Kana, 1, 2)).toContain('ヒ');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('niti');
    expect(getTokensWithSpan(TokenType.Romaji, 1, 2)).toContain('hi');
    // Combined reading for "今日" [0, 2] - this is an indivisible compound word
    expect(getTokensWithSpan(TokenType.Kana, 0, 2)).toContain('キョウ');
    expect(getTokensWithSpan(TokenType.Romaji, 0, 2)).toContain('kyo'); // normalized: kyou -> kyo
  });
});
+93
View File
@@ -0,0 +1,93 @@
import type { TokenizerBuilder } from '@patdx/kuromoji';
import { getHanVariants, getPinyinCandidates } from './han';
import { createKanaTranscriptionEnumerator, createRomajiTranscriptionEnumerator, isMaybeJapanese } from './japanese';
import { normalizeByCodePoint } from '../common/normalize';
import { TokenType, type TokenDefinition } from '../common/types';
/** One token occurrence: definition `id` spanning code points [start, end) of the tokenized text. */
export interface Token {
  id: number;
  start: number;
  end: number;
}
/** The tokenizer instance type produced by Kuromoji's `TokenizerBuilder.build()`. */
export type KuromojiTokenizer = Awaited<ReturnType<TokenizerBuilder['build']>>;
/** Options for {@link createTokenizer}. */
export interface TokenizerOptions {
  kuromoji: KuromojiTokenizer;
}
/**
 * Creates a stateful tokenizer that splits normalized text into overlapping
 * candidate tokens: Han character variants, pinyin, kana/romaji readings, and
 * raw single characters for everything else. Token definitions are
 * deduplicated across all `tokenize` calls and exposed via the returned
 * `tokens` map (keyed by `${type}:${text}`), with IDs in first-seen order.
 */
export const createTokenizer = (options: TokenizerOptions) => {
  const tokens = new Map<string, TokenDefinition>();
  let nextId = 0;
  // Returns the existing definition for (type, text), or registers a new one
  // with the next sequential ID.
  const ensureToken = (type: TokenType, text: string) => {
    const key = `${type}:${text}`;
    let tokenDefinition = tokens.get(key);
    if (tokenDefinition) return tokenDefinition;
    tokenDefinition = { id: nextId++, type, text, codePointLength: [...text].length };
    tokens.set(key, tokenDefinition);
    return tokenDefinition;
  };
  const enumerateAllKanaCombinations = createKanaTranscriptionEnumerator(options.kuromoji);
  const enumerateAllRomajiCombinations = createRomajiTranscriptionEnumerator(options.kuromoji);
  const tokenize = (text: string) => {
    const results: Token[] = [];
    // Curried emitter pinning a [start, end) span for one or more token emissions.
    const emitter = (start: number, end: number) => (type: TokenType, text: string) => results.push({ id: ensureToken(type, text).id, start, end });
    // Emits every candidate token for a run of possibly-Japanese code points.
    const emitMaybeJapanese = (codePoints: string[], offset: number) => {
      // Kana transcriptions of every enumerated sub-span of the run.
      for (const { start, length, transcriptions } of enumerateAllKanaCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Kana, transcription);
      }
      // Romaji transcriptions of every enumerated sub-span of the run.
      for (const { start, length, transcriptions } of enumerateAllRomajiCombinations(codePoints)) {
        const emit = emitter(offset + start, offset + start + length);
        for (const transcription of transcriptions) emit(TokenType.Romaji, transcription);
      }
      for (let i = 0; i < codePoints.length; i++) {
        // Single character may have not only kana readings, but also Chinese pronunciations or Simplified/Traditional/Japanese variants.
        const character = codePoints[i]!;
        const hanAlternates = getHanVariants(character); // All possible variant characters (Simplified/Traditional/Japanese)
        const pinyinAlternates = Array.from(new Set(hanAlternates.flatMap(han => getPinyinCandidates(han)))); // All possible pinyin candidates
        const emit = emitter(offset + i, offset + i + 1);
        for (const han of hanAlternates) emit(TokenType.Han, han);
        for (const pinyin of pinyinAlternates) emit(TokenType.Pinyin, pinyin);
      }
    };
    const emitRaw = (codePoint: string, offset: number) => emitter(offset, offset + 1)(TokenType.Raw, codePoint);
    const codePoints = [...normalizeByCodePoint(text)];
    // Charset handlers tried in order on each maximal run of matching characters.
    // Hoisted out of the scan loop below: the previous version rebuilt this
    // array on every iteration even though its contents are loop-invariant.
    const consequentCharsets = [
      { is: isMaybeJapanese, emit: emitMaybeJapanese },
    ];
    for (let start = 0; start < codePoints.length;) {
      const codePoint = codePoints[start]!;
      let emitted = false;
      for (const { is, emit } of consequentCharsets) {
        // Extend the run as far as the charset predicate keeps matching.
        let length = 0;
        while (start + length < codePoints.length && is(codePoints[start + length]!)) length++;
        if (length > 0) {
          emit(codePoints.slice(start, start + length), start);
          start += length;
          emitted = true;
          break;
        }
      }
      if (emitted) continue;
      // Skip whitespaces
      if (/\s/.test(codePoint)) {
        start++;
        continue;
      }
      // Anything else becomes a single-code-point Raw token.
      emitRaw(codePoint, start);
      start++;
    }
    return results;
  };
  return {
    tokens,
    tokenize,
  };
};
+51
View File
@@ -0,0 +1,51 @@
import { traverseTrie } from '../common';
import { buildTrie, graftTriePaths } from './trie';
describe('graftTriePaths', () => {
  it('should graft paths according to normalization rules', () => {
    // Build a trie with tokens containing normalized forms
    const trie = buildTrie([
      [0, 'sya'], // normalized form of "sha"
      [1, 'tu'], // normalized form of "tsu"
    ]);
    // Graft paths so that "sha" -> "sya" and "tsu" -> "tu"
    graftTriePaths(trie, {
      sha: 'sya',
      tsu: 'tu',
    });
    // Now we should be able to traverse using both the original and grafted paths
    const syaNode = traverseTrie(trie, 'sya');
    const shaNode = traverseTrie(trie, 'sha');
    expect(syaNode).toBeDefined();
    expect(shaNode).toBeDefined();
    expect(syaNode).toBe(shaNode); // Both paths should lead to the same node
    const tuNode = traverseTrie(trie, 'tu');
    const tsuNode = traverseTrie(trie, 'tsu');
    expect(tuNode).toBeDefined();
    expect(tsuNode).toBeDefined();
    expect(tuNode).toBe(tsuNode);
  });
  it('should handle chained graft rules', () => {
    const trie = buildTrie([
      [0, 'o'], // normalized vowel
    ]);
    // Chain: "ou" -> "o", "oo" -> "o"
    graftTriePaths(trie, {
      ou: 'o',
      oo: 'o',
    });
    // All alternate spellings must share the same terminal node as the canonical one.
    const oNode = traverseTrie(trie, 'o');
    const ouNode = traverseTrie(trie, 'ou');
    const ooNode = traverseTrie(trie, 'oo');
    expect(oNode).toBeDefined();
    expect(ouNode).toBe(oNode);
    expect(ooNode).toBe(oNode);
  });
});
+115
View File
@@ -0,0 +1,115 @@
import { traverseTrie, type TrieNode } from '../common';
const newNode = (parent?: TrieNode): TrieNode => ({ parent, children: new Map(), tokenIds: [], subTreeTokenIds: [] });
// Assume tokens are unique.
/**
 * Builds a trie from (id, text) pairs. Assumes tokens are unique.
 * Each node records the IDs terminating at it (`tokenIds`) and, on every node
 * along a token's path, the IDs present anywhere in its subtree
 * (`subTreeTokenIds`), in insertion order.
 */
export const buildTrie = (tokens: [id: number, text: string][]) => {
  const root = newNode(undefined);
  for (const [id, text] of tokens) {
    let current = root;
    // for..of over a string iterates full code points, not UTF-16 units.
    for (const codePointString of text) {
      const codePoint = codePointString.codePointAt(0)!;
      let next = current.children.get(codePoint);
      if (next === undefined) {
        next = newNode(current);
        current.children.set(codePoint, next);
      }
      current = next;
      current.subTreeTokenIds.push(id);
    }
    current.tokenIds.push(id);
  }
  return root;
};
/**
 * Mutates the trie so that, for every rule `inputPhrase -> graftTo`, walking
 * `inputPhrase` from any node reaches the same terminal node as walking
 * `graftTo` from that node (turning the trie into a DAG). Intermediate nodes
 * on a grafted path are created as needed and inherit the target's
 * `subTreeTokenIds`. Runs to a fixed point so rules can chain.
 *
 * Each rule's replacement must not be longer than its input; otherwise the
 * fixed-point iteration might never terminate.
 */
export const graftTriePaths = (root: TrieNode, rules: Record<string, string>) => {
  // Guard against non-terminating rule sets up front.
  for (const [inputPhrase, graftTo] of Object.entries(rules)) if ([...graftTo].length > [...inputPhrase].length) throw new Error(`Graft rule ${inputPhrase} -> ${graftTo} maps to longer string and may cause infinite loop`);
  const visitedNodes = new Set<TrieNode>();
  // Applies all rules starting at `node`; optionally recurses into children first (post-order DFS).
  const graftFromNode = (node: TrieNode, recursiveChildren: boolean) => {
    if (visitedNodes.has(node)) return;
    visitedNodes.add(node);
    if (recursiveChildren) for (const [, childNode] of node.children) graftFromNode(childNode, true);
    while (true) {
      const nodesWithNewGraftedChildren = new Map<TrieNode, /* depth from initial node */ number>();
      for (const [inputPhrase, graftTo] of Object.entries(rules)) {
        // The rule only applies if the canonical spelling exists below this node.
        const targetNode = traverseTrie(node, graftTo);
        if (!targetNode) continue;
        const codePoints = [...inputPhrase];
        const graftedPath = Array.from<TrieNode>({ length: codePoints.length - 1 });
        let isGrafted = false;
        let currentNode = node;
        for (let i = 0; i < codePoints.length; i++) {
          const codePoint = codePoints[i]!.codePointAt(0)!;
          let childNode = currentNode.children.get(codePoint);
          if (i === codePoints.length - 1) {
            // Last step: link directly to the canonical target node.
            if (childNode) {
              if (childNode !== targetNode) throw new Error(`Grafted path ${inputPhrase} conflicts with existing path`);
              // Already grafted
            } else {
              currentNode.children.set(codePoint, childNode = targetNode);
              isGrafted = true;
            }
          } else {
            if (!childNode) {
              // Create a synthetic intermediate node carrying the target's subtree IDs.
              childNode = newNode(currentNode);
              childNode.subTreeTokenIds = targetNode.subTreeTokenIds;
              currentNode.children.set(codePoint, childNode);
            } else {
              // Part of another grafted path?
              childNode.subTreeTokenIds = Array.from(new Set([...childNode.subTreeTokenIds, ...targetNode.subTreeTokenIds]));
            }
            graftedPath[i] = currentNode = childNode;
          }
        }
        if (isGrafted) for (const [i, nodeToAdd] of graftedPath.entries()) nodesWithNewGraftedChildren.set(nodeToAdd, i + 1);
      }
      if (nodesWithNewGraftedChildren.size > 0) {
        // Re-check graft rules on the newly grafted path
        // 1. No need to recursive other children (not on this path) since their children are not affected
        // 2. No need to consider ancestors of this node since they're handled later (we run in DFS order)
        const sortedNodes = [...nodesWithNewGraftedChildren.entries()].sort((a, b) => b[1] - a[1]);
        for (const [changedNode] of sortedNodes) graftFromNode(changedNode, false);
      } else {
        // No new grafts applied
        break;
      }
    }
  };
  graftFromNode(root, true);
};
/**
 * Serializes a trie (possibly a DAG after grafting — a node can be reachable
 * through several parents; shared nodes are emitted once) into a flat number
 * array. Node IDs start at 1; 0 in the parent slot means "root".
 *
 * Per-node layout: [parentId, ...childCodePoints, ...childNodeIds, then the
 * node's token IDs encoded as -(tokenId + 1), or a single 0 terminator when
 * the node holds no tokens]. Values <= 0 are neither valid code points nor
 * node IDs, which is what marks the end of the children section.
 * NOTE(review): how the decoder splits code points from child IDs is defined
 * by `deserializeTrie`, which is not visible here — keep the two in sync.
 */
export const serializeTrie = (root: TrieNode) => {
  // Bookkeeping per node: assigned ID, whether its data was emitted, and the data itself.
  const nodeEntries = new Map<TrieNode, {
    id: number;
    visited: boolean;
    data?: number[];
  }>();
  let currentId = 0;
  // Assigns an ID on first sight so parents/children can be referenced before serialization.
  const getNodeEntry = (node: TrieNode) => {
    let entry = nodeEntries.get(node);
    if (!entry) {
      entry = { id: ++currentId, visited: false };
      nodeEntries.set(node, entry);
    }
    return entry;
  };
  const serializeNode = (node: TrieNode) => {
    const entry = getNodeEntry(node);
    if (entry.visited) return entry.id; // already emitted (shared DAG node)
    entry.visited = true;
    const children = [...node.children.entries()].map(([codePoint, childNode]) => [codePoint, serializeNode(childNode)] as const);
    entry.data = [
      node.parent ? getNodeEntry(node.parent).id : 0,
      ...children.map(child => child[0]), // code points
      ...children.map(child => child[1]), // child node ids
      // End of children list (<= 0 are not valid code points nor node IDs)
      ...node.tokenIds.length > 0
        ? node.tokenIds.map(tokenId => -(tokenId + 1)) // Use the negative value of (tokenId + 1)
        : [0], // End of children list, no token IDs (token IDs are encoded to negative values)
    ];
    return entry.id;
  };
  serializeNode(root);
  // Concatenate node records in ID order.
  return [...nodeEntries.values()].sort((a, b) => a.id - b.id).flatMap(node => node.data ?? []);
};
+26
View File
@@ -0,0 +1,26 @@
import { getSpanLength, TokenType } from '../common';
import type { SearchResult } from './search';
export type HighlightedTextPart = /* not highlighted */ string | /* highlighted */ { highlight: string };
/**
 * Splits a matched document into alternating plain and highlighted parts.
 * For a prefix-matched kana token, only a proportional leading fraction of the
 * token's document span is highlighted (at least one code point); every other
 * token is highlighted in full.
 */
export const highlightSearchResult = (resultDocument: SearchResult): HighlightedTextPart[] => {
  const { documentCodePoints, tokens } = resultDocument;
  const parts: HighlightedTextPart[] = [];
  let cursor = 0;
  for (const token of tokens) {
    const { start } = token.documentOffset;
    // Plain gap between the previous highlight and this token.
    const plain = documentCodePoints.slice(cursor, start).join('');
    if (plain.length > 0) parts.push(plain);
    let highlightEnd;
    if (token.isTokenPrefixMatching && (token.definition.type === TokenType.Kana)) {
      // Highlight the covered fraction of the span, rounded, but never less than one code point.
      const coveredFraction = Math.min(1, getSpanLength(token.inputOffset) / token.definition.codePointLength);
      highlightEnd = start + Math.max(1, Math.round(getSpanLength(token.documentOffset) * coveredFraction));
    } else {
      highlightEnd = token.documentOffset.end;
    }
    parts.push({ highlight: documentCodePoints.slice(start, highlightEnd).join('') });
    cursor = highlightEnd;
  }
  // Trailing plain text after the last highlight.
  if (cursor < documentCodePoints.length) parts.push(documentCodePoints.slice(cursor).join(''));
  return parts;
};
+4
View File
@@ -0,0 +1,4 @@
// Public searcher API surface: trie deserialization, index loading, search and highlighting.
export * from './trie';
export * from './inverted-index';
export * from './search';
export * from './highlight';
@@ -0,0 +1,59 @@
import { deserializeTrie } from './trie';
import type { TrieNode } from '../common';
import type { CompressedInvertedIndex, OffsetSpan, TokenDefinition } from '../common/types';
/** Where a token occurs: a document ID plus every code-point span of the token inside that document. */
export interface TokenDocumentReference {
  documentId: number;
  offsets: OffsetSpan[];
}
/** A token definition augmented with every document position where the token occurs. */
interface TokenDefinitionExtended extends TokenDefinition {
  references: TokenDocumentReference[];
}
// Merges the given maps left-to-right into a new Map; on duplicate keys the right-most map wins.
const mergeMap = <K, V>(...maps: Map<K, V>[]) => {
  const merged = new Map<K, V>();
  for (const map of maps) {
    map.forEach((value, key) => merged.set(key, value));
  }
  return merged;
};
/** A compressed inverted index decompressed into directly searchable structures. */
export interface LoadedInvertedIndex {
  // Original document texts, indexed by document ID.
  documents: string[];
  // Each document pre-split into code points (parallel to `documents`).
  documentCodePoints: string[][];
  // Token definitions (indexed by token ID) with their document references.
  tokenDefinitions: TokenDefinitionExtended[];
  // One trie per matching family; "other" holds Han/Pinyin/Raw tokens.
  tries: {
    romaji: TrieNode;
    kana: TrieNode;
    other: TrieNode;
  };
}
export const loadInvertedIndex = (compressed: CompressedInvertedIndex): LoadedInvertedIndex => {
const documents = compressed.documents;
const documentCodePoints = documents.map(document => [...document]);
const romajiTrie = deserializeTrie(compressed.tries.romaji);
const kanaTrie = deserializeTrie(compressed.tries.kana);
const otherTrie = deserializeTrie(compressed.tries.other);
const tokenCodePoints = mergeMap(romajiTrie.tokenCodePoints, kanaTrie.tokenCodePoints, otherTrie.tokenCodePoints);
const tokenDefinitions = compressed.tokenTypes.map<TokenDefinitionExtended>((type, index) => ({
id: index, type, text: tokenCodePoints.get(index)!.join(''),
codePointLength: tokenCodePoints.get(index)!.length,
references: compressed.tokenReferences[index]!.map<TokenDocumentReference>(([documentId, ...offsets]) => ({
documentId: documentId!,
offsets: Array.from({ length: offsets.length / 2 }, (_, i) => ({ start: offsets[i * 2]!, end: offsets[i * 2 + 1]! })),
})),
}));
return {
documents,
documentCodePoints,
tokenDefinitions,
tries: {
romaji: romajiTrie.root,
kana: kanaTrie.root,
other: otherTrie.root,
},
};
};
+258
View File
@@ -0,0 +1,258 @@
import { highlightSearchResult } from './highlight';
import { getTrieNodeTokenIds } from './trie';
import type { TrieNode } from '../common';
import { traverseTrieStep } from '../common';
import type { LoadedInvertedIndex } from './inverted-index';
import { normalizeByCodePoint, toKatakana } from '../common/normalize';
import { type OffsetSpan, type TokenDefinition, TokenType } from '../common/types';
import { getSpanLength } from '../common/utils';
// Code points skipped while walking the tries: whitespace plus the combining
// (han)dakuten marks U+3099/U+309A.
const IGNORABLE_CODE_POINTS = /[\s\u3099\u309A]/u;
// Whether the input may match only a prefix of a stored token of a given type.
enum TokenTypePrefixMatchingPolicy {
  AlwaysAllow,
  NeverAllow,
  AllowOnlyAtInputEnd,
}
// Per-token-type prefix matching policy used by `shouldAllowPrefixMatching`.
const tokenTypePrefixMatchingPolicy: Record<TokenType, TokenTypePrefixMatchingPolicy> = {
  [TokenType.Romaji]: TokenTypePrefixMatchingPolicy.NeverAllow,
  [TokenType.Kana]: TokenTypePrefixMatchingPolicy.AlwaysAllow,
  // These token types are in an "other" Trie
  [TokenType.Han]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
  [TokenType.Pinyin]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd,
  [TokenType.Raw]: TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd, // No effect because always 1 code point
};
// Resolves the per-type policy to a concrete yes/no for the current scan position.
const shouldAllowPrefixMatching = (tokenType: TokenType, isAtInputEnd: boolean) => {
  const policy = tokenTypePrefixMatchingPolicy[tokenType];
  if (policy === TokenTypePrefixMatchingPolicy.AlwaysAllow) return true;
  if (policy === TokenTypePrefixMatchingPolicy.NeverAllow) return false;
  // AllowOnlyAtInputEnd
  return isAtInputEnd;
};
/** One matched token inside a search result. */
export interface SearchResultToken {
  definition: TokenDefinition;
  // Matched span in the document, in code points.
  documentOffset: OffsetSpan;
  // Matched span in the search input, in code points.
  inputOffset: OffsetSpan;
  // True when the input only matched a prefix of the token, not the whole token.
  isTokenPrefixMatching: boolean;
}
// Accessors that let one comparator implementation rank intermediate,
// candidate and final results alike.
interface ComparableStateTraits<T> {
  getRangeCount: (state: T) => number;
  getPrefixMatchCount: (state: T) => number;
  getFirstTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastTokenDocumentOffset: (state: T) => OffsetSpan;
  getLastToken?: (state: T) => SearchResultToken; // Not on intermediate results
  getMatchRatioLevel?: (state: T) => number; // Not on intermediate/candidate results
  getMatchRatio: (state: T) => number;
  // Called when all other comparisons are equal
  nextComparer?: (a: T, b: T) => number; // Not on intermediate/candidate results
}
/**
 * Builds a comparator (negative means `a` ranks better) from the given traits.
 * Criteria in priority order: no end-of-input loose match > fewer discontinuous
 * ranges > earlier first token > coarse match-ratio level > earlier last token >
 * precise match ratio > `nextComparer` tiebreak.
 * NOTE(review): `getPrefixMatchCount` is declared and supplied by every traits
 * object but never consulted here — confirm whether that is intentional.
 */
const getComparerForTraits = <T>(traits: ComparableStateTraits<T>) => (a: T, b: T) => {
  // Prefer matches that not relying on end-of-input loose matching (full match over prefix match)
  if (traits.getLastToken) {
    const aLastToken = traits.getLastToken(a), bLastToken = traits.getLastToken(b);
    const aDidPrefixMatchByTokenType = aLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[aLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    const bDidPrefixMatchByTokenType = bLastToken.isTokenPrefixMatching && tokenTypePrefixMatchingPolicy[bLastToken.definition.type] === TokenTypePrefixMatchingPolicy.AllowOnlyAtInputEnd;
    if (aDidPrefixMatchByTokenType !== bDidPrefixMatchByTokenType) return aDidPrefixMatchByTokenType ? 1 : -1;
  }
  // Prefer results that matched fewer discontinuous ranges over more
  const aRangeCount = traits.getRangeCount(a), bRangeCount = traits.getRangeCount(b);
  if (aRangeCount !== bRangeCount) return aRangeCount - bRangeCount;
  // Prefer results that matches first token in document earlier over later
  const aFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(a), bFirstTokenDocumentOffset = traits.getFirstTokenDocumentOffset(b);
  if (aFirstTokenDocumentOffset.start !== bFirstTokenDocumentOffset.start) return aFirstTokenDocumentOffset.start - bFirstTokenDocumentOffset.start;
  // Prefer results that has higher match ratio (but don't distinguish similar ratios, so we introduced `matchRatioLevel`)
  if (traits.getMatchRatioLevel) {
    const aMatchRatioLevel = traits.getMatchRatioLevel(a), bMatchRatioLevel = traits.getMatchRatioLevel(b);
    if (aMatchRatioLevel !== bMatchRatioLevel) return bMatchRatioLevel - aMatchRatioLevel;
  }
  // Prefer results that last token occurred earlier (if same, ended earlier) in the document over later
  const aLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(a), bLastTokenDocumentOffset = traits.getLastTokenDocumentOffset(b);
  if (aLastTokenDocumentOffset.start !== bLastTokenDocumentOffset.start) return aLastTokenDocumentOffset.start - bLastTokenDocumentOffset.start;
  if (aLastTokenDocumentOffset.end !== bLastTokenDocumentOffset.end) return aLastTokenDocumentOffset.end - bLastTokenDocumentOffset.end;
  // Prefer results that has higher match ratio (precisely)
  const aMatchRatio = traits.getMatchRatio(a), bMatchRatio = traits.getMatchRatio(b);
  if (aMatchRatio !== bMatchRatio) return bMatchRatio - aMatchRatio;
  return traits.nextComparer?.(a, b) ?? 0;
};
// One DP cell: the best way (per comparator) to match a prefix of the input
// ending with this token occurrence; `previousState` links cells for backtracking.
interface IntermediateResult {
  previousState?: IntermediateResult;
  // Document offset of the first matched token in the chain.
  firstTokenDocumentOffset: OffsetSpan;
  // Number of discontinuous matched ranges in the document so far.
  rangeCount: number;
  tokenCount: number;
  prefixMatchCount: number;
  // Sum of matched document span lengths, scaled down for partial (prefix) matches.
  matchedTokenLength: number;
  tokenId: number;
  documentOffset: OffsetSpan;
  inputOffset: OffsetSpan;
  isTokenPrefixMatching: boolean;
}
const compareIntermediateResult = getComparerForTraits<IntermediateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.firstTokenDocumentOffset,
  getLastTokenDocumentOffset: state => state.documentOffset,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide document length since intermediate results are for same document
});
// A fully backtracked match chain inside one document, before final scoring.
interface CandidateResult {
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  matchedTokenLength: number;
  rangeCount: number;
}
const compareCandidateResult = getComparerForTraits<CandidateResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchedTokenLength, // No need to divide document length since candidate results are for same document
});
/** A matched document with its matched tokens and ranking scores. */
export interface SearchResult {
  documentId: number;
  documentText: string;
  documentCodePoints: string[];
  tokens: SearchResultToken[];
  prefixMatchCount: number;
  rangeCount: number;
  matchRatio: number;
  matchRatioLevel: number;
}
const compareFinalResult = getComparerForTraits<SearchResult>({
  getRangeCount: state => state.rangeCount,
  getPrefixMatchCount: state => state.prefixMatchCount,
  getFirstTokenDocumentOffset: state => state.tokens[0]!.documentOffset,
  getLastTokenDocumentOffset: state => state.tokens[state.tokens.length - 1]!.documentOffset,
  getLastToken: state => state.tokens[state.tokens.length - 1]!,
  getMatchRatio: state => state.matchRatio,
  // Bucket the ratio into 6 coarse levels so near-equal ratios don't dominate ordering.
  getMatchRatioLevel: state => Math.round(state.matchRatio * 5),
  // Final deterministic tiebreak: lexicographic document text.
  nextComparer: (a, b) => a.documentText === b.documentText ? 0 : a.documentText < b.documentText ? -1 : 1,
});
const hasNonEmptyCharacters = (documentCodePoints: string[], start: number, end: number) => start !== end && !documentCodePoints.slice(start, end).every(char => /\s/.test(char));
export const searchInvertedIndex = (invertedIndex: LoadedInvertedIndex, text: string): SearchResult[] => {
const { documents, documentCodePoints, tokenDefinitions, tries } = invertedIndex;
const codePoints = [...toKatakana(normalizeByCodePoint(text))];
// dp[i] = docId => end => IntermediateResult, starts from dp[-1] (l === 0), ends at dp[N - 1] (r === N - 1)
const dp = Array.from({ length: codePoints.length }, () => new Map<number, Record<number, IntermediateResult>>());
// DP over input code points: for every start position `l`, walk the tries forward;
// dp[r] holds, per document, the best partial match covering input[0..r], keyed by
// the document offset where that match ends.
for (let l = 0; l < codePoints.length; l++) {
if (l !== 0 && dp[l - 1]!.size === 0) continue; // No documents match input from beginning to this position
// Traverse all three tries (romaji / kana / other spellings) in lockstep from `l`.
let romajiNode: TrieNode | undefined = tries.romaji;
let kanaNode: TrieNode | undefined = tries.kana;
let otherNode: TrieNode | undefined = tries.other;
for (let r = l; r < codePoints.length && (romajiNode || kanaNode || otherNode); r++) { // [l, r]
const codePoint = codePoints[r]!;
// Advance each trie by one code point (IGNORABLE_CODE_POINTS is passed so the
// traversal can treat ignorable characters specially — see traverseTrieStep).
romajiNode = traverseTrieStep(romajiNode, codePoint, IGNORABLE_CODE_POINTS);
kanaNode = traverseTrieStep(kanaNode, codePoint, IGNORABLE_CODE_POINTS);
otherNode = traverseTrieStep(otherNode, codePoint, IGNORABLE_CODE_POINTS);
const reachingInputEnd = r === codePoints.length - 1;
const matchingTokenIds = new Set([
// Allow suffix matching of romaji/other tokens if we're at the end of the input
...getTrieNodeTokenIds(romajiNode, shouldAllowPrefixMatching(TokenType.Romaji, reachingInputEnd)),
...getTrieNodeTokenIds(kanaNode, shouldAllowPrefixMatching(TokenType.Kana, reachingInputEnd)),
...getTrieNodeTokenIds(otherNode, reachingInputEnd),
]);
for (const tokenId of matchingTokenIds) for (const { documentId, offsets } of tokenDefinitions[tokenId]!.references) {
// Prefix match: none of the current trie nodes owns this token directly, so it was
// found via a subtree lookup — the input span only covers a prefix of the token.
const isTokenPrefixMatching = !romajiNode?.tokenIds.includes(tokenId) && !kanaNode?.tokenIds.includes(tokenId) && !otherNode?.tokenIds.includes(tokenId);
const previousMatchesOfDocument = dp[l - 1]?.get(documentId);
if (l !== 0 && !previousMatchesOfDocument) continue; // This document has no match ending right before `l`
for (const documentOffset of offsets) {
const { start: currentStart, end: currentEnd } = documentOffset;
// Extend `previousState` (or start fresh when l === 0) with this token occurrence,
// keeping only the best state per (document, end-offset) pair.
const contributeNextMatchingState = (previousState: IntermediateResult | undefined) => {
const nextMatchingMap = dp[r]!;
let nextMatchesOfDocument = nextMatchingMap.get(documentId);
if (!nextMatchesOfDocument) {
nextMatchesOfDocument = Object.create(null) as Record<number, IntermediateResult>;
nextMatchingMap.set(documentId, nextMatchesOfDocument);
}
const oldResult = nextMatchesOfDocument[currentEnd];
const inputOffset = { start: l, end: r + 1 };
const newResult: IntermediateResult = {
previousState,
firstTokenDocumentOffset: previousState?.firstTokenDocumentOffset ?? documentOffset,
// A new highlight range starts only when non-empty characters separate this
// token from the previous one inside the document.
rangeCount: !previousState ? 1
: (previousState.rangeCount + (hasNonEmptyCharacters(documentCodePoints[documentId]!, previousState.documentOffset.end, currentStart) ? 1 : 0)),
tokenCount: (previousState?.tokenCount ?? 0) + 1,
prefixMatchCount: (previousState?.prefixMatchCount ?? 0) + (isTokenPrefixMatching ? 1 : 0),
// Credit the matched document span; prefix matches are scaled down by the
// fraction of the token the input actually covered (capped at 1).
matchedTokenLength: (previousState?.matchedTokenLength ?? 0) + getSpanLength(documentOffset) *
Math.min(isTokenPrefixMatching ? getSpanLength(inputOffset) / tokenDefinitions[tokenId]!.codePointLength : Infinity, 1),
tokenId,
documentOffset,
inputOffset,
isTokenPrefixMatching,
};
// Keep whichever state compares better (negative comparator result favors newResult).
nextMatchesOfDocument[currentEnd] = !oldResult || compareIntermediateResult(newResult, oldResult) < 0 ? newResult : oldResult;
};
if (l === 0) contributeNextMatchingState(undefined);
// Chain onto any previous match whose document span ends at or before this token's start.
else for (const previousEnd in previousMatchesOfDocument) if (currentStart >= Number(previousEnd))
contributeNextMatchingState(previousMatchesOfDocument[previousEnd as unknown as number]!);
// Don't `break` here because keys of `previousMatchesOfDocument` are not essentially ordered
}
}
}
}
// Build search results and sort documents
// dp[last] contains only matches that consumed the whole input; one SearchResult per document.
return [...dp[codePoints.length - 1]!.entries()].map<SearchResult>(([documentId, matches]) => {
// Materialize every complete match of this document and rank them, best first.
const sortedMatches = Object.values(matches).map<CandidateResult>(match => {
const tokens: SearchResultToken[] = [];
// Build token list from backtracking
// (walk the previousState chain backwards, prepending so tokens end up in input order).
let state: IntermediateResult | undefined = match;
while (state) {
tokens.unshift({
definition: tokenDefinitions[state.tokenId]!,
documentOffset: state.documentOffset, inputOffset: state.inputOffset,
isTokenPrefixMatching: state.isTokenPrefixMatching,
});
state = state.previousState;
}
return { tokens, prefixMatchCount: match.prefixMatchCount, matchedTokenLength: match.matchedTokenLength, rangeCount: match.rangeCount };
}).sort(compareCandidateResult);
const bestMatchOfDocument = sortedMatches[0]!;
const documentText = documents[documentId]!;
// Fraction of the document's code points covered by matched tokens, also quantized
// to a coarse 0..5 level.
const matchRatio = bestMatchOfDocument.matchedTokenLength / documentCodePoints[documentId]!.length;
const matchRatioLevel = Math.round(matchRatio * 5);
return {
documentId,
documentText,
documentCodePoints: documentCodePoints[documentId]!,
tokens: bestMatchOfDocument.tokens,
prefixMatchCount: bestMatchOfDocument.prefixMatchCount,
rangeCount: bestMatchOfDocument.rangeCount,
matchRatio,
matchRatioLevel,
};
}).sort(compareFinalResult);
};
// For debugging
export const inspectSearchResult = (resultDocument: SearchResult, htmlHighlight: boolean) => {
  const { documentText, tokens, rangeCount, matchRatio, matchRatioLevel } = resultDocument;
  // Escape the characters unsafe in HTML element content.
  const escapeHtml = (raw: string) => raw.replaceAll('&', '&amp;').replaceAll('<', '&lt;').replaceAll('>', '&gt;');
  // Header: the document text (underlined/bold highlights in HTML mode, JSON-quoted otherwise).
  let headerText: string;
  if (htmlHighlight) {
    headerText = highlightSearchResult(resultDocument)
      .map(part => (typeof part === 'string' ? escapeHtml(part) : `<u><b>${escapeHtml(part.highlight)}</b></u>`))
      .join('');
  } else {
    headerText = JSON.stringify(documentText);
  }
  const description = ` (${rangeCount} ranges, ${Math.round(matchRatio * 10000) / 10000} => L${matchRatioLevel})`;
  const lines = [headerText + (htmlHighlight ? `<code>${description}</code>` : description)];
  // One detail line per matched token: token type, token text, and the matched document span.
  for (const token of tokens) {
    const { start, end } = token.documentOffset;
    let tokenText = JSON.stringify(token.definition.text);
    let matchedText = JSON.stringify([...documentText].slice(start, end).join(''));
    if (htmlHighlight) {
      tokenText = escapeHtml(tokenText);
      matchedText = escapeHtml(matchedText);
    }
    const line = ` ${TokenType[token.definition.type]}: ${tokenText} -> ${matchedText}${token.isTokenPrefixMatching ? ' (prefix match)' : ''}`;
    lines.push(htmlHighlight ? `<code>${line}</code>` : line);
  }
  lines.push(''); // trailing newline after join
  return lines.join('\n');
};
+58
View File
@@ -0,0 +1,58 @@
import type { TrieNode } from '../common';
/**
 * Rebuilds a trie from its flat serialized form.
 *
 * The serialized stream is a sequence of node records in node-id order (ids are
 * 1-based). Each record is: the parent id (0 for the root), then a run of positive
 * numbers whose first half are child code points and second half the corresponding
 * child node ids, then the node's token ids encoded as `-(tokenId + 1)` — or a
 * single `0` marker when the node stores no token ids.
 *
 * Returns the root node plus a map from token id to the token's canonical
 * code-point path in the trie.
 */
export const deserializeTrie = (data: number[]) => {
  const nodes: TrieNode[] = [];
  // Nodes are created lazily on first reference, so forward references to child ids work.
  const nodeById = (id: number) => nodes[id - 1] ??= { parent: undefined, children: new Map(), tokenIds: [], subTreeTokenIds: [] };
  let nextId = 0;
  let cursor = 0;
  while (cursor < data.length) {
    const node = nodeById(++nextId);
    const parentId = data[cursor++]!;
    node.parent = parentId === 0 ? undefined : nodeById(parentId);
    // The children section is the maximal run of positive values: code points first,
    // matching child ids second.
    let runEnd = cursor;
    while (runEnd < data.length && data[runEnd]! > 0) runEnd++;
    const childCount = (runEnd - cursor) / 2;
    for (let k = 0; k < childCount; k++) node.children.set(data[cursor + k]!, nodeById(data[cursor + childCount + k]!));
    cursor = runEnd;
    if (data[cursor] === 0) cursor++; // explicit "no token IDs" marker
    else while (cursor < data.length && data[cursor]! < 0) node.tokenIds.push(-data[cursor++]! - 1);
  }
  const root = nodes[0]!;
  // DFS to recover each token's canonical spelling as a code-point path.
  const tokenCodePoints = new Map<number, string[]>();
  const pathSoFar: string[] = [];
  const collectCodePoints = (node: TrieNode) => {
    for (const tokenId of node.tokenIds) tokenCodePoints.set(tokenId, [...pathSoFar]);
    for (const [codePoint, child] of node.children.entries()) {
      // Skip grafted paths as these are not the canonical representation of the tokens
      // (a grafted child's `parent` points at its owning node, not at us).
      if (child.parent !== node) continue;
      pathSoFar.push(String.fromCodePoint(codePoint));
      collectCodePoints(child);
      pathSoFar.pop();
    }
  };
  collectCodePoints(root);
  // Memoized DFS to compute, for every node, the token ids stored anywhere in its subtree.
  const visited = new Set<TrieNode>();
  const collectSubTreeTokenIds = (node: TrieNode): number[] => {
    if (visited.has(node)) return node.subTreeTokenIds;
    visited.add(node);
    const fromChildren = new Set([...node.children.values()].flatMap(child => collectSubTreeTokenIds(child)));
    node.subTreeTokenIds = [...node.tokenIds, ...fromChildren];
    return node.subTreeTokenIds;
  };
  collectSubTreeTokenIds(root);
  return {
    root,
    tokenCodePoints,
  };
};
/** Token IDs stored at `node` (or, when `includeSubTree`, anywhere beneath it); `[]` for a missing node. */
export const getTrieNodeTokenIds = (node: TrieNode | undefined, includeSubTree: boolean) => {
  if (!node) return [];
  return includeSubTree ? node.subTreeTokenIds : node.tokenIds;
};
+23
View File
@@ -0,0 +1,23 @@
{
"compilerOptions": {
"target": "ESNext",
"jsx": "preserve",
"lib": ["DOM", "DOM.Iterable", "ESNext", "WebWorker"],
"module": "ESNext",
"moduleResolution": "Bundler",
"noUncheckedIndexedAccess": true,
"resolveJsonModule": true,
"allowJs": true,
"strict": true,
"strictNullChecks": true,
"noEmit": true,
"esModuleInterop": true,
"forceConsistentCasingInFileNames": true,
"isolatedModules": true,
"skipLibCheck": true,
"rootDir": ".",
"outDir": "dist"
},
"include": ["src/**/*.ts"],
"exclude": ["dist", "node_modules"]
}
+15
View File
@@ -0,0 +1,15 @@
import { defineConfig } from 'tsdown';
// Build configuration: one entry per public subpath of the package
// (root, searcher, indexer, common — matching the package's export structure).
export default defineConfig({
entry: [
'./src/index.ts',
'./src/searcher/index.ts',
'./src/indexer/index.ts',
'./src/common/index.ts',
],
dts: true, // emit .d.ts type declarations alongside the JS output
unused: true, // NOTE(review): presumably enables tsdown's unused-dependency detection — confirm against tsdown docs
fixedExtension: true, // use fixed .mjs/.cjs output extensions rather than format-dependent .js
unbundle: true, // preserve the source module structure instead of bundling into single files
sourcemap: true,
});