fix: skip empty input and handle ignored code points efficiently

This commit is contained in:
Menci
2026-01-06 22:56:34 +08:00
parent 63dcd3fb75
commit a64d716ef5
2 changed files with 18 additions and 6 deletions
@@ -167,6 +167,8 @@ public static class InvertedIndexSearcher
public static SearchResult[] Search(LoadedInvertedIndex invertedIndex, string text, InvertedIndexSearcherOptions? options = null)
{
if (string.IsNullOrWhiteSpace(text)) return [];
var documents = invertedIndex.Documents;
var documentCodePoints = invertedIndex.DocumentCodePoints;
var tokenDefinitions = invertedIndex.TokenDefinitions;
@@ -184,9 +186,13 @@ public static class InvertedIndexSearcher
for (var r = l; r < codePoints.Length && (romajiNode != null || kanaNode != null || otherNode != null); r++) // [l, r]
{
var codePoint = codePoints[r];
romajiNode = romajiNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
kanaNode = kanaNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
otherNode = otherNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
var nextRomajiNode = romajiNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
var nextKanaNode = kanaNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
var nextOtherNode = otherNode.TraverseStep(codePoint, IsIgnorableCodePoint(codePoint));
if (nextRomajiNode == romajiNode && nextKanaNode == kanaNode && nextOtherNode == otherNode) continue; // This code point is fully ignored on current state
romajiNode = nextRomajiNode;
kanaNode = nextKanaNode;
otherNode = nextOtherNode;
var reachingInputEnd = r == codePoints.Length - 1;
HashSet<int> matchingTokenIds =
[
+9 -3
View File
@@ -151,6 +151,8 @@ export const searchInvertedIndex = (
filterDocument?: (documentId: number) => unknown;
},
): SearchResult[] => {
if (!text.trim()) return [];
const { documents, documentCodePoints, tokenDefinitions, tries } = invertedIndex;
const codePoints = [...toKatakana(normalizeByCodePoint(text))];
@@ -163,9 +165,13 @@ export const searchInvertedIndex = (
let otherNode: TrieNode | undefined = tries.other;
for (let r = l; r < codePoints.length && (romajiNode || kanaNode || otherNode); r++) { // [l, r]
const codePoint = codePoints[r]!;
romajiNode = traverseTrieStep(romajiNode, codePoint, IGNORABLE_CODE_POINTS);
kanaNode = traverseTrieStep(kanaNode, codePoint, IGNORABLE_CODE_POINTS);
otherNode = traverseTrieStep(otherNode, codePoint, IGNORABLE_CODE_POINTS);
const nextRomajiNode = traverseTrieStep(romajiNode, codePoint, IGNORABLE_CODE_POINTS);
const nextKanaNode = traverseTrieStep(kanaNode, codePoint, IGNORABLE_CODE_POINTS);
const nextOtherNode = traverseTrieStep(otherNode, codePoint, IGNORABLE_CODE_POINTS);
if (nextRomajiNode === romajiNode && nextKanaNode === kanaNode && nextOtherNode === otherNode) continue; // This code point is fully ignored on current state
romajiNode = nextRomajiNode;
kanaNode = nextKanaNode;
otherNode = nextOtherNode;
const reachingInputEnd = r === codePoints.length - 1;
const matchingTokenIds = new Set([
// Allow suffix matching of romaji/other tokens if we're at the end of the input