Files
needLe/dotnet/MaigoLabs.NeedLe.Indexer/InvertedIndexBuilder.cs
T
2026-01-06 22:33:54 +08:00

70 lines
3.1 KiB
C#

using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MaigoLabs.NeedLe.Common.Types;
using MaigoLabs.NeedLe.Indexer.Japanese;
using MaigoLabs.NeedLe.Indexer.Trie;
namespace MaigoLabs.NeedLe.Indexer;
/// <summary>
/// Settings that control how <c>InvertedIndexBuilder.BuildInvertedIndex</c> assembles its result.
/// </summary>
public class InvertedIndexBuilderOptions
{
    /// <summary>
    /// Whether the source documents are stored inside the produced inverted index.
    /// When set to <c>false</c> the index is built without the documents, and they must be
    /// passed explicitly when the index is loaded.
    /// </summary>
    /// <value>Defaults to <c>true</c>.</value>
    public bool BundleDocuments { get; set; } = true;
}
/// <summary>
/// Builds a <see cref="CompressedInvertedIndex"/> from raw documents: every document is tokenized,
/// the resulting token definitions are split into three serialized tries (romaji, kana, other),
/// and the positions of each token in each document are recorded.
/// </summary>
public static class InvertedIndexBuilder
{
    /// <summary>
    /// Builds a trie from the token definitions whose <see cref="TokenType"/> satisfies
    /// <paramref name="typePredicate"/>. Each selected token is handed to
    /// <c>TrieBuilder.BuildTrie</c> as an (id, code points of its text) pair.
    /// </summary>
    /// <param name="tokenDefinitions">All known token definitions.</param>
    /// <param name="typePredicate">Returns <c>true</c> for token types that belong in this trie.</param>
    /// <returns>The root node of the resulting trie.</returns>
    private static TrieNode BuildTypedTrie(IEnumerable<TokenDefinition> tokenDefinitions, Func<TokenType, bool> typePredicate) =>
        TrieBuilder.BuildTrie(tokenDefinitions
            .Where(token => typePredicate(token.Type))
            .Select(token => (token.Id, CodePoints: token.Text.ToCodePoints())));

    /// <summary>
    /// Tokenizes <paramref name="documents"/> and builds the inverted index for them.
    /// </summary>
    /// <param name="documents">The documents to index. A document's id is its position in this array.</param>
    /// <param name="tokenizerOptions">Options forwarded to the <see cref="Tokenizer"/>; may be <c>null</c>.</param>
    /// <param name="invertedIndexBuilderOptions">
    /// Builder options; when <c>null</c>, documents are bundled with the index
    /// (same as <see cref="InvertedIndexBuilderOptions.BundleDocuments"/> being <c>true</c>).
    /// </param>
    /// <returns>
    /// An index whose <c>tokenReferences[tokenId]</c> holds one array per document containing that token,
    /// laid out as <c>[documentId, start, end, start, end, ...]</c>, together with the token types and
    /// the three serialized tries. <c>documents</c> is the input array when bundling is enabled, otherwise <c>null</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="documents"/> is <c>null</c>.</exception>
    public static CompressedInvertedIndex BuildInvertedIndex(
        string[] documents,
        TokenizerOptions? tokenizerOptions = null,
        InvertedIndexBuilderOptions? invertedIndexBuilderOptions = null)
    {
        // Fail with the caller's parameter name instead of LINQ's internal "source".
        if (documents is null)
        {
            throw new ArgumentNullException(nameof(documents));
        }

        var tokenizer = new Tokenizer(tokenizerOptions);

        // Tokenizing must finish for every document before tokenizer.Tokens is read below.
        var documentTokens = documents.Select(tokenizer.Tokenize).ToArray();

        // Snapshot the definitions once (they are consumed five times below) and order them by id:
        // tokenTypes/tokenReferences are positional and tokenReferences is indexed by token id further down,
        // so position must equal id regardless of the enumeration order of Tokens.Values.
        // NOTE(review): assumes token ids are contiguous and zero-based - confirm against Tokenizer.
        var tokenDefinitions = tokenizer.Tokens.Values.OrderBy(token => token.Id).ToArray();

        var romajiRoot = BuildTypedTrie(tokenDefinitions, type => type == TokenType.Romaji);
        var kanaRoot = BuildTypedTrie(tokenDefinitions, type => type == TokenType.Kana);
        var otherRoot = BuildTypedTrie(tokenDefinitions, type => type != TokenType.Romaji && type != TokenType.Kana);

        // Apply the Japanese normalization rule tables to the romaji and kana tries before serialization.
        // NOTE(review): presumably this lets alternative spellings reach the same tokens - confirm in TrieBuilder.
        TrieBuilder.GraftTriePaths(romajiRoot, JapaneseNormalization.NORMALIZE_RULES_ROMAJI_CODEPOINTS);
        TrieBuilder.GraftTriePaths(kanaRoot, JapaneseNormalization.NORMALIZE_RULES_KANA_DAKUTEN_CODEPOINTS);

        var invertedIndex = new CompressedInvertedIndex
        {
            documents = (invertedIndexBuilderOptions?.BundleDocuments ?? true) ? documents : null,
            tokenTypes = [.. tokenDefinitions.Select(token => (int)token.Type)],
            tokenReferences = [.. tokenDefinitions.Select(_ => new List<int[]>())],
            tries = new CompressedInvertedIndexTries
            {
                romaji = TrieSerializer.Serialize(romajiRoot),
                kana = TrieSerializer.Serialize(kanaRoot),
                other = TrieSerializer.Serialize(otherRoot),
            },
        };

        for (var documentId = 0; documentId < documents.Length; documentId++)
        {
            var tokens = documentTokens[documentId];

            // Per document: token id -> flat list of (start, end) pairs, in token order.
            var tokenOccurrences = new Dictionary<int, List<int>>();
            foreach (var token in tokens)
            {
                if (!tokenOccurrences.TryGetValue(token.Id, out var occurrences))
                {
                    tokenOccurrences[token.Id] = occurrences = [];
                }

                occurrences.Add(token.Start);
                occurrences.Add(token.End);
            }

            // One reference entry per (token, document): the document id followed by its position pairs.
            foreach (var (tokenId, occurrences) in tokenOccurrences)
            {
                invertedIndex.tokenReferences[tokenId].Add([documentId, .. occurrences]);
            }
        }

        return invertedIndex;
    }
}