Files
needLe/dotnet/MaigoLabs.NeedLe.Tests/Indexer/TokenizerTests.cs
T
2026-01-01 03:40:41 +08:00

166 lines
6.9 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using MaigoLabs.NeedLe.Common.Types;
using MaigoLabs.NeedLe.Indexer;
namespace MaigoLabs.NeedLe.Tests.Indexer;
/// <summary>
/// Tokenizes a string mixing kanji and hiragana and checks that Han, Pinyin, Kana and Romaji
/// tokens are produced, with the expected texts covering each character position.
/// </summary>
public sealed class Tokenizer_TokenizesMixedJapaneseTextTest : NeedleTestBase
{
    /// <summary>
    /// Runs the tokenizer over "僕の和風本当上手" and asserts the expected token texts per position.
    /// </summary>
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);
        var tokens = tokenizer.Tokenize("僕の和風本当上手");
        var tokenDefs = tokenizer.Tokens.Values.ToList();

        // Should have tokens of various types
        var types = tokenDefs.Select(t => t.Type).ToHashSet();
        Assert.Contains(TokenType.Han, types);
        Assert.Contains(TokenType.Pinyin, types);
        Assert.Contains(TokenType.Kana, types);
        Assert.Contains(TokenType.Romaji, types);

        // Helper to get token texts at a specific position by type.
        // A token covers `pos` when Start <= pos < End; its definition is looked up by Id.
        List<string> GetTokenTextsAt(int pos, TokenType type) => tokens
            .Where(t => t.Start <= pos && t.End > pos)
            .Select(t => tokenDefs.First(d => d.Id == t.Id))
            .Where(d => d.Type == type)
            .Select(d => d.Text)
            .ToList();

        // Position 0: 僕
        Assert.Contains("僕", GetTokenTextsAt(0, TokenType.Han));
        Assert.Contains("pu", GetTokenTextsAt(0, TokenType.Pinyin));
        Assert.Contains("ボク", GetTokenTextsAt(0, TokenType.Kana));
        Assert.Contains("boku", GetTokenTextsAt(0, TokenType.Romaji));

        // Position 1: の (hiragana, no Han/Pinyin)
        Assert.Empty(GetTokenTextsAt(1, TokenType.Han));
        Assert.Empty(GetTokenTextsAt(1, TokenType.Pinyin));
        // Expected as the katakana form ノ, like every other Kana expectation in this test;
        // an empty-string expectation here would not describe any real token.
        Assert.Contains("ノ", GetTokenTextsAt(1, TokenType.Kana));
        Assert.Contains("no", GetTokenTextsAt(1, TokenType.Romaji));

        // Position 2: 和
        Assert.Contains("和", GetTokenTextsAt(2, TokenType.Han));
        Assert.Contains("he", GetTokenTextsAt(2, TokenType.Pinyin));
        Assert.Contains("ワ", GetTokenTextsAt(2, TokenType.Kana));
        Assert.Contains("wa", GetTokenTextsAt(2, TokenType.Romaji));

        // Position 3: 風
        Assert.Contains("風", GetTokenTextsAt(3, TokenType.Han));
        Assert.Contains("风", GetTokenTextsAt(3, TokenType.Han)); // simplified variant
        Assert.Contains("feng", GetTokenTextsAt(3, TokenType.Pinyin));
        Assert.Contains("フウ", GetTokenTextsAt(3, TokenType.Kana));
        Assert.Contains("fu", GetTokenTextsAt(3, TokenType.Romaji));

        // Position 4: 本
        Assert.Contains("本", GetTokenTextsAt(4, TokenType.Han));
        Assert.Contains("ben", GetTokenTextsAt(4, TokenType.Pinyin));
        Assert.Contains("ホン", GetTokenTextsAt(4, TokenType.Kana));
        Assert.Contains("hon", GetTokenTextsAt(4, TokenType.Romaji));

        // Position 5: 当
        Assert.Contains("当", GetTokenTextsAt(5, TokenType.Han));
        Assert.Contains("當", GetTokenTextsAt(5, TokenType.Han)); // traditional variant
        Assert.Contains("dang", GetTokenTextsAt(5, TokenType.Pinyin));
        Assert.Contains("トウ", GetTokenTextsAt(5, TokenType.Kana));
        Assert.Contains("to", GetTokenTextsAt(5, TokenType.Romaji)); // normalized: tou -> to

        // Position 6: 上
        Assert.Contains("上", GetTokenTextsAt(6, TokenType.Han));
        Assert.Contains("shang", GetTokenTextsAt(6, TokenType.Pinyin));
        Assert.Contains("ジョウ", GetTokenTextsAt(6, TokenType.Kana));
        Assert.Contains("jo", GetTokenTextsAt(6, TokenType.Romaji)); // normalized: jou -> jo

        // Position 7: 手
        Assert.Contains("手", GetTokenTextsAt(7, TokenType.Han));
        Assert.Contains("shou", GetTokenTextsAt(7, TokenType.Pinyin));
        Assert.Contains("シュ", GetTokenTextsAt(7, TokenType.Kana));
        Assert.Contains("shu", GetTokenTextsAt(7, TokenType.Romaji));
    }
}
/// <summary>
/// Verifies that tokenizing several inputs with shared characters does not register
/// the same token definition twice.
/// </summary>
public sealed class Tokenizer_NoDuplicateTokensTest : NeedleTestBase
{
    /// <summary>
    /// Tokenizes three overlapping inputs, then checks that both the (type, text) pairs
    /// and the IDs of all token definitions are unique.
    /// </summary>
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);

        // Multiple music names that share some characters
        foreach (var musicName in new[] { "僕の和風本当上手", "僕", "和風" })
        {
            tokenizer.Tokenize(musicName);
        }

        var definitions = tokenizer.Tokens.Values.ToList();

        // No two definitions may share the same type/text combination
        var typeTextKeys = definitions.Select(def => $"{def.Type}:{def.Text}").ToList();
        Assert.Equal(typeTextKeys.Distinct().Count(), typeTextKeys.Count);

        // No two definitions may share the same ID
        var definitionIds = definitions.Select(def => def.Id).ToList();
        Assert.Equal(definitionIds.Distinct().Count(), definitionIds.Count);
    }
}
/// <summary>
/// Verifies that non-CJK input produces tokens of type <c>Raw</c> for each character.
/// </summary>
public sealed class Tokenizer_HandlesRawTokensForNonCjkTest : NeedleTestBase
{
    /// <summary>
    /// Tokenizes "a-b" and asserts that "a", "-" and "b" each appear among the raw token texts.
    /// </summary>
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);
        tokenizer.Tokenize("a-b");

        // Collect the text of every registered definition whose type is Raw
        var rawTexts = tokenizer.Tokens.Values
            .ToList()
            .Where(def => def.Type == TokenType.Raw)
            .Select(def => def.Text)
            .ToList();

        foreach (var expectedText in new[] { "a", "-", "b" })
        {
            Assert.Contains(expectedText, rawTexts);
        }
    }
}
/// <summary>
/// Verifies that "今日" yields both per-character readings and a combined reading
/// spanning the whole word.
/// </summary>
public sealed class Tokenizer_TokenizesCompoundWordKyouTest : NeedleTestBase
{
    /// <summary>
    /// Tokenizes "今日" and asserts, for each expected (type, span, text) entry, that a token
    /// with exactly that span resolves to a definition of that type and text.
    /// </summary>
    [Fact]
    public void Execute()
    {
        var tokenizer = new Tokenizer(TokenizerOptions);
        var tokens = tokenizer.Tokenize("今日");
        var tokenDefs = tokenizer.Tokens.Values.ToList();

        // Texts of all tokens of the given type whose span is exactly [start, end)
        List<string> GetTokensWithSpan(TokenType type, int start, int end) =>
            (from token in tokens
             where token.Start == start && token.End == end
             let definition = tokenDefs.First(d => d.Id == token.Id)
             where definition.Type == type
             select definition.Text).ToList();

        var expectations = new (TokenType Type, int Start, int End, string Text)[]
        {
            // Individual character readings at position 0: 今
            (TokenType.Han, 0, 1, "今"),
            (TokenType.Pinyin, 0, 1, "jin"),
            (TokenType.Kana, 0, 1, "コン"),
            (TokenType.Kana, 0, 1, "イマ"),
            (TokenType.Romaji, 0, 1, "kon"),
            (TokenType.Romaji, 0, 1, "ima"),

            // Individual character readings at position 1: 日
            (TokenType.Han, 1, 2, "日"),
            (TokenType.Pinyin, 1, 2, "ri"),
            (TokenType.Kana, 1, 2, "ニチ"),
            (TokenType.Kana, 1, 2, "ヒ"),
            (TokenType.Romaji, 1, 2, "niti"),
            (TokenType.Romaji, 1, 2, "hi"),

            // Combined reading for "今日" [0, 2] - this is an indivisible compound word
            (TokenType.Kana, 0, 2, "キョウ"),
            (TokenType.Romaji, 0, 2, "kyo"), // normalized: kyou -> kyo
        };

        foreach (var expected in expectations)
        {
            Assert.Contains(expected.Text, GetTokensWithSpan(expected.Type, expected.Start, expected.End));
        }
    }
}