Files
needLe/dotnet/MaigoLabs.NeedLe.Indexer/Japanese/JapaneseNormalization.cs
T
2026-01-01 03:40:41 +08:00

70 lines
3.3 KiB
C#

using MaigoLabs.NeedLe.Common.Extensions;
namespace MaigoLabs.NeedLe.Indexer.Japanese;
/// <summary>
/// Rewrite-rule tables, and the normalizers built from them, for romaji and kana text.
/// </summary>
public static class JapaneseNormalization
{
    /// <summary>Maps a string to its normalized form.</summary>
    /// <param name="text">The text to normalize.</param>
    /// <returns>The normalized text.</returns>
    public delegate string Normalizer(string text);

    /// <summary>
    /// Builds a <see cref="Normalizer"/> that applies <paramref name="rules"/> over and over until a
    /// complete pass leaves the text unchanged.
    /// </summary>
    /// <remarks>
    /// <para>
    /// One pass applies every rule once, in the dictionary's enumeration order, using
    /// <see cref="string.Replace(string, string)"/> (an ordinal match). Several passes are needed
    /// because one replacement can expose a new match for another rule.
    /// </para>
    /// <para>
    /// The dictionary is read on every call rather than copied, so later changes to it affect the
    /// returned normalizer. The rule set must converge: rules that keep producing new matches
    /// (for example a cycle such as a -> b, b -> a) make the normalizer loop forever.
    /// </para>
    /// </remarks>
    /// <param name="rules">Replacement rules; key is the text to find, value is its replacement.</param>
    /// <returns>A normalizer bound to <paramref name="rules"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="rules"/> is <c>null</c>.</exception>
    public static Normalizer CreateNormalizer(Dictionary<string, string> rules)
    {
        if (rules is null)
        {
            throw new ArgumentNullException(nameof(rules));
        }

        return text =>
        {
            if (text is null)
            {
                throw new ArgumentNullException(nameof(text));
            }

            string previous;
            do
            {
                previous = text;
                foreach (var (pattern, replacement) in rules)
                {
                    text = text.Replace(pattern, replacement);
                }
            }
            while (text != previous); // Fixed point reached: the last pass changed nothing.

            return text;
        };
    }

    /// <summary>
    /// Converts each rule's key and value into an <c>int[]</c> via the <c>ToCodePoints()</c> extension.
    /// </summary>
    /// <remarks>
    /// The result is materialized once, so enumerating it repeatedly does not redo the conversion
    /// or allocate new arrays. It is a snapshot: later changes to <paramref name="rules"/> are not
    /// reflected in a sequence that was already returned.
    /// NOTE(review): <c>ToCodePoints()</c> is declared outside this file; presumably it yields
    /// Unicode code points - confirm against its definition.
    /// </remarks>
    /// <param name="rules">Replacement rules; key is the text to find, value is its replacement.</param>
    /// <returns>One (From, To) pair per rule, in the dictionary's enumeration order.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="rules"/> is <c>null</c>.</exception>
    public static IEnumerable<(int[] From, int[] To)> ToCodePointPairs(Dictionary<string, string> rules)
    {
        if (rules is null)
        {
            throw new ArgumentNullException(nameof(rules));
        }

        return rules
            .Select(rule => (From: rule.Key.ToCodePoints().ToArray(), To: rule.Value.ToCodePoints().ToArray()))
            .ToArray();
    }

    /// <summary>Rules that fold spelling variants of romaji into one canonical spelling.</summary>
    public static readonly Dictionary<string, string> NORMALIZE_RULES_ROMAJI = new()
    {
        // Drop the long-vowel mark "-" (sa-ba- -> saba)
        ["-"] = "",
        // Collapse doubled vowels, and "ou", into a single vowel
        ["aa"] = "a",
        ["ii"] = "i",
        ["uu"] = "u",
        ["ee"] = "e",
        ["oo"] = "o",
        ["ou"] = "o",
        // mb/mp/mm -> nb/np/nm (shimbun -> shinbun)
        ["mb"] = "nb",
        ["mp"] = "np",
        ["mm"] = "nm",
        // Hepburn-style spellings -> their shorter Kunrei-style forms
        ["sha"] = "sya",
        ["tsu"] = "tu",
        ["chi"] = "ti",
        ["shi"] = "si",
        ["ji"] = "zi",
    };

    /// <summary><see cref="NORMALIZE_RULES_ROMAJI"/> as (From, To) arrays, computed once at type initialization.</summary>
    public static readonly IEnumerable<(int[] From, int[] To)> NORMALIZE_RULES_ROMAJI_CODEPOINTS = ToCodePointPairs(NORMALIZE_RULES_ROMAJI);

    /// <summary>Normalizer that applies <see cref="NORMALIZE_RULES_ROMAJI"/>.</summary>
    public static readonly Normalizer NormalizeRomaji = CreateNormalizer(NORMALIZE_RULES_ROMAJI);

    /// <summary>
    /// Rules that compose a base kana followed by a combining voiced (U+3099) or semi-voiced (U+309A)
    /// sound mark into the single precomposed kana.
    /// </summary>
    public static readonly Dictionary<string, string> NORMALIZE_RULES_KANA_DAKUTEN = new()
    {
        // Hiragana
        ["う\u3099"] = "ゔ",
        ["か\u3099"] = "が", ["き\u3099"] = "ぎ", ["く\u3099"] = "ぐ", ["け\u3099"] = "げ", ["こ\u3099"] = "ご",
        ["さ\u3099"] = "ざ", ["し\u3099"] = "じ", ["す\u3099"] = "ず", ["せ\u3099"] = "ぜ", ["そ\u3099"] = "ぞ",
        ["た\u3099"] = "だ", ["ち\u3099"] = "ぢ", ["つ\u3099"] = "づ", ["て\u3099"] = "で", ["と\u3099"] = "ど",
        ["は\u3099"] = "ば", ["ひ\u3099"] = "び", ["ふ\u3099"] = "ぶ", ["へ\u3099"] = "べ", ["ほ\u3099"] = "ぼ",
        ["は\u309A"] = "ぱ", ["ひ\u309A"] = "ぴ", ["ふ\u309A"] = "ぷ", ["へ\u309A"] = "ぺ", ["ほ\u309A"] = "ぽ",
        ["ゝ\u3099"] = "ゞ",
        // Katakana
        ["ウ\u3099"] = "ヴ",
        ["カ\u3099"] = "ガ", ["キ\u3099"] = "ギ", ["ク\u3099"] = "グ", ["ケ\u3099"] = "ゲ", ["コ\u3099"] = "ゴ",
        ["サ\u3099"] = "ザ", ["シ\u3099"] = "ジ", ["ス\u3099"] = "ズ", ["セ\u3099"] = "ゼ", ["ソ\u3099"] = "ゾ",
        ["タ\u3099"] = "ダ", ["チ\u3099"] = "ヂ", ["ツ\u3099"] = "ヅ", ["テ\u3099"] = "デ", ["ト\u3099"] = "ド",
        ["ハ\u3099"] = "バ", ["ヒ\u3099"] = "ビ", ["フ\u3099"] = "ブ", ["ヘ\u3099"] = "ベ", ["ホ\u3099"] = "ボ",
        ["ハ\u309A"] = "パ", ["ヒ\u309A"] = "ピ", ["フ\u309A"] = "プ", ["ヘ\u309A"] = "ペ", ["ホ\u309A"] = "ポ",
        ["ワ\u3099"] = "ヷ", ["ヰ\u3099"] = "ヸ", ["ヱ\u3099"] = "ヹ", ["ヲ\u3099"] = "ヺ",
        ["ヽ\u3099"] = "ヾ",
    };

    /// <summary><see cref="NORMALIZE_RULES_KANA_DAKUTEN"/> as (From, To) arrays, computed once at type initialization.</summary>
    public static readonly IEnumerable<(int[] From, int[] To)> NORMALIZE_RULES_KANA_DAKUTEN_CODEPOINTS = ToCodePointPairs(NORMALIZE_RULES_KANA_DAKUTEN);

    /// <summary>Normalizer that applies <see cref="NORMALIZE_RULES_KANA_DAKUTEN"/>.</summary>
    public static readonly Normalizer NormalizeKanaDakuten = CreateNormalizer(NORMALIZE_RULES_KANA_DAKUTEN);
}