Files
needLe/dotnet/MaigoLabs.NeedLe.Common/CommonNormalization.cs
T
2026-01-01 03:40:41 +08:00

46 lines
2.9 KiB
C#
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
namespace MaigoLabs.NeedLe.Common;
/// <summary>
/// Global normalization helpers applied to any input and to documents.
/// All methods work on individual Unicode code points passed as <see cref="int"/>.
/// </summary>
public static class CommonNormalization
{
    /// <summary>
    /// Halfwidth katakana (U+FF66..U+FF9D) to fullwidth katakana.
    /// Written with numeric code points on purpose: the halfwidth and fullwidth
    /// glyphs are easily confused and may be folded together by editors or
    /// normalization tools, which would silently break a literal-based table.
    /// </summary>
    private static readonly Dictionary<int, int> HalfToFullKana = new Dictionary<int, int>
    {
        // wo, small a / i / u / e / o
        [0xFF66] = 0x30F2, [0xFF67] = 0x30A1, [0xFF68] = 0x30A3, [0xFF69] = 0x30A5, [0xFF6A] = 0x30A7, [0xFF6B] = 0x30A9,
        // small ya / yu / yo, small tsu
        [0xFF6C] = 0x30E3, [0xFF6D] = 0x30E5, [0xFF6E] = 0x30E7, [0xFF6F] = 0x30C3,
        // prolonged sound mark
        [0xFF70] = 0x30FC,
        // a i u e o
        [0xFF71] = 0x30A2, [0xFF72] = 0x30A4, [0xFF73] = 0x30A6, [0xFF74] = 0x30A8, [0xFF75] = 0x30AA,
        // ka ki ku ke ko
        [0xFF76] = 0x30AB, [0xFF77] = 0x30AD, [0xFF78] = 0x30AF, [0xFF79] = 0x30B1, [0xFF7A] = 0x30B3,
        // sa shi su se so
        [0xFF7B] = 0x30B5, [0xFF7C] = 0x30B7, [0xFF7D] = 0x30B9, [0xFF7E] = 0x30BB, [0xFF7F] = 0x30BD,
        // ta chi tsu te to
        [0xFF80] = 0x30BF, [0xFF81] = 0x30C1, [0xFF82] = 0x30C4, [0xFF83] = 0x30C6, [0xFF84] = 0x30C8,
        // na ni nu ne no
        [0xFF85] = 0x30CA, [0xFF86] = 0x30CB, [0xFF87] = 0x30CC, [0xFF88] = 0x30CD, [0xFF89] = 0x30CE,
        // ha hi fu he ho
        [0xFF8A] = 0x30CF, [0xFF8B] = 0x30D2, [0xFF8C] = 0x30D5, [0xFF8D] = 0x30D8, [0xFF8E] = 0x30DB,
        // ma mi mu me mo
        [0xFF8F] = 0x30DE, [0xFF90] = 0x30DF, [0xFF91] = 0x30E0, [0xFF92] = 0x30E1, [0xFF93] = 0x30E2,
        // ya yu yo
        [0xFF94] = 0x30E4, [0xFF95] = 0x30E6, [0xFF96] = 0x30E8,
        // ra ri ru re ro
        [0xFF97] = 0x30E9, [0xFF98] = 0x30EA, [0xFF99] = 0x30EB, [0xFF9A] = 0x30EC, [0xFF9B] = 0x30ED,
        // wa, n
        [0xFF9C] = 0x30EF, [0xFF9D] = 0x30F3,
    };

    /// <summary>
    /// Normalizes a single code point: folds fullwidth ASCII and the fullwidth
    /// space to their halfwidth forms, folds halfwidth kana and halfwidth
    /// Japanese punctuation to their fullwidth forms, maps the standalone
    /// (semi-)voiced sound marks to their combining forms, and lowercases
    /// ASCII letters.
    /// </summary>
    /// <param name="codePoint">The code point to normalize.</param>
    /// <returns>The normalized code point, or <paramref name="codePoint"/> itself when no rule applies.</returns>
    public static int NormalizeCodePoint(int codePoint) => codePoint switch
    {
        // Fullwidth ASCII (U+FF01..U+FF5E) -> halfwidth ASCII, then lowercased.
        >= 0xFF01 and <= 0xFF5E => ToLowerCaseAscii(codePoint - 0xFEE0),
        // U+3000 (fullwidth space) -> U+0020 (halfwidth space).
        0x3000 => 0x0020,
        // Halfwidth kana (U+FF66..U+FF9D) -> fullwidth kana; unchanged if not in the table.
        >= 0xFF66 and <= 0xFF9D => HalfToFullKana.TryGetValue(codePoint, out var fullwidth) ? fullwidth : codePoint,
        // Halfwidth Japanese punctuation -> fullwidth counterparts.
        0xFF61 => 0x3002, // ideographic full stop
        0xFF62 => 0x300C, // left corner bracket
        0xFF63 => 0x300D, // right corner bracket
        0xFF64 => 0x3001, // ideographic comma
        0xFF65 => 0x30FB, // katakana middle dot
        // U+FF9E (halfwidth) or U+309B (spacing) -> COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
        0xFF9E or 0x309B => 0x3099,
        // U+FF9F (halfwidth) or U+309C (spacing) -> COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
        0xFF9F or 0x309C => 0x309A,
        // Everything else: only ASCII A-Z is lowercased.
        _ => ToLowerCaseAscii(codePoint),
    };

    /// <summary>Lowercases ASCII <c>A</c>-<c>Z</c> (U+0041..U+005A); any other code point is returned unchanged.</summary>
    /// <param name="codePoint">The code point to lowercase.</param>
    /// <returns>The lowercase ASCII letter, or <paramref name="codePoint"/> unchanged.</returns>
    public static int ToLowerCaseAscii(int codePoint) =>
        codePoint >= 0x41 && codePoint <= 0x5A ? codePoint + 0x20 : codePoint;

    /// <summary>
    /// Returns whether the code point lies in U+3041..U+3096 or U+309D..U+309E,
    /// the ranges that <see cref="ToKatakana(int)"/> shifts to katakana.
    /// </summary>
    /// <param name="codePoint">The code point to test.</param>
    /// <returns><c>true</c> when the code point is inside one of the two ranges.</returns>
    public static bool IsHiraganaRange(int codePoint) =>
        (codePoint >= 0x3041 && codePoint <= 0x3096) || (codePoint >= 0x309D && codePoint <= 0x309E);

    /// <summary>
    /// Shifts a code point accepted by <see cref="IsHiraganaRange"/> up by 0x60,
    /// the offset between the hiragana and katakana blocks; other code points are returned unchanged.
    /// </summary>
    /// <param name="codePoint">The code point to convert.</param>
    /// <returns>The shifted code point, or <paramref name="codePoint"/> unchanged.</returns>
    public static int ToKatakana(int codePoint) =>
        IsHiraganaRange(codePoint) ? codePoint + 0x60 : codePoint;

    /// <summary>
    /// Applies <see cref="ToKatakana(int)"/> to every UTF-16 code unit of <paramref name="text"/>.
    /// </summary>
    /// <param name="text">The text to convert.</param>
    /// <returns>A new string in which each convertible character has been shifted; all other characters are kept.</returns>
    public static string ToKatakana(string text) =>
        string.Concat(text.Select(c => (char)ToKatakana(c)));
}