using System.Runtime.InteropServices;
using MaigoLabs.NeedLe.Common;
using MaigoLabs.NeedLe.Common.Extensions;
using MeCab;
using MeCab.Core;

namespace MaigoLabs.NeedLe.Indexer.Japanese;

/// <summary>
/// The transcriptions found for one phrase, i.e. one contiguous range of the input code points.
/// </summary>
public class Transcription
{
    /// <summary>Index of the first code point of the phrase within the input.</summary>
    public required int Start { get; set; }

    /// <summary>Number of code points covered by the phrase.</summary>
    public required int Length { get; set; }

    /// <summary>The transcriptions recorded for the phrase.</summary>
    public required string[] Transcriptions { get; set; }
}

/// <summary>Enumerates the transcriptions of every phrase found in <paramref name="codePoints"/>.</summary>
public delegate IEnumerable<Transcription> TranscriptionEnumerator(ReadOnlyMemory<int> codePoints);

/// <summary>
/// Decides whether the range of <paramref name="length"/> code points beginning at
/// <paramref name="start"/> should be looked up as a phrase.
/// </summary>
public delegate bool IsValidPhraseDelegate(ReadOnlyMemory<int> codePoints, int start, int length);

/// <summary>Returns every known transcription of <paramref name="phrase"/>.</summary>
public delegate HashSet<string> GetAllTranscriptionsDelegate(string phrase);

/// <summary>
/// Looks up kana readings in MeCab dictionaries and builds enumerators that yield the
/// atomic kana or romaji transcriptions of every phrase in a piece of text.
/// </summary>
public class TranscriptionProvider
{
    /// <summary>The dictionaries consulted, in order, by <see cref="GetAllKanaReadings"/>.</summary>
    public MeCabDictionary[] Dictionaries { get; set; }

    /// <summary>
    /// Creates a provider over <paramref name="dictionaries"/>. When none are supplied, the
    /// <c>sys.dic</c> file in the dictionary directory reported by <see cref="MeCabParam"/>
    /// (after <c>LoadDicRC</c>) is opened and used as the only dictionary.
    /// </summary>
    /// <param name="dictionaries">Dictionaries to search, or <c>null</c> to open the default one.</param>
    public TranscriptionProvider(MeCabDictionary[]? dictionaries = null)
    {
        if (dictionaries == null)
        {
            var param = new MeCabParam();
            param.LoadDicRC();
            var dictionary = new MeCabDictionary();
            dictionary.Open(Path.Combine(param.DicDir, "sys.dic"));
            dictionaries = [dictionary];
        }

        Dictionaries = dictionaries;
    }

    /// <summary>
    /// Builds an enumerator that visits every sub-range of its input, shortest phrases first,
    /// and records the atomic transcriptions of each range accepted by <paramref name="isValidPhrase"/>.
    /// </summary>
    /// <param name="isValidPhrase">Filter deciding which ranges are looked up at all.</param>
    /// <param name="getAllTranscriptions">Source of candidate transcriptions for a phrase.</param>
    /// <returns>
    /// An enumerator yielding one <see cref="Transcription"/> per range that has at least one
    /// atomic transcription.
    /// </returns>
    public static TranscriptionEnumerator CreateTranscriptionEnumerator(
        IsValidPhraseDelegate isValidPhrase,
        GetAllTranscriptionsDelegate getAllTranscriptions) => codePoints =>
    {
        var resultMap = new Dictionary<(int Start, int Length), Transcription>();

        // Shorter phrases are processed first so that, when a longer phrase is examined,
        // the results for all of its sub-phrases are already present in resultMap.
        for (int phraseLength = 1; phraseLength <= codePoints.Length; phraseLength++)
        {
            for (int start = 0; start + phraseLength <= codePoints.Length; start++)
            {
                if (!isValidPhrase(codePoints, start, phraseLength))
                {
                    continue;
                }

                var phrase = MemoryMarshal.ToEnumerable(codePoints.Slice(start, phraseLength)).ToUtf32String();
                var atomicTranscriptions = getAllTranscriptions(phrase)
                    .Where(transcription => transcription != null)
                    .Where(candidateTranscription => IsAtomicTranscription(candidateTranscription, start, phraseLength, resultMap))
                    .ToArray();

                if (atomicTranscriptions.Length > 0)
                {
                    resultMap[(start, phraseLength)] = new()
                    {
                        Start = start,
                        Length = phraseLength,
                        Transcriptions = atomicTranscriptions,
                    };
                }
            }
        }

        return resultMap.Values;
    };

    /// <summary>
    /// Ensures the transcription is atomic (not a combination of multiple shorter transcriptions,
    /// separated by any midpoints).
    /// </summary>
    /// <remarks>
    /// Runs a breadth-first search over (phrase position, transcription position) states. A step
    /// consumes a sub-phrase already stored in <paramref name="resultMap"/> together with one of
    /// its transcriptions, provided that transcription matches the candidate at the current
    /// position. The entry for the phrase itself is not stored until after this check, so only
    /// strictly shorter pieces can take part.
    /// </remarks>
    /// <param name="candidateTranscription">Transcription of the whole phrase being tested.</param>
    /// <param name="start">Index of the phrase's first code point within the input.</param>
    /// <param name="phraseLength">Number of code points in the phrase.</param>
    /// <param name="resultMap">Results recorded so far, keyed by (start, length).</param>
    /// <returns>
    /// <c>false</c> when the candidate is empty or can be assembled exactly from recorded
    /// sub-phrase transcriptions; otherwise <c>true</c>.
    /// </returns>
    private static bool IsAtomicTranscription(
        string candidateTranscription,
        int start,
        int phraseLength,
        Dictionary<(int Start, int Length), Transcription> resultMap)
    {
        if (candidateTranscription.Length == 0)
        {
            return false;
        }

        var visitedStates = new HashSet<(int PhrasePosition, int TranscriptionPosition)>();
        var queue = new Queue<(int PhrasePosition, int TranscriptionPosition)>();
        queue.Enqueue((0, 0));

        while (queue.Count > 0)
        {
            var (phrasePosition, transcriptionPosition) = queue.Dequeue();
            for (int prefixLength = 1; prefixLength <= phraseLength - phrasePosition; prefixLength++)
            {
                if (!resultMap.TryGetValue((start + phrasePosition, prefixLength), out var prefixResult))
                {
                    continue;
                }

                foreach (var transcription in prefixResult.Transcriptions)
                {
                    // Ordinal comparison: transcriptions are machine-generated keys, so the match
                    // must not depend on the current culture's collation rules.
                    if (string.CompareOrdinal(candidateTranscription, transcriptionPosition, transcription, 0, transcription.Length) != 0)
                    {
                        continue;
                    }

                    var nextState = (
                        PhrasePosition: phrasePosition + prefixLength,
                        TranscriptionPosition: transcriptionPosition + transcription.Length);

                    if (nextState.PhrasePosition == phraseLength && nextState.TranscriptionPosition == candidateTranscription.Length)
                    {
                        return false; // Found a valid combination
                    }

                    // Add returns false for a state that was already seen, so each state is queued once.
                    if (visitedStates.Add(nextState))
                    {
                        queue.Enqueue(nextState);
                    }
                }
            }
        }

        return true;
    }

    /// <summary>
    /// Collects the katakana readings of <paramref name="phrase"/>: the phrase itself converted to
    /// katakana when it consists only of kana, plus the readings of every exact dictionary match.
    /// </summary>
    /// <param name="phrase">The phrase to look up.</param>
    /// <returns>The set of readings found; empty when there are none.</returns>
    public HashSet<string> GetAllKanaReadings(string phrase)
    {
        var result = new HashSet<string>();

        var isKana = phrase.All(ch => JapaneseUtils.IsKana(ch));
        if (isKana)
        {
            result.Add(CommonNormalization.ToKatakana(phrase));
        }

        // A single kana character is not looked up in the dictionaries.
        if (isKana && phrase.Length == 1)
        {
            return result;
        }

        foreach (var dictionary in Dictionaries)
        {
            var searchResult = dictionary.ExactMatchSearch(phrase);
            if (searchResult.Value == -1)
            {
                continue;
            }

            var tokens = dictionary.GetToken(searchResult);
            foreach (var token in tokens)
            {
                var feature = dictionary.GetFeature(token.Feature);
                var parts = feature.Split(',');

                // NOTE(review): index 7 is presumably the reading column of an IPADIC-style
                // feature string — confirm against the dictionary format in use.
                if (parts.Length > 7)
                {
                    result.Add(CommonNormalization.ToKatakana(parts[7]));
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Calls <see cref="GetAllKanaReadings"/> on <paramref name="phrase"/> after passing it through
    /// <c>JapaneseNormalization.NormalizeKanaDakuten</c> and then <c>JapaneseUtils.StripJapaneseSoundMarks</c>.
    /// </summary>
    public HashSet<string> GetAllKanaReadingsWithNormalization(string phrase) =>
        GetAllKanaReadings(JapaneseUtils.StripJapaneseSoundMarks(JapaneseNormalization.NormalizeKanaDakuten(phrase)));

    /// <summary>
    /// Enumerator over Japanese phrases whose transcriptions are the normalized kana readings.
    /// A new enumerator is created on every access.
    /// </summary>
    public TranscriptionEnumerator EnumerateKanaTranscriptions =>
        CreateTranscriptionEnumerator(
            JapaneseUtils.IsValidJapanesePhrase,
            GetAllKanaReadingsWithNormalization);

    /// <summary>
    /// Enumerator over Japanese phrases whose transcriptions are the normalized kana readings
    /// converted with <c>JapaneseUtils.ToRomajiStrictly</c> and then
    /// <c>JapaneseNormalization.NormalizeRomaji</c>. A new enumerator is created on every access.
    /// </summary>
    public TranscriptionEnumerator EnumerateRomajiTranscriptions =>
        CreateTranscriptionEnumerator(
            JapaneseUtils.IsValidJapanesePhrase,
            phrase => [.. GetAllKanaReadingsWithNormalization(phrase).Select(kana => JapaneseNormalization.NormalizeRomaji(JapaneseUtils.ToRomajiStrictly(kana)))]);
}