122 lines
4.8 KiB
Python
122 lines
4.8 KiB
Python
import whisper
|
|
import os
|
|
import json
|
|
import torchaudio
|
|
import argparse
|
|
import torch
|
|
|
|
lang2token = {
|
|
'zh': "[ZH]",
|
|
'ja': "[JA]",
|
|
"en": "[EN]",
|
|
}
|
|
def transcribe_one(audio_path):
|
|
# load audio and pad/trim it to fit 30 seconds
|
|
audio = whisper.load_audio(audio_path)
|
|
audio = whisper.pad_or_trim(audio)
|
|
|
|
# make log-Mel spectrogram and move to the same device as the model
|
|
mel = whisper.log_mel_spectrogram(audio).to(model.device)
|
|
|
|
# detect the spoken language
|
|
_, probs = model.detect_language(mel)
|
|
print(f"Detected language: {max(probs, key=probs.get)}")
|
|
lang = max(probs, key=probs.get)
|
|
# decode the audio
|
|
options = whisper.DecodingOptions(beam_size=5)
|
|
result = whisper.decode(model, mel, options)
|
|
|
|
# print the recognized text
|
|
print(result.text)
|
|
return lang, result.text
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--languages", default="CJE")
|
|
parser.add_argument("--whisper_size", default="medium")
|
|
args = parser.parse_args()
|
|
if args.languages == "CJE":
|
|
lang2token = {
|
|
'zh': "[ZH]",
|
|
'ja': "[JA]",
|
|
"en": "[EN]",
|
|
}
|
|
elif args.languages == "CJ":
|
|
lang2token = {
|
|
'zh': "[ZH]",
|
|
'ja': "[JA]",
|
|
}
|
|
elif args.languages == "C":
|
|
lang2token = {
|
|
'zh': "[ZH]",
|
|
}
|
|
assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
|
|
model = whisper.load_model(args.whisper_size)
|
|
parent_dir = "./custom_character_voice/"
|
|
speaker_names = list(os.walk(parent_dir))[0][1]
|
|
speaker_annos = []
|
|
total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
|
|
# resample audios
|
|
# 2023/4/21: Get the target sampling rate
|
|
with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
|
|
hps = json.load(f)
|
|
target_sr = hps['data']['sampling_rate']
|
|
processed_files = 0
|
|
for speaker in speaker_names:
|
|
for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
|
|
# try to load file as audio
|
|
if wavfile.startswith("processed_"):
|
|
continue
|
|
try:
|
|
wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
|
|
channels_first=True)
|
|
wav = wav.mean(dim=0).unsqueeze(0)
|
|
if sr != target_sr:
|
|
wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
|
|
if wav.shape[1] / sr > 20:
|
|
print(f"{wavfile} too long, ignoring\n")
|
|
save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
|
|
torchaudio.save(save_path, wav, target_sr, channels_first=True)
|
|
# transcribe text
|
|
lang, text = transcribe_one(save_path)
|
|
if lang not in list(lang2token.keys()):
|
|
print(f"{lang} not supported, ignoring\n")
|
|
continue
|
|
text = lang2token[lang] + text + lang2token[lang] + "\n"
|
|
speaker_annos.append(save_path + "|" + speaker + "|" + text)
|
|
|
|
processed_files += 1
|
|
print(f"Processed: {processed_files}/{total_files}")
|
|
except:
|
|
continue
|
|
|
|
# # clean annotation
|
|
# import argparse
|
|
# import text
|
|
# from utils import load_filepaths_and_text
|
|
# for i, line in enumerate(speaker_annos):
|
|
# path, sid, txt = line.split("|")
|
|
# cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
|
|
# cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
|
|
# speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
|
|
# write into annotation
|
|
if len(speaker_annos) == 0:
|
|
print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
|
|
print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
|
|
with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
|
|
for line in speaker_annos:
|
|
f.write(line)
|
|
|
|
# import json
|
|
# # generate new config
|
|
# with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
|
|
# hps = json.load(f)
|
|
# # modify n_speakers
|
|
# hps['data']["n_speakers"] = 1000 + len(speaker2id)
|
|
# # add speaker names
|
|
# for speaker in speaker_names:
|
|
# hps['speakers'][speaker] = speaker2id[speaker]
|
|
# # save modified config
|
|
# with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
|
|
# json.dump(hps, f, indent=2)
|
|
# print("finished")
|