76 lines
3.4 KiB
Python
76 lines
3.4 KiB
Python
from moviepy.editor import AudioFileClip
|
|
import whisper
|
|
import os
|
|
import json
|
|
import torchaudio
|
|
import librosa
|
|
import torch
|
|
import argparse
|
|
# Folder holding the denoised long recordings, named "<character>_<code>.wav".
parent_dir = "./denoised_audio/"
# Root folder that receives one sub-folder of segmented clips per character.
SEGMENT_DIR = "./segmented_character_voice/"
# Output file with one "path|speaker|text" annotation line per saved clip.
ANNOTATION_FILE = "./long_character_anno.txt"
# Language-tag tables selectable through --languages; the keys of each inner
# dict are compared against the language code reported by Whisper.
LANGUAGE_TOKENS = {
    "CJE": {"zh": "[ZH]", "ja": "[JA]", "en": "[EN]"},
    "CJ": {"zh": "[ZH]", "ja": "[JA]"},
    "C": {"zh": "[ZH]"},
}

# Only the files directly inside parent_dir are processed.  The default passed
# to next() yields an empty list when the folder does not exist, instead of
# the IndexError raised by indexing an empty os.walk() result.
filelist = next(os.walk(parent_dir), (parent_dir, [], []))[2]


def split_character_and_code(filename):
    """Split ``<character>_<code>[_...].wav`` into ``(character, code)``.

    Only a literal ``.wav`` suffix is removed.  ``str.rstrip(".wav")`` is not
    used because it strips any trailing run of the characters ``.``, ``w``,
    ``a`` and ``v`` and therefore corrupts names such as ``eva_va.wav``.

    Args:
        filename: Base name of an audio file.

    Returns:
        A ``(character_name, code)`` tuple, or ``None`` when the name contains
        no underscore and therefore cannot be split.
    """
    stem = filename[:-len(".wav")] if filename.endswith(".wav") else filename
    parts = stem.split("_")
    if len(parts) < 2:
        return None
    return parts[0], parts[1]


def build_annotation(save_path, character_name, lang_token, text):
    """Return one newline-terminated ``path|speaker|[LANG]text[LANG]`` line.

    Newlines inside ``text`` are removed so each annotation stays on one line.
    """
    tagged = lang_token + text.replace("\n", "") + lang_token
    return save_path + "|" + character_name + "|" + tagged + "\n"


def main():
    """Transcribe every file in ``parent_dir`` and cut it into segments.

    Each recording is transcribed with Whisper, split at the returned segment
    boundaries, and every segment is saved below ``SEGMENT_DIR`` in a folder
    named after the character.  One annotation line per saved segment is
    written to ``ANNOTATION_FILE``.

    Raises:
        RuntimeError: If no CUDA device is available.
    """
    parser = argparse.ArgumentParser()
    # choices= rejects unknown values up front; previously an unknown value
    # left lang2token undefined and failed with NameError much later.
    parser.add_argument("--languages", default="CJE", choices=list(LANGUAGE_TOKENS))
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
    lang2token = LANGUAGE_TOKENS[args.languages]

    # Raised explicitly because an assert would be stripped under "python -O".
    if not torch.cuda.is_available():
        raise RuntimeError("Please enable GPU in order to run Whisper!")

    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    model = whisper.load_model(args.whisper_size)

    speaker_annos = []
    for file in filelist:
        # Validate the name before the expensive transcription step.
        parsed = split_character_and_code(file)
        if parsed is None:
            print(f"{file} is not named <character>_<code>.wav, ignoring...\n")
            continue
        character_name, code = parsed

        audio_path = parent_dir + file
        print(f"transcribing {audio_path}...\n")
        result = model.transcribe(
            audio_path,
            word_timestamps=True,
            task="transcribe",
            beam_size=5,
            best_of=5,
        )
        lang = result['language']
        if lang not in lang2token:
            print(f"{lang} not supported, ignoring...\n")
            continue

        # makedirs also creates SEGMENT_DIR itself when it is missing and does
        # not fail if the character folder already exists.
        character_dir = SEGMENT_DIR + character_name
        os.makedirs(character_dir, exist_ok=True)

        wav, sr = torchaudio.load(audio_path, frame_offset=0, num_frames=-1,
                                  normalize=True, channels_first=True)
        # Clips are saved at target_sr, so bring the samples to that rate
        # first; otherwise the saved files would carry a wrong sample rate.
        if sr != target_sr:
            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
            sr = target_sr

        # Segment audio based on the transcription's segment boundaries.
        for i, seg in enumerate(result['segments']):
            wav_seg = wav[:, int(seg['start'] * sr):int(seg['end'] * sr)]
            # A segment whose start and end fall on the same sample index
            # holds no audio; skip it rather than save an empty clip.
            if wav_seg.shape[-1] == 0:
                continue
            wav_seg_name = f"{character_name}_{code}_{i}.wav"
            savepth = character_dir + "/" + wav_seg_name
            speaker_annos.append(
                build_annotation(savepth, character_name, lang2token[lang], seg['text'])
            )
            print(f"Transcribed segment: {speaker_annos[-1]}")
            torchaudio.save(savepth, wav_seg, target_sr, channels_first=True)

    if not speaker_annos:
        print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
        print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")

    # The annotation file is written even when empty.
    with open(ANNOTATION_FILE, 'w', encoding='utf-8') as f:
        f.writelines(speaker_annos)


if __name__ == "__main__":
    main()