65 lines
2.4 KiB
Python
Executable File
65 lines
2.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
import argparse
|
|
from pathlib import Path
|
|
from faster_whisper import WhisperModel, BatchedInferencePipeline
|
|
# import nemo.collections.asr as nemo_asr
|
|
# asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/parakeet-tdt-1.1b")
|
|
# import nemo.collections.asr.models.rnnt_bpe_models.EncDecRNNTBPEModel
|
|
# asr_model.transcribe
|
|
|
|
|
|
# model_name = 'deepdml/faster-whisper-large-v3-turbo-ct2'
|
|
model_name = 'large-v3'
|
|
# model_name = 'medium.en'
|
|
m = WhisperModel(model_name, device="cuda", compute_type="float16")
|
|
model = BatchedInferencePipeline(model=m)
|
|
|
|
|
|
def format_time(seconds):
|
|
minutes, seconds = divmod(seconds, 60)
|
|
hours, minutes = divmod(minutes, 60)
|
|
milliseconds = (seconds - int(seconds)) * 1000
|
|
return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d},{int(milliseconds):03d}"
|
|
|
|
|
|
def transcribe(input_file: Path, lang: str):
|
|
ouf = input_file.with_suffix('.srt')
|
|
if ouf.exists():
|
|
print(f"Output file {ouf} already exists. Skipping transcription.")
|
|
return
|
|
|
|
# Remove task="translate" if you want the original language
|
|
segments, info = model.transcribe(input_file, beam_size=1, batch_size=8,
|
|
# chunk_length=10,
|
|
without_timestamps=False,
|
|
task="transcribe", vad_filter=True, language=lang)
|
|
|
|
print(f"Transcribing file {input_file}")
|
|
print(f"Detected language '{info.language}' with probability {info.language_probability:.2f}")
|
|
|
|
# with ouf.open('w', encoding='utf-8') as srt_file:
|
|
out = ""
|
|
for seg in segments:
|
|
start_time = format_time(seg.start)
|
|
end_time = format_time(seg.end)
|
|
line_out = f"{seg.id + 1}\n{start_time} --> {end_time}\n{seg.text.lstrip()}\n\n"
|
|
print(line_out)
|
|
out += line_out
|
|
|
|
ouf.write_text(out)
|
|
print(f"Transcription saved to {ouf}")
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description="Transcribe audio from a video file and generate an SRT file.")
|
|
# parser.add_argument("input_file", help="Path to the video file for transcription")
|
|
parser.add_argument("input_file", nargs="+", help="Path to the video file for transcription")
|
|
parser.add_argument("-l", "--lang", default=None, help="Language code for transcription (e.g. 'en')")
|
|
args = parser.parse_args()
|
|
for file in args.input_file:
|
|
transcribe(Path(file), args.lang)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|