86 Commits

Author SHA1 Message Date
Plachtaa da3670a2d8 Merge pull request #49 from BushyToaster88/fix-torch-stft-error-on-gpus-sm-53
fix-torch-stft-error-on-gpus-sm-53
2023-03-03 14:10:23 +08:00
Exulan b92361b99f fix-torch-stft-error-on-gpus-sm-53
This pull request addresses an issue that arises when executing the finetune_speaker_v2.py script on GPUs with compute capability lower than SM_53, which do not support half-precision kernels. The error occurs at line 104 of mel_processing.py, where torch.stft() is called on a half-precision input. To fix this, I updated the data type to float.
2023-03-03 16:17:28 +11:00
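A minimal sketch of the kind of cast described above (illustrative only; the actual one-line change appears in the mel_processing.py diff further down, and this mirrors the pre-2.0 torch.stft API used by the repo):

```python
import torch

def spectrogram_compatible(y, n_fft, hop_size, win_size, window):
    # GPUs below compute capability 5.3 lack half-precision kernels,
    # so cast a possibly-fp16 signal (and window) to float32 before the STFT.
    spec = torch.stft(y.float(), n_fft, hop_length=hop_size, win_length=win_size,
                      window=window.float(), center=False, pad_mode='reflect',
                      normalized=False, onesided=True)
    # pre-torch-2.0 real STFT output: last dim holds (real, imag) pairs
    return torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
```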
Plachtaa 8137fb9e06 Merge pull request #44 from justinjohn0306/main
Use gloo backend on Windows for Pytorch
2023-03-02 16:50:40 +08:00
Justin John 04b5e8e68c Merge branch 'Plachtaa:main' into main 2023-03-02 13:51:48 +05:30
Plachta 6addfeb97f fixed single speaker error 2023-03-02 14:42:25 +08:00
Plachta 13350512c7 updated error messages 2023-03-02 13:36:47 +08:00
Plachta 610f4a8b02 updated error messages 2023-03-02 13:34:32 +08:00
Plachta 03c9cd3ccb updated pure chinese option 2023-03-01 17:50:11 +08:00
Plachta 9f8982181d updated pure chinese option 2023-03-01 17:11:58 +08:00
Plachta 89eb6f5a83 Merge remote-tracking branch 'origin/main' 2023-03-01 16:59:07 +08:00
Plachta 952ce4b3a3 updated chinese cleaners 2023-03-01 16:58:57 +08:00
Plachtaa 86a60660bb Update DATA_EN.MD 2023-03-01 10:18:43 +08:00
Plachta e90e1b86f1 updated pipeline 2023-02-27 22:05:28 +08:00
Justin John 3482b9ce55 downgrade librosa 2023-02-27 11:30:34 +05:30
Justin John 730365fcde Use gloo backend on Windows for Pytorch 2023-02-27 10:36:35 +05:30
Plachta bf7042e454 updated pipeline 2023-02-27 00:39:32 +08:00
Plachta 03e3894fe4 updated pipeline 2023-02-27 00:22:00 +08:00
Plachta 4b2ea54c96 updated pipeline 2023-02-26 23:42:15 +08:00
Plachta 9c471d14ac updated pipeline 2023-02-26 22:15:01 +08:00
Plachta d212e70381 updated pipeline 2023-02-26 21:14:42 +08:00
Plachta e15159ba55 updated pipeline 2023-02-26 21:13:55 +08:00
Plachta ce418b1888 updated pipeline 2023-02-26 20:31:49 +08:00
Plachta 7d5b2f8547 updated pipeline 2023-02-26 20:31:00 +08:00
Plachta 7a1a8216d0 updated pipeline 2023-02-26 20:30:31 +08:00
Plachta 1a873a6f83 updated pipeline 2023-02-26 20:23:53 +08:00
Plachta 46cc84aec2 updated pipeline 2023-02-26 20:22:50 +08:00
Plachta 626ace0132 updated pipeline 2023-02-26 20:09:59 +08:00
Plachta 7d5c196cc7 updated pipeline 2023-02-26 20:08:37 +08:00
Plachta 6230f6e5b3 Merge remote-tracking branch 'origin/main'
# Conflicts:
#	README_ZH.md
2023-02-26 20:06:49 +08:00
Plachta 5b228b39be updated pipeline 2023-02-26 20:06:31 +08:00
Plachtaa b88092dd3a Update README_ZH.md 2023-02-25 18:28:49 +08:00
Plachtaa 8370ea7f35 Update README_ZH.md 2023-02-25 10:13:21 +08:00
Plachtaa b18bc53b12 Update README.md 2023-02-25 10:13:01 +08:00
Plachta cfa4cc9878 upload files 2023-02-23 17:15:14 +08:00
Plachta 4f09046710 upload files 2023-02-23 17:11:09 +08:00
Plachta 60bd93a26f upload files 2023-02-23 15:02:32 +08:00
Plachta 1bae830002 upload files 2023-02-23 14:57:46 +08:00
Plachta 6ed2e432fb upload files 2023-02-23 14:53:04 +08:00
Plachta 649919cca3 Merge remote-tracking branch 'origin/main' 2023-02-23 14:32:06 +08:00
Plachta 66d4920e69 upload files 2023-02-23 14:31:34 +08:00
Plachtaa a89610a6e4 Update preprocess.py 2023-02-21 17:21:57 +08:00
Plachta 98f6e845b6 upload files 2023-02-20 10:27:46 +08:00
Plachta 883eb06769 upload files 2023-02-20 00:54:20 +08:00
Plachta 20d91c6249 Merge remote-tracking branch 'origin/main' 2023-02-19 22:26:50 +08:00
Plachta 674ddf346a upload files 2023-02-19 22:26:21 +08:00
Plachta fc838f7e5f upload files 2023-02-19 22:25:37 +08:00
Plachtaa 95623e77a3 Update README.md 2023-02-19 18:17:39 +08:00
Plachtaa fce6ee58a5 Update README_EN.md 2023-02-19 18:17:24 +08:00
Plachtaa f5b2bcbaa4 Update README_ZH.md 2023-02-19 18:17:07 +08:00
Plachtaa 78fb90d9fc Update README_EN.md 2023-02-19 00:50:06 +08:00
Plachtaa 3ebc1befe5 Update README.md 2023-02-19 00:49:46 +08:00
Plachta 9cb391af38 upload files 2023-02-19 00:48:13 +08:00
Plachta 030bde4914 upload files 2023-02-19 00:47:10 +08:00
Plachta 93d5e26434 upload files 2023-02-19 00:46:39 +08:00
Plachta 22bc07925c upload files 2023-02-18 18:31:52 +08:00
Plachta e8fa96308c upload files 2023-02-18 07:13:43 +08:00
Plachta 3f40898dd1 upload files 2023-02-17 10:21:34 +08:00
Plachta 56f0dadc47 upload files 2023-02-16 20:35:35 +08:00
Plachta da6736fbf7 upload files 2023-02-16 20:20:06 +08:00
Plachta cbd32eb8b5 upload files 2023-02-16 20:19:45 +08:00
Plachta cc342ddc71 upload files 2023-02-16 20:17:21 +08:00
Plachta 9c3087ccba upload files 2023-02-16 20:15:00 +08:00
Plachta 2358177995 upload files 2023-02-16 20:14:27 +08:00
Plachta 2fdb60911c upload files 2023-02-16 20:00:53 +08:00
Plachta 044d386682 upload files 2023-02-16 19:59:35 +08:00
Plachta f735bc3d80 upload files 2023-02-16 19:54:14 +08:00
Plachta 018a0aa0d8 upload files 2023-02-16 19:46:23 +08:00
Plachta 6e8d3255a1 upload files 2023-02-16 19:40:47 +08:00
Plachta e12928fac9 upload files 2023-02-16 18:50:02 +08:00
Plachta f2287032e8 upload files 2023-02-16 18:33:06 +08:00
Plachta a1b6eb54e4 upload files 2023-02-16 18:29:06 +08:00
Plachta bacf4dbdff upload files 2023-02-16 18:01:33 +08:00
Plachta e160e1532f upload files 2023-02-16 17:57:41 +08:00
Plachta 3a6bf5adbb upload files 2023-02-16 17:41:23 +08:00
Plachta eed75db80d upload files 2023-02-16 17:09:40 +08:00
Plachta c9b8a0a446 upload files 2023-02-16 16:39:34 +08:00
Plachta 5031a30f1e upload files 2023-02-16 16:13:36 +08:00
Plachta 611965cb43 upload files 2023-02-16 15:58:30 +08:00
Plachta c7074471b1 upload files 2023-02-16 15:58:00 +08:00
Plachta 4e99bbd3c9 upload files 2023-02-16 15:57:28 +08:00
Plachta d60c12e9e5 upload files 2023-02-16 15:56:03 +08:00
Plachta 8d0698261c upload files 2023-02-15 17:30:17 +08:00
Plachta 6986709683 upload files 2023-02-15 17:18:24 +08:00
Plachta 71721fb7fa upload files 2023-02-15 16:45:35 +08:00
Plachta f72d72da5d upload files 2023-02-15 16:44:01 +08:00
Plachta 6246f7718d upload files 2023-02-15 16:42:50 +08:00
31 changed files with 899 additions and 676 deletions
+42
@@ -0,0 +1,42 @@
The pipeline in this repository supports several ways of uploading voice samples; simply choose whichever one (or several) matches the data you have.
1. Short audio clips, organized by character name and packed into a single `.zip` file, whose structure should look like this:
```
Your-zip-file.zip
├───Character_name_1
├ ├───xxx.wav
├ ├───...
├ ├───yyy.mp3
├ └───zzz.wav
├───Character_name_2
├ ├───xxx.wav
├ ├───...
├ ├───yyy.mp3
├ └───zzz.wav
├───...
└───Character_name_n
├───xxx.wav
├───...
├───yyy.mp3
└───zzz.wav
```
Note that neither the format nor the names of the audio files matter, as long as they are audio files.
Quality requirement: 2 to 10 seconds long, with as little background noise as possible.
Quantity requirement: at least 10 clips per character, ideally 20+ per character.
2. Long audio files named after the character, each containing a single speaker only; background sound is removed automatically. Naming format: `{CharacterName}_{random_number}.wav`
(e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); the files must be `.wav`.
3. Long video files named after the character, each containing a single speaker only; background sound is removed automatically. Naming format: `{CharacterName}_{random_number}.mp4`
(e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`); the files must be `.mp4`.
Note: `CharacterName` must consist of English characters; `random_number` distinguishes multiple files of the same character and must be included; it can be any integer between 0 and 999999 (a validation sketch follows this list).
4. A `.txt` file containing multiple lines of `{CharacterName}|{video_url}`, formatted as follows:
```
Char1|https://xyz.com/video1/
Char2|https://xyz.com/video2/
Char2|https://xyz.com/video3/
Char3|https://xyz.com/video4/
```
Each video must contain a single speaker only; background sound is removed automatically. Currently only videos from bilibili are supported; URLs from other sites have not been tested.
If you have any questions about the formats, sample data for every format can be found [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
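For illustration, a minimal sketch of how the naming rule from options 2 and 3 could be checked; the regex and the `split_character_file` helper are hypothetical, not part of this repo:

```python
import re

# ASCII character name, 1-6 digit suffix (0-999999), .wav or .mp4 extension
NAME_RE = re.compile(r"^([A-Za-z]+)_(\d{1,6})\.(wav|mp4)$")

def split_character_file(filename):
    m = NAME_RE.match(filename)
    if m is None:
        raise ValueError(f"bad file name: {filename}")
    return m.group(1), int(m.group(2))  # e.g. ("Diana", 234135)
```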
+46
@@ -0,0 +1,46 @@
The pipeline of this repo supports multiple voice uploading options; you can choose one or more of them depending on the data you have.
1. Short audios packed by a single `.zip` file, whose file structure should be as shown below:
```
Your-zip-file.zip
├───Character_name_1
├ ├───xxx.wav
├ ├───...
├ ├───yyy.mp3
├ └───zzz.wav
├───Character_name_2
├ ├───xxx.wav
├ ├───...
├ ├───yyy.mp3
├ └───zzz.wav
├───...
└───Character_name_n
├───xxx.wav
├───...
├───yyy.mp3
└───zzz.wav
```
Note that the format of the audio files does not matter as long as they are audio files.
Quality requirement: >=2s, <=10s, contain as little background sound as possible.
Quantity requirement: at least 10 per character, 20+ per character is recommended.
2. Long audio files named by character names, each of which should contain a single character's voice only. Background sound is
acceptable, since it will be removed automatically. File name format: `{CharacterName}_{random_number}.wav`
(e.g. `Diana_234135.wav`, `MinatoAqua_234252.wav`); they must be `.wav` files.
3. Long video files named by character names, each of which should contain a single character's voice only. Background sound is
acceptable, since it will be removed automatically. File name format: `{CharacterName}_{random_number}.mp4`
(e.g. `Taffy_332452.mp4`, `Dingzhen_957315.mp4`); they must be `.mp4` files.
Note: `CharacterName` must be English characters only; `random_number` identifies multiple files for one character
and is compulsory to add. It can be any integer between 0 and 999999.
4. A `.txt` file containing multiple lines of `{CharacterName}|{video_url}`, formatted as follows (see the parsing sketch after this list):
```
Char1|https://xyz.com/video1/
Char2|https://xyz.com/video2/
Char2|https://xyz.com/video3/
Char3|https://xyz.com/video4/
```
Each video should contain a single speaker only. Currently only video links from bilibili are supported; other websites have not been tested.
If you have questions about the data format, you can find sample data for every format [here](https://drive.google.com/file/d/132l97zjanpoPY4daLgqXoM7HKXPRbS84/view?usp=sharing).
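As an illustration, one way such a links file could be read; `parse_speaker_links` is a hypothetical helper, not the repo's own downloader (which appears later in this changeset):

```python
def parse_speaker_links(path):
    """Parse `{CharacterName}|{video_url}` lines into (speaker, url) pairs."""
    pairs = []
    with open(path, encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue  # skip blank lines
            speaker, url = line.split("|", 1)
            pairs.append((speaker, url))
    return pairs
```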
+44 -26
@@ -1,40 +1,58 @@
[中文文档请点击这里](https://github.com/SongtingLiu/VITS_voice_conversion/blob/main/README_CN.md)
# VITS Voice Conversion
This repo will guide you to add your voice into an existing VITS TTS model
to make it a high-quality voice converter to all existing character voices in the model.
[中文文档请点击这里](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README_ZH.md)
# VITS Fast Fine-tuning
This repo will guide you to add your own character voices, or even your own voice, into existing VITS TTS model
to make it able to do the following tasks in less than 1 hour:
1. Many-to-many voice conversion between any characters you added & preset characters in the model.
2. English, Japanese & Chinese text-to-speech synthesis with the characters you added & preset characters.
Welcome to play around with the base models!
Chinese & English & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: Me
Chinese & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)
Welcome to play around with the base model, a Trilingual Anime VITS!
[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer)
### Currently Supported Tasks:
- [x] Convert user's voice to characters listed [here](https://github.com/SongtingLiu/VITS_voice_conversion/blob/main/configs/finetune_speaker.json)
- [x] Chinese, English, Japanese TTS with user's voice
- [ ] Chinese, English, Japanese TTS with custom characters...
- [x] Clone character voice from 10+ short audios
- [x] Clone character voice from long audio(s) >= 3 minutes (one audio should contain single speaker only)
- [x] Clone character voice from video(s) >= 3 minutes (one video should contain single speaker only)
- [x] Clone character voice from BILIBILI video links (one video should contain single speaker only)
### Currently Supported Characters for TTS & VC:
- [x] Umamusume Pretty Derby
- [x] Sanoba Witch
- [x] Genshin Impact
- [ ] Custom characters...
- [x] Umamusume Pretty Derby (Used as base model pretraining)
- [x] Sanoba Witch (Used as base model pretraining)
- [x] Genshin Impact (Used as base model pretraining)
- [x] Any character you wish as long as you have their voices!
## Fine-tuning
It's recommended to perform fine-tuning on [Google Colab](https://colab.research.google.com/drive/1omMhfYKrAAQ7a6zOCsyqpla-wU-QyfZn?usp=sharing)
It's recommended to perform fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
because the original VITS has some dependencies that are difficult to configure.
### How long does it take?
1. Install dependencies (2 min)
2. Record at least 10 clips of your own voice (5 min)
3. Fine-tune (30 min)
### How long does it take?
1. Install dependencies (3 min)
2. Choose a pretrained model to start from. The detailed differences between them are described in the [Colab Notebook](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing)
3. Upload the voice samples of the characters you wish to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD) for detailed uploading options.
4. Start fine-tuning. The time taken varies from 20 minutes to 2 hours, depending on the number of voices you uploaded.
## Inference or Usage
1. Install Python if you haven't done so (Python >= 3.7)
2. Clone this repo:
`git clone https://github.com/SongtingLiu/VITS_voice_conversion.git`
3. Install dependencies
`pip install -r requirements_infer.txt`
4. run VC_inference.py
`python VC_inference.py`
## Inference or Usage (currently supports Windows only)
0. Remember to download your fine-tuned model!
1. Download the latest release
2. Put your model & config file, named `G_latest.pth` and `finetune_speaker.json` respectively, into the folder `inference`.
3. The file structure should be as follows:
```
inference
├───inference.exe
├───...
├───finetune_speaker.json
└───G_latest.pth
```
4. Run `inference.exe`; the browser should open automatically.
## Use in MoeGoe
0. Prepare downloaded model & config file, which are named `G_latest.pth` and `moegoe_config.json`, respectively.
1. Follow the instructions on the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page to install it, configure the paths, and use the model.
-39
@@ -1,39 +0,0 @@
# VITS Voice Conversion
This repository guides you through fine-tuning your own voice into an existing VITS model, so that a single model can perform high-quality conversion from the user's voice to hundreds of character voices.
Feel free to try out the base model used for fine-tuning, a trilingual (Chinese, Japanese & English) TTS (text-to-speech) model!
[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer)
### Currently supported tasks:
- [x] Convert the user's voice to [these characters](https://github.com/SongtingLiu/VITS_voice_conversion/blob/main/configs/finetune_speaker.json)
- [ ] Chinese, English & Japanese TTS with custom characters (TODO)
### Characters currently supported for voice conversion and trilingual TTS
- [x] Umamusume Pretty Derby (implemented characters only)
- [x] Sanoba Witch (Yuzusoft) (5 characters)
- [x] Genshin Impact (implemented characters only)
- [ ] Any character (TODO)
## Fine-tuning
It is recommended to run fine-tuning on [Google Colab](https://colab.research.google.com/drive/1omMhfYKrAAQ7a6zOCsyqpla-wU-QyfZn?usp=sharing),
because some of VITS's multilingual dependencies are quite difficult to configure.
### How long will it take on Google Colab?
1. Install dependencies (2 min)
2. Record your own voice: at least 20 short clips of 3-4 seconds each (5 min)
3. Fine-tune (30 min)
After fine-tuning you can download the fine-tuned model directly and run it locally later (no GPU required).
## Local inference
1. Install Python if you haven't done so (Python >= 3.7)
2. Clone this repo:
`git clone https://github.com/SongtingLiu/VITS_voice_conversion.git`
3. Install dependencies
`pip install -r requirements_infer.txt`
4. run VC_inference.py
`python VC_inference.py`
+58
@@ -0,0 +1,58 @@
English documentation: please click [here](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README.md)
# VITS Fast Fine-tuning
This repository guides you through adding custom characters (or even your own voice) to a pretrained VITS model, so that after less than one hour of fine-tuning the model can:
1. Perform voice conversion between any two characters contained in the model.
2. Synthesize Chinese, Japanese & English text-to-speech in the voices of the characters you added.
The base model used by this project covers common anime male/female voices (from the Genshin Impact dataset) as well as common real-world male/female voices (from the VCTK dataset); it supports Chinese, Japanese & English, which ensures that it adapts quickly to new voices during fine-tuning.
Feel free to try out the base models used for fine-tuning!
Chinese, Japanese & English: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer) Author: Me
Chinese & Japanese: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai) Author: [SayaSS](https://github.com/SayaSS)
### Currently supported tasks:
- [x] Clone a character voice from 10+ short audio clips
- [x] Clone a character voice from 3+ minutes of long audio (each audio file must contain a single speaker only)
- [x] Clone a character voice from 3+ minutes of video (each video must contain a single speaker only)
- [x] Clone a character voice from bilibili video links (each video must contain a single speaker only)
### Characters currently supported for voice conversion and trilingual TTS
- [x] Umamusume Pretty Derby (implemented characters only; used in base-model pretraining)
- [x] Sanoba Witch (Yuzusoft) (5 characters; used in base-model pretraining)
- [x] Genshin Impact (implemented characters only; used in base-model pretraining)
- [x] Any character, as long as you have their voice samples!
## Fine-tuning
It is recommended to run fine-tuning on [Google Colab](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing),
because some of VITS's multilingual dependencies are quite difficult to configure.
### How long will it take on Google Colab?
1. Install dependencies (3 min)
2. Choose a pretrained model to start from; the detailed differences between them are described on the [Colab notebook page](https://colab.research.google.com/drive/1pn1xnFfdLK63gVXDwV4zCXfVeo8c-I-0?usp=sharing).
3. Upload the voices of the characters you wish to add; see [DATA.MD](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA.MD) for detailed uploading options.
4. Fine-tune. Depending on the chosen method and the number of samples, this takes anywhere from 20 minutes to 2 hours.
After fine-tuning you can download the fine-tuned model directly and run it locally later (no GPU required).
## Local inference and usage
0. Remember to download your fine-tuned model and config file!
1. Download the latest release (on the right side of the GitHub page)
2. Put the downloaded model and config file into the `inference` folder, named `G_latest.pth` and `finetune_speaker.json` respectively
3. Once everything is ready, the file structure should look like this:
```
inference
├───inference.exe
├───...
├───finetune_speaker.json
└───G_latest.pth
```
4. Run `inference.exe`; a browser window will open automatically. Note that its path must not contain Chinese characters or spaces.
## Using with MoeGoe
0. MoeGoe and other similar VITS inference UIs use a slightly different config format; the files to download are the model `G_latest.pth` and the config file `moegoe_config.json`.
1. Follow the instructions on the [MoeGoe](https://github.com/CjangCjengh/MoeGoe) page to configure the paths, then it is ready to use.
+69 -15
@@ -3,13 +3,47 @@ import numpy as np
import torch
from torch import no_grad, LongTensor
import argparse
import commons
from mel_processing import spectrogram_torch
import utils
from models_infer import SynthesizerTrn
from models import SynthesizerTrn
import gradio as gr
import librosa
import webbrowser
from text import text_to_sequence, _clean_text
device = "cuda:0" if torch.cuda.is_available() else "cpu"
language_marks = {
    "Japanese": "",
    "日本語": "[JA]",
    "简体中文": "[ZH]",
    "English": "[EN]",
    "Mix": "",
}
lang = ['日本語', '简体中文', 'English', 'Mix']

def get_text(text, hps, is_symbol):
    text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = LongTensor(text_norm)
    return text_norm

def create_tts_fn(model, hps, speaker_ids):
    def tts_fn(text, speaker, language, speed):
        if language is not None:
            text = language_marks[language] + text + language_marks[language]
        speaker_id = speaker_ids[speaker]
        stn_tst = get_text(text, hps, False)
        with no_grad():
            x_tst = stn_tst.unsqueeze(0).to(device)
            x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            audio = model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8,
                                length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
        del stn_tst, x_tst, x_tst_lengths, sid
        return "Success", (hps.data.sampling_rate, audio)
    return tts_fn

def create_vc_fn(model, hps, speaker_ids):
    def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
@@ -63,23 +97,43 @@ if __name__ == "__main__":
    _ = utils.load_checkpoint(args.model_dir, net_g, None)

    speaker_ids = hps.speakers
    speakers = list(hps.speakers.keys())
    tts_fn = create_tts_fn(net_g, hps, speaker_ids)
    vc_fn = create_vc_fn(net_g, hps, speaker_ids)
    app = gr.Blocks()
    with app:
        gr.Markdown("""
        录制或上传声音,并选择要转换的音色。User代表的音色是你自己。
        """)
        with gr.Column():
            record_audio = gr.Audio(label="record your voice", source="microphone")
            upload_audio = gr.Audio(label="or upload audio here", source="upload")
            source_speaker = gr.Dropdown(choices=speakers, value="User", label="source speaker")
            target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
        with gr.Column():
            message_box = gr.Textbox(label="Message")
            converted_audio = gr.Audio(label='converted audio')
        btn = gr.Button("Convert!")
        btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
                  outputs=[message_box, converted_audio])
        with gr.Tab("Text-to-Speech"):
            with gr.Row():
                with gr.Column():
                    textbox = gr.TextArea(label="Text",
                                          placeholder="Type your sentence here",
                                          value="こんにちわ。", elem_id=f"tts-input")
                    # select character
                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
                                                label='速度 Speed')
                with gr.Column():
                    text_output = gr.Textbox(label="Message")
                    audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                    btn = gr.Button("Generate!")
                    btn.click(tts_fn,
                              inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
                              outputs=[text_output, audio_output])
        with gr.Tab("Voice Conversion"):
            gr.Markdown("""
            录制或上传声音,并选择要转换的音色。User代表的音色是你自己。
            """)
            with gr.Column():
                record_audio = gr.Audio(label="record your voice", source="microphone")
                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                source_speaker = gr.Dropdown(choices=speakers, value="User", label="source speaker")
                target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
            with gr.Column():
                message_box = gr.Textbox(label="Message")
                converted_audio = gr.Audio(label='converted audio')
            btn = gr.Button("Convert!")
            btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
                      outputs=[message_box, converted_audio])
    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
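Assuming the argparse flags match the attribute names used above (`args.model_dir`, `args.share`), a typical local launch would look roughly like:

```
python VC_inference.py --model_dir ./G_latest.pth
```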
-204
@@ -1,204 +0,0 @@
{
"train": {
"log_interval": 100,
"eval_interval": 1000,
"seed": 1234,
"epochs": 10000,
"learning_rate": 2e-4,
"betas": [0.8, 0.99],
"eps": 1e-9,
"batch_size": 16,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"training_files":"final_annotation_train.txt",
"validation_files":"final_annotation_val.txt",
"text_cleaners":["cjke_cleaners2"],
"max_wav_value": 32768.0,
"sampling_rate": 22050,
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 1001,
"cleaned_text": true
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
"upsample_rates": [8,8,2,2],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [16,16,4,4],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 256
},
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "],
"speakers": {"特别周 Special Week (Umamusume Pretty Derby)": 0,
"无声铃鹿 Silence Suzuka (Umamusume Pretty Derby)": 1,
"东海帝王 Tokai Teio (Umamusume Pretty Derby)": 2,
"丸善斯基 Maruzensky (Umamusume Pretty Derby)": 3,
"富士奇迹 Fuji Kiseki (Umamusume Pretty Derby)": 4,
"小栗帽 Oguri Cap (Umamusume Pretty Derby)": 5,
"黄金船 Gold Ship (Umamusume Pretty Derby)": 6,
"伏特加 Vodka (Umamusume Pretty Derby)": 7,
"大和赤骥 Daiwa Scarlet (Umamusume Pretty Derby)": 8,
"大树快车 Taiki Shuttle (Umamusume Pretty Derby)": 9,
"草上飞 Grass Wonder (Umamusume Pretty Derby)": 10,
"菱亚马逊 Hishi Amazon (Umamusume Pretty Derby)": 11,
"目白麦昆 Mejiro Mcqueen (Umamusume Pretty Derby)": 12,
"神鹰 El Condor Pasa (Umamusume Pretty Derby)": 13,
"好歌剧 T.M. Opera O (Umamusume Pretty Derby)": 14,
"成田白仁 Narita Brian (Umamusume Pretty Derby)": 15,
"鲁道夫象征 Symboli Rudolf (Umamusume Pretty Derby)": 16,
"气槽 Air Groove (Umamusume Pretty Derby)": 17,
"爱丽数码 Agnes Digital (Umamusume Pretty Derby)": 18,
"青云天空 Seiun Sky (Umamusume Pretty Derby)": 19,
"玉藻十字 Tamamo Cross (Umamusume Pretty Derby)": 20,
"美妙姿势 Fine Motion (Umamusume Pretty Derby)": 21,
"琵琶晨光 Biwa Hayahide (Umamusume Pretty Derby)": 22,
"重炮 Mayano Topgun (Umamusume Pretty Derby)": 23,
"曼城茶座 Manhattan Cafe (Umamusume Pretty Derby)": 24,
"美普波旁 Mihono Bourbon (Umamusume Pretty Derby)": 25,
"目白雷恩 Mejiro Ryan (Umamusume Pretty Derby)": 26,
"雪之美人 Yukino Bijin (Umamusume Pretty Derby)": 28,
"米浴 Rice Shower (Umamusume Pretty Derby)": 29,
"艾尼斯风神 Ines Fujin (Umamusume Pretty Derby)": 30,
"爱丽速子 Agnes Tachyon (Umamusume Pretty Derby)": 31,
"爱慕织姬 Admire Vega (Umamusume Pretty Derby)": 32,
"稻荷一 Inari One (Umamusume Pretty Derby)": 33,
"胜利奖券 Winning Ticket (Umamusume Pretty Derby)": 34,
"空中神宫 Air Shakur (Umamusume Pretty Derby)": 35,
"荣进闪耀 Eishin Flash (Umamusume Pretty Derby)": 36,
"真机伶 Curren Chan (Umamusume Pretty Derby)": 37,
"川上公主 Kawakami Princess (Umamusume Pretty Derby)": 38,
"黄金城市 Gold City (Umamusume Pretty Derby)": 39,
"樱花进王 Sakura Bakushin O (Umamusume Pretty Derby)": 40,
"采珠 Seeking the Pearl (Umamusume Pretty Derby)": 41,
"新光风 Shinko Windy (Umamusume Pretty Derby)": 42,
"东商变革 Sweep Tosho (Umamusume Pretty Derby)": 43,
"超级小溪 Super Creek (Umamusume Pretty Derby)": 44,
"醒目飞鹰 Smart Falcon (Umamusume Pretty Derby)": 45,
"荒漠英雄 Zenno Rob Roy (Umamusume Pretty Derby)": 46,
"东瀛佐敦 Tosen Jordan (Umamusume Pretty Derby)": 47,
"中山庆典 Nakayama Festa (Umamusume Pretty Derby)": 48,
"成田大进 Narita Taishin (Umamusume Pretty Derby)": 49,
"西野花 Nishino Flower (Umamusume Pretty Derby)": 50,
"春乌拉拉 Haru Urara (Umamusume Pretty Derby)": 51,
"青竹回忆 Bamboo Memory (Umamusume Pretty Derby)": 52,
"待兼福来 Matikane Fukukitaru (Umamusume Pretty Derby)": 55,
"名将怒涛 Meisho Doto (Umamusume Pretty Derby)": 57,
"目白多伯 Mejiro Dober (Umamusume Pretty Derby)": 58,
"优秀素质 Nice Nature (Umamusume Pretty Derby)": 59,
"帝王光环 King Halo (Umamusume Pretty Derby)": 60,
"待兼诗歌剧 Matikane Tannhauser (Umamusume Pretty Derby)": 61,
"生野狄杜斯 Ikuno Dictus (Umamusume Pretty Derby)": 62,
"目白善信 Mejiro Palmer (Umamusume Pretty Derby)": 63,
"大拓太阳神 Daitaku Helios (Umamusume Pretty Derby)": 64,
"双涡轮 Twin Turbo (Umamusume Pretty Derby)": 65,
"里见光钻 Satono Diamond (Umamusume Pretty Derby)": 66,
"北部玄驹 Kitasan Black (Umamusume Pretty Derby)": 67,
"樱花千代王 Sakura Chiyono O (Umamusume Pretty Derby)": 68,
"天狼星象征 Sirius Symboli (Umamusume Pretty Derby)": 69,
"目白阿尔丹 Mejiro Ardan (Umamusume Pretty Derby)": 70,
"八重无敌 Yaeno Muteki (Umamusume Pretty Derby)": 71,
"鹤丸刚志 Tsurumaru Tsuyoshi (Umamusume Pretty Derby)": 72,
"目白光明 Mejiro Bright (Umamusume Pretty Derby)": 73,
"樱花桂冠 Sakura Laurel (Umamusume Pretty Derby)": 74,
"成田路 Narita Top Road (Umamusume Pretty Derby)": 75,
"也文摄辉 Yamanin Zephyr (Umamusume Pretty Derby)": 76,
"真弓快车 Aston Machan (Umamusume Pretty Derby)": 80,
"骏川手纲 Hayakawa Tazuna (Umamusume Pretty Derby)": 81,
"小林历奇 Kopano Rickey (Umamusume Pretty Derby)": 83,
"奇锐骏 Wonder Acute (Umamusume Pretty Derby)": 85,
"秋川理事长 President Akikawa (Umamusume Pretty Derby)": 86,
"綾地 寧々 Ayachi Nene (Sanoba Witch)": 87,
"因幡 めぐる Inaba Meguru (Sanoba Witch)": 88,
"椎葉 紬 Shiiba Tsumugi (Sanoba Witch)": 89,
"仮屋 和奏 Kariya Wakama (Sanoba Witch)": 90,
"戸隠 憧子 Togakushi Touko (Sanoba Witch)": 91,
"九条裟罗 Kujou Sara (Genshin Impact)": 92,
"芭芭拉 Barbara (Genshin Impact)": 93,
"派蒙 Paimon (Genshin Impact)": 94,
"荒泷一斗 Arataki Itto (Genshin Impact)": 96,
"早柚 Sayu (Genshin Impact)": 97,
"香菱 Xiangling (Genshin Impact)": 98,
"神里绫华 Kamisato Ayaka (Genshin Impact)": 99,
"重云 Chongyun (Genshin Impact)": 100,
"流浪者 Wanderer (Genshin Impact)": 102,
"优菈 Eula (Genshin Impact)": 103,
"凝光 Ningguang (Genshin Impact)": 105,
"钟离 Zhongli (Genshin Impact)": 106,
"雷电将军 Raiden Shogun (Genshin Impact)": 107,
"枫原万叶 Kaedehara Kazuha (Genshin Impact)": 108,
"赛诺 Cyno (Genshin Impact)": 109,
"诺艾尔 Noelle (Genshin Impact)": 112,
"八重神子 Yae Miko (Genshin Impact)": 113,
"凯亚 Kaeya (Genshin Impact)": 114,
"魈 Xiao (Genshin Impact)": 115,
"托马 Thoma (Genshin Impact)": 116,
"可莉 Klee (Genshin Impact)": 117,
"迪卢克 Diluc (Genshin Impact)": 120,
"夜兰 Yelan (Genshin Impact)": 121,
"鹿野院平藏 Shikanoin Heizou (Genshin Impact)": 123,
"辛焱 Xinyan (Genshin Impact)": 124,
"丽莎 Lisa (Genshin Impact)": 125,
"云堇 Yun Jin (Genshin Impact)": 126,
"坎蒂丝 Candace (Genshin Impact)": 127,
"罗莎莉亚 Rosaria (Genshin Impact)": 128,
"北斗 Beidou (Genshin Impact)": 129,
"珊瑚宫心海 Sangonomiya Kokomi (Genshin Impact)": 132,
"烟绯 Yanfei (Genshin Impact)": 133,
"久岐忍 Kuki Shinobu (Genshin Impact)": 136,
"宵宫 Yoimiya (Genshin Impact)": 139,
"安柏 Amber (Genshin Impact)": 143,
"迪奥娜 Diona (Genshin Impact)": 144,
"班尼特 Bennett (Genshin Impact)": 146,
"雷泽 Razor (Genshin Impact)": 147,
"阿贝多 Albedo (Genshin Impact)": 151,
"温迪 Venti (Genshin Impact)": 152,
"空 Player Male (Genshin Impact)": 153,
"神里绫人 Kamisato Ayato (Genshin Impact)": 154,
"琴 Jean (Genshin Impact)": 155,
"艾尔海森 Alhaitham (Genshin Impact)": 156,
"莫娜 Mona (Genshin Impact)": 157,
"妮露 Nilou (Genshin Impact)": 159,
"胡桃 Hu Tao (Genshin Impact)": 160,
"甘雨 Ganyu (Genshin Impact)": 161,
"纳西妲 Nahida (Genshin Impact)": 162,
"刻晴 Keqing (Genshin Impact)": 165,
"荧 Player Female (Genshin Impact)": 169,
"埃洛伊 Aloy (Genshin Impact)": 179,
"柯莱 Collei (Genshin Impact)": 182,
"多莉 Dori (Genshin Impact)": 184,
"提纳里 Tighnari (Genshin Impact)": 186,
"砂糖 Sucrose (Genshin Impact)": 188,
"行秋 Xingqiu (Genshin Impact)": 190,
"奥兹 Oz (Genshin Impact)": 193,
"五郎 Gorou (Genshin Impact)": 198,
"达达利亚 Tartalia (Genshin Impact)": 202,
"七七 Qiqi (Genshin Impact)": 207,
"申鹤 Shenhe (Genshin Impact)": 217,
"莱依拉 Layla (Genshin Impact)": 228,
"菲谢尔 Fishl (Genshin Impact)": 230,
"User": 999
}
}
+172
@@ -0,0 +1,172 @@
{
"train": {
"log_interval": 10,
"eval_interval": 100,
"seed": 1234,
"epochs": 10000,
"learning_rate": 0.0002,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 16,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"training_files": "final_annotation_train.txt",
"validation_files": "final_annotation_val.txt",
"text_cleaners": [
"chinese_cleaners"
],
"max_wav_value": 32768.0,
"sampling_rate": 22050,
"filter_length": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 2,
"cleaned_text": true
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
8,
8,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"n_layers_q": 3,
"use_spectral_norm": false,
"gin_channels": 256
},
"symbols": [
"_",
"\uff1b",
"\uff1a",
"\uff0c",
"\u3002",
"\uff01",
"\uff1f",
"-",
"\u201c",
"\u201d",
"\u300a",
"\u300b",
"\u3001",
"\uff08",
"\uff09",
"\u2026",
"\u2014",
" ",
"A",
"B",
"C",
"D",
"E",
"F",
"G",
"H",
"I",
"J",
"K",
"L",
"M",
"N",
"O",
"P",
"Q",
"R",
"S",
"T",
"U",
"V",
"W",
"X",
"Y",
"Z",
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
"1",
"2",
"3",
"4",
"5",
"0",
"\uff22",
"\uff30"
],
"speakers": {
"dingzhen": 0,
"taffy": 1
}
}
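The template above ships with two placeholder speakers (`dingzhen`, `taffy`). A sketch of how a downstream script could consume it (illustrative snippet; only the path and keys come from the file itself):

```python
import json

with open("configs/finetune_speaker.json", encoding="utf-8") as f:
    hps = json.load(f)

# The speaker map ties display names to embedding ids; n_speakers must be
# at least as large as the highest id + 1.
print(hps["speakers"])            # {'dingzhen': 0, 'taffy': 1}
print(hps["data"]["n_speakers"])  # 2
print(len(hps["symbols"]))        # size of the text-symbol inventory
```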
+3 -142
@@ -10,146 +10,6 @@ import commons
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence, cleaned_text_to_sequence
class TextAudioLoader(torch.utils.data.Dataset):
    """
        1) loads audio, text pairs
        2) normalizes text and converts them to sequences of integers
        3) computes spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.sampling_rate = hparams.sampling_rate
        self.cleaned_text = getattr(hparams, "cleaned_text", False)
        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 190)

        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
        self._filter()

    def _filter(self):
        """
        Filter text & store spec lengths
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length
        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text in self.audiopaths_and_text:
            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_audio_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        spec, wav = self.get_audio(audiopath)
        return (text, spec, wav)

    def get_audio(self, filename):
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            spec = torch.load(spec_filename)
        else:
            spec = spectrogram_torch(audio_norm, self.filter_length,
                                     self.sampling_rate, self.hop_length, self.win_length,
                                     center=False)
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename)
        return spec, audio_norm

    def get_text(self, text):
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def __getitem__(self, index):
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextAudioCollate():
    """ Zero-pads model inputs and targets
    """
    def __init__(self, return_ids=False):
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate's training batch from normalized text and audio
        PARAMS
        ------
        batch: [text_normalized, spec_normalized, wav_normalized]
        """
        # Right zero-pad all one-hot text sequences to max input length
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        max_text_len = max([len(x[0]) for x in batch])
        max_spec_len = max([x[1].size(1) for x in batch])
        max_wav_len = max([x[2].size(1) for x in batch])

        text_lengths = torch.LongTensor(len(batch))
        spec_lengths = torch.LongTensor(len(batch))
        wav_lengths = torch.LongTensor(len(batch))

        text_padded = torch.LongTensor(len(batch), max_text_len)
        spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
        text_padded.zero_()
        spec_padded.zero_()
        wav_padded.zero_()
        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            text = row[0]
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

            spec = row[1]
            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav = row[2]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

        if self.return_ids:
            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths


"""Multi speaker version"""
@@ -160,7 +20,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
        3) computes spectrograms from audio files.
    """
    def __init__(self, audiopaths_sid_text, hparams):
    def __init__(self, audiopaths_sid_text, hparams, symbols):
        self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
@@ -175,6 +35,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 190)
        self.symbols = symbols

        random.seed(1234)
        random.shuffle(self.audiopaths_sid_text)
@@ -232,7 +93,7 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    def get_text(self, text):
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
            text_norm = cleaned_text_to_sequence(text, self.symbols)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
-23
@@ -1,23 +0,0 @@
import os
import torch
import torchaudio

audio_dir = "./user_voice/"
wavfiles = []
for filename in list(os.walk(audio_dir))[0][2]:
    if filename.endswith(".wav"):
        wavfiles.append(filename)
# denoise with demucs
for i, wavfile in enumerate(wavfiles):
    os.system(f"demucs --two-stems=vocals {audio_dir}{wavfile}")
# read & store the denoised vocals back
for wavfile in wavfiles:
    i = wavfile.strip(".wav")
    wav, sr = torchaudio.load(f"./separated/htdemucs/{i}/vocals.wav", frame_offset=0, num_frames=-1, normalize=True, channels_first=True)
    # merge two channels into one
    wav = wav.mean(dim=0).unsqueeze(0)
    if sr != 22050:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
    torchaudio.save(f"./user_voice/{i}.wav", wav, 22050, channels_first=True)
+18
@@ -0,0 +1,18 @@
import os
import torchaudio

raw_audio_dir = "./raw_audio/"
denoise_audio_dir = "./denoised_audio/"
filelist = list(os.walk(raw_audio_dir))[0][2]
for file in filelist:
    if file.endswith(".wav"):
        os.system(f"demucs --two-stems=vocals {raw_audio_dir}{file}")
for file in filelist:
    file = file.replace(".wav", "")
    wav, sr = torchaudio.load(f"./separated/htdemucs/{file}/vocals.wav", frame_offset=0, num_frames=-1, normalize=True,
                              channels_first=True)
    # merge two channels into one
    wav = wav.mean(dim=0).unsqueeze(0)
    if sr != 22050:
        wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
    torchaudio.save(denoise_audio_dir + file + ".wav", wav, 22050, channels_first=True)
+3 -2
@@ -1,3 +1,4 @@
from google.colab import files
files.download("./OUTPUT_MODEL/G_latest.pth")
files.download("./OUTPUT_MODEL/config.json")
files.download("./G_latest.pth")
files.download("./finetune_speaker.json")
files.download("./moegoe_config.json")
+23
@@ -0,0 +1,23 @@
from google.colab import files
import shutil
import os

basepath = os.getcwd()
uploaded = files.upload()  # upload files
for filename in uploaded.keys():
    assert (filename.endswith(".txt")), "speaker-videolink info could only be .txt file!"
    shutil.move(os.path.join(basepath, filename), os.path.join("./speaker_links.txt"))

with open("./speaker_links.txt", 'r', encoding='utf-8') as f:
    lines = f.readlines()
speakers = []
for line in lines:
    line = line.replace("\n", "").replace(" ", "")
    if line == "":
        continue
    speaker, link = line.split("|")
    if speaker not in speakers:
        speakers.append(speaker)
    # download link
    import random
    filename = speaker + "_" + str(random.randint(0, 1000000))
    os.system(f"youtube-dl -f 0 {link} -o ./video_data/{filename}.mp4")
+11 -9
@@ -65,11 +65,12 @@ def run(rank, n_gpus, hps):
        writer = SummaryWriter(log_dir=hps.model_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.model_dir, "eval"))

    dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
    # Use gloo backend on Windows for Pytorch
    dist.init_process_group(backend='gloo' if os.name == 'nt' else 'nccl', init_method='env://', world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)

    train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data)
    train_dataset = TextAudioSpeakerLoader(hps.data.training_files, hps.data, symbols)
    train_sampler = DistributedBucketSampler(
        train_dataset,
        hps.train.batch_size,
@@ -78,12 +79,12 @@ def run(rank, n_gpus, hps):
        rank=rank,
        shuffle=True)
    collate_fn = TextAudioSpeakerCollate()
    train_loader = DataLoader(train_dataset, num_workers=0, shuffle=False, pin_memory=True,
    train_loader = DataLoader(train_dataset, num_workers=2, shuffle=False, pin_memory=True,
                              collate_fn=collate_fn, batch_sampler=train_sampler)
    # train_loader = DataLoader(train_dataset, batch_size=hps.train.batch_size, num_workers=0, shuffle=False, pin_memory=True,
    # train_loader = DataLoader(train_dataset, batch_size=hps.train.batch_size, num_workers=2, shuffle=False, pin_memory=True,
    #                           collate_fn=collate_fn)
    if rank == 0:
        eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
        eval_dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data, symbols)
        eval_loader = DataLoader(eval_dataset, num_workers=0, shuffle=False,
                                 batch_size=hps.train.batch_size, pin_memory=True,
                                 drop_last=False, collate_fn=collate_fn)
@@ -97,8 +98,8 @@ def run(rank, n_gpus, hps):
    net_d = MultiPeriodDiscriminator(hps.model.use_spectral_norm).cuda(rank)

    # load existing model
    _, _, _, _ = utils.load_checkpoint("./pretrained_models/G_trilingual.pth", net_g, None)
    _, _, _, _ = utils.load_checkpoint("./pretrained_models/D_trilingual.pth", net_d, None)
    _, _, _, _ = utils.load_checkpoint("./pretrained_models/G_0.pth", net_g, None, drop_speaker_emb=hps.drop_speaker_embed)
    _, _, _, _ = utils.load_checkpoint("./pretrained_models/D_0.pth", net_d, None)
    epoch_str = 1
    global_step = 0

    # freeze all other layers except speaker embedding
@@ -250,7 +251,8 @@ def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, scaler, loade
            #     if os.path.exists(old_d):
            #         os.remove(old_d)
            global_step += 1
            if global_step == hps.n_steps + 1:
            if epoch > hps.max_epochs:
                print("Maximum epoch reached, closing training...")
                exit()

    if rank == 0:
@@ -316,4 +318,4 @@ def evaluate(hps, generator, eval_loader, writer_eval):

if __name__ == "__main__":
    main()
    main()
+69
@@ -0,0 +1,69 @@
from moviepy.editor import AudioFileClip
import whisper
import os
import torchaudio
import librosa
import torch
import argparse

parent_dir = "./denoised_audio/"
filelist = list(os.walk(parent_dir))[0][2]
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJE")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
    if args.languages == "CJE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
        }
    elif args.languages == "CJ":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
        }
    elif args.languages == "C":
        lang2token = {
            'zh': "[ZH]",
        }
    model = whisper.load_model(args.whisper_size)
    speaker_annos = []
    for file in filelist:
        print(f"transcribing {parent_dir + file}...\n")
        options = dict(beam_size=5, best_of=5)
        transcribe_options = dict(task="transcribe", **options)
        result = model.transcribe(parent_dir + file, **transcribe_options)
        segments = result["segments"]
        # result = model.transcribe(parent_dir + file)
        lang = result['language']
        if result['language'] not in list(lang2token.keys()):
            print(f"{lang} not supported, ignoring...\n")
            continue
        # segment audio based on segment results
        character_name = file.rstrip(".wav").split("_")[0]
        code = file.rstrip(".wav").split("_")[1]
        if not os.path.exists("./segmented_character_voice/" + character_name):
            os.mkdir("./segmented_character_voice/" + character_name)
        wav, sr = torchaudio.load(parent_dir + file, frame_offset=0, num_frames=-1, normalize=True,
                                  channels_first=True)
        for i, seg in enumerate(result['segments']):
            start_time = seg['start']
            end_time = seg['end']
            text = seg['text']
            text = lang2token[lang] + text.replace("\n", "") + lang2token[lang]
            text = text + "\n"
            wav_seg = wav[:, int(start_time*sr):int(end_time*sr)]
            wav_seg_name = f"{character_name}_{code}_{i}.wav"
            savepth = "./segmented_character_voice/" + character_name + "/" + wav_seg_name
            speaker_annos.append(savepth + "|" + character_name + "|" + text)
            # trimmed_wav_seg = librosa.effects.trim(wav_seg.squeeze().numpy())
            # trimmed_wav_seg = torch.tensor(trimmed_wav_seg[0]).unsqueeze(0)
            torchaudio.save(savepth, wav_seg, 22050, channels_first=True)
    if len(speaker_annos) == 0:
        print("Warning: no long audios & videos found, this IS expected if you have only uploaded short audios")
        print("this IS NOT expected if you have uploaded any long audios, videos or video links. Please check your file structure or make sure your audio/video language is supported.")
    with open("long_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)
+2 -2
@@ -101,8 +101,8 @@ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)
    spec = torch.stft(y.float(), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
+1 -1
@@ -453,7 +453,7 @@ class SynthesizerTrn(nn.Module):
        else:
            self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

        if n_speakers > 1:
        if n_speakers >= 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, x, x_lengths, y, y_lengths, sid=None):
-28
@@ -1,28 +0,0 @@
import os

MIN_VOICE_NUM = 10
if __name__ == "__main__":
    # load sampled_audio4ft
    with open("sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
        old_annos = f.readlines()
    num_old_voices = len(old_annos)
    # load user text
    with open("./user_voice/user_voice.txt.cleaned", 'r', encoding='utf-8') as f:
        user_annos = f.readlines()
    # check how many voices are recorded
    wavfiles = [file for file in list(os.walk("./user_voice"))[0][2] if file.endswith(".wav")]
    num_user_voices = len(wavfiles)
    if num_user_voices < MIN_VOICE_NUM:
        raise Exception(f"You need to record at least {MIN_VOICE_NUM} voices for fine-tuning!")
    # user voices need to occupy 1/4 of the total dataset
    duplicate = num_old_voices // num_user_voices // 3
    # find corresponding existing annotation lines
    actual_user_annos = ["./user_voice/" + line for line in user_annos if line.split("|")[0] in wavfiles]
    final_annos = old_annos + actual_user_annos * duplicate
    # save annotation file
    with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
        for line in final_annos:
            f.write(line)
    # save annotation file for validation
    with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
        for line in actual_user_annos:
            f.write(line)
+127
@@ -0,0 +1,127 @@
import os
import argparse
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--add_auxiliary_data", type=bool, help="Whether to add extra data as fine-tuning helper")
    args = parser.parse_args()

    new_annos = []
    # Source 1: transcribed short audios
    if os.path.exists("short_character_anno.txt"):
        with open("short_character_anno.txt", 'r', encoding='utf-8') as f:
            short_character_anno = f.readlines()
            new_annos += short_character_anno
    # Source 2: transcribed long audio segments
    if os.path.exists("long_character_anno.txt"):
        with open("long_character_anno.txt", 'r', encoding='utf-8') as f:
            long_character_anno = f.readlines()
            new_annos += long_character_anno

    # Get all speaker names
    speakers = []
    for line in new_annos:
        path, speaker, text = line.split("|")
        if speaker not in speakers:
            speakers.append(speaker)
    assert (len(speakers) != 0), "No audio file found. Please check your uploaded file structure."

    # Source 3 (Optional): sampled audios as extra training helpers
    if args.add_auxiliary_data:
        with open("sampled_audio4ft.txt", 'r', encoding='utf-8') as f:
            old_annos = f.readlines()
        num_old_voices = len(old_annos)
        num_new_voices = len(new_annos)
        # STEP 1: balance number of new & old voices
        cc_duplicate = num_old_voices // num_new_voices
        if cc_duplicate == 0:
            cc_duplicate = 1

        # STEP 2: modify config file
        with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
            hps = json.load(f)

        # assign ids to new speakers
        speaker2id = {}
        for i, speaker in enumerate(speakers):
            speaker2id[speaker] = hps['data']["n_speakers"] + i
        # modify n_speakers
        hps['data']["n_speakers"] = hps['data']["n_speakers"] + len(speakers)
        # add speaker names
        for speaker in speakers:
            hps['speakers'][speaker] = speaker2id[speaker]
        hps['train']['log_interval'] = 100
        hps['train']['eval_interval'] = 1000
        hps['train']['batch_size'] = 16
        hps['data']['training_files'] = "final_annotation_train.txt"
        hps['data']['validation_files'] = "final_annotation_val.txt"
        # save modified config
        with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
            json.dump(hps, f, indent=2)

        # STEP 3: clean annotations, replace speaker names with assigned speaker IDs
        import text
        cleaned_new_annos = []
        for i, line in enumerate(new_annos):
            path, speaker, txt = line.split("|")
            if len(txt) > 150:
                continue
            cleaned_text = text._clean_text(txt, hps['data']['text_cleaners'])
            cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
            cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)

        # merge with old annotation
        final_annos = old_annos + cc_duplicate * cleaned_new_annos
        # save annotation file
        with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
            for line in final_annos:
                f.write(line)
        # save annotation file for validation
        with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
            for line in cleaned_new_annos:
                f.write(line)
        print("finished")
    else:
        # Do not add extra helper data
        # STEP 1: modify config file
        with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
            hps = json.load(f)

        # assign ids to new speakers
        speaker2id = {}
        for i, speaker in enumerate(speakers):
            speaker2id[speaker] = i
        # modify n_speakers
        hps['data']["n_speakers"] = len(speakers)
        # overwrite speaker names
        hps['speakers'] = speaker2id
        hps['train']['log_interval'] = 10
        hps['train']['eval_interval'] = 100
        hps['train']['batch_size'] = 16
        hps['data']['training_files'] = "final_annotation_train.txt"
        hps['data']['validation_files'] = "final_annotation_val.txt"
        # save modified config
        with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
            json.dump(hps, f, indent=2)

        # STEP 2: clean annotations, replace speaker names with assigned speaker IDs
        import text
        cleaned_new_annos = []
        for i, line in enumerate(new_annos):
            path, speaker, txt = line.split("|")
            if len(txt) > 150:
                continue
            cleaned_text = text._clean_text(txt, hps['data']['text_cleaners']).replace("[ZH]", "")
            cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
            cleaned_new_annos.append(path + "|" + str(speaker2id[speaker]) + "|" + cleaned_text)
        final_annos = cleaned_new_annos
        # save annotation file
        with open("final_annotation_train.txt", 'w', encoding='utf-8') as f:
            for line in final_annos:
                f.write(line)
        # save annotation file for validation
        with open("final_annotation_val.txt", 'w', encoding='utf-8') as f:
            for line in cleaned_new_annos:
                f.write(line)
        print("finished")
+37
@@ -0,0 +1,37 @@
import torch
import argparse
import json

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, default="./OUTPUT_MODEL/G_latest.pth")
    parser.add_argument("--config_dir", type=str, default="./configs/modified_finetune_speaker.json")
    args = parser.parse_args()

    model_sd = torch.load(args.model_dir, map_location='cpu')
    with open(args.config_dir, 'r', encoding='utf-8') as f:
        hps = json.load(f)

    valid_speakers = list(hps['speakers'].keys())
    if hps['data']['n_speakers'] > len(valid_speakers):
        new_emb_g = torch.zeros([len(valid_speakers), 256])
        old_emb_g = model_sd['model']['emb_g.weight']
        for i, speaker in enumerate(valid_speakers):
            new_emb_g[i, :] = old_emb_g[hps['speakers'][speaker], :]
            hps['speakers'][speaker] = i
        hps['data']['n_speakers'] = len(valid_speakers)
        model_sd['model']['emb_g.weight'] = new_emb_g
        with open("./finetune_speaker.json", 'w', encoding='utf-8') as f:
            json.dump(hps, f, indent=2)
        torch.save(model_sd, "./G_latest.pth")
    else:
        with open("./finetune_speaker.json", 'w', encoding='utf-8') as f:
            json.dump(hps, f, indent=2)
        torch.save(model_sd, "./G_latest.pth")
    # save another config file copy in MoeGoe format
    hps['speakers'] = valid_speakers
    with open("./moegoe_config.json", 'w', encoding='utf-8') as f:
        json.dump(hps, f, indent=2)
+2 -1
@@ -1,5 +1,5 @@
Cython
librosa
librosa==0.9.1
numpy
scipy
tensorboard
@@ -20,4 +20,5 @@ indic_transliteration==2.3.37
num_thai==0.0.5
opencc==1.1.1
demucs
openai-whisper
gradio
-8
@@ -1,8 +0,0 @@
Cython
librosa
numpy
scipy
torch
torchaudio
unidecode
gradio
+109
@@ -0,0 +1,109 @@
import whisper
import os
import torchaudio
import argparse

lang2token = {
    'zh': "[ZH]",
    'ja': "[JA]",
    "en": "[EN]",
}

def transcribe_one(audio_path):
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # detect the spoken language
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
    lang = max(probs, key=probs.get)
    # decode the audio
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)

    # print the recognized text
    print(result.text)
    return lang, result.text

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--languages", default="CJE")
    parser.add_argument("--whisper_size", default="medium")
    args = parser.parse_args()
    if args.languages == "CJE":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
            "en": "[EN]",
        }
    elif args.languages == "CJ":
        lang2token = {
            'zh': "[ZH]",
            'ja': "[JA]",
        }
    elif args.languages == "C":
        lang2token = {
            'zh': "[ZH]",
        }
    model = whisper.load_model(args.whisper_size)
    parent_dir = "./custom_character_voice/"
    speaker_names = list(os.walk(parent_dir))[0][1]
    speaker_annos = []
    # resample audios
    for speaker in speaker_names:
        for i, wavfile in enumerate(list(os.walk(parent_dir + speaker))[0][2]):
            # try to load file as audio
            if wavfile.startswith("processed_"):
                continue
            try:
                wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0, num_frames=-1, normalize=True,
                                          channels_first=True)
                wav = wav.mean(dim=0).unsqueeze(0)
                if sr != 22050:
                    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
                if wav.shape[1] / sr > 20:
                    print(f"{wavfile} too long, ignoring\n")
                save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
                torchaudio.save(save_path, wav, 22050, channels_first=True)
                # transcribe text
                lang, text = transcribe_one(save_path)
                if lang not in list(lang2token.keys()):
                    print(f"{lang} not supported, ignoring\n")
                    continue
                text = lang2token[lang] + text + lang2token[lang] + "\n"
                speaker_annos.append(save_path + "|" + speaker + "|" + text)
            except:
                continue

    # # clean annotation
    # import argparse
    # import text
    # from utils import load_filepaths_and_text
    # for i, line in enumerate(speaker_annos):
    #     path, sid, txt = line.split("|")
    #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
    #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
    #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text
    # write into annotation
    if len(speaker_annos) == 0:
        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)

    # import json
    # # generate new config
    # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
    #     hps = json.load(f)
    # # modify n_speakers
    # hps['data']["n_speakers"] = 1000 + len(speaker2id)
    # # add speaker names
    # for speaker in speaker_names:
    #     hps['speakers'][speaker] = speaker2id[speaker]
    # # save modified config
    # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
    #     json.dump(hps, f, indent=2)
    # print("finished")
+3 -2
@@ -30,14 +30,15 @@ def text_to_sequence(text, symbols, cleaner_names):
    return sequence


def cleaned_text_to_sequence(cleaned_text):
def cleaned_text_to_sequence(cleaned_text, symbols):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        text: string to convert to a sequence
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()]
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
    return sequence
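A usage sketch for the new signature, with a toy symbol list (illustrative, not taken from the repo configs):

```python
symbols = ["_", ",", ".", " ", "a", "b", "c"]
# Each kept character maps to its index in `symbols`; unknown characters are dropped.
print(cleaned_text_to_sequence("ab c!", symbols))  # -> [4, 5, 3, 6]
```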
+12 -6
@@ -29,13 +29,19 @@ def korean_cleaners(text):
     return text
 
 
+# def chinese_cleaners(text):
+#     '''Pipeline for Chinese text'''
+#     text = number_to_chinese(text)
+#     text = chinese_to_bopomofo(text)
+#     text = latin_to_bopomofo(text)
+#     text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
+#     return text
+
 def chinese_cleaners(text):
-    '''Pipeline for Chinese text'''
-    text = number_to_chinese(text)
-    text = chinese_to_bopomofo(text)
-    text = latin_to_bopomofo(text)
-    text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
-    return text
+    from pypinyin import Style, pinyin
+    text = text.replace("[ZH]", "")
+    phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)]
+    return ' '.join(phones)
 
 
 def zh_ja_mixture_cleaners(text):
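For reference, `Style.TONE3` renders each syllable with a trailing tone digit, which the new cleaner then joins with spaces. A small sketch; the expected output in the comment reflects pypinyin's documented behaviour, not anything shown on this page:

from pypinyin import Style, pinyin

# TONE3 appends the tone number to each syllable, e.g. "ni3"
phones = [phone[0] for phone in pinyin("你好", style=Style.TONE3)]
print(' '.join(phones))  # expected: ni3 hao3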
-45
@@ -1,45 +0,0 @@
0.wav|999|[ZH]所以,人的内在拥有对于人的幸福才是最关键的。[ZH]
1.wav|999|[ZH]正因为在大多数情形下人的自身内在相当贫乏,[ZH]
2.wav|999|[ZH]所以,那些再也用不着与生活的匮乏作斗争的人,[ZH]
3.wav|999|[ZH]他们之中的大多数从根本上还是感觉闷闷不乐。[ZH]
4.wav|999|[ZH]情形就跟那些还在生活的困苦中搏斗的人一般无异。[ZH]
5.wav|999|[ZH]他们内在空虚、感觉意识呆滞、思想匮乏,[ZH]
6.wav|999|[ZH]这些就驱使他们投入社交人群中。[ZH]
7.wav|999|[ZH]组成那些社交圈子的人也正是他们这一类的人。[ZH]
8.wav|999|[ZH]“因为相同羽毛的鸟聚在一块”。[ZH]
9.wav|999|[ZH]他们聚在一块追逐消遣、娱乐。[ZH]
10.wav|999|[ZH]他们以放纵感官的欢娱、极尽声色的享受开始,[ZH]
11.wav|999|[ZH]以荒唐、无度而告终。[ZH]
12.wav|999|[ZH]众多刚刚踏入生活的纨绔子弟穷奢极欲,[ZH]
13.wav|999|[ZH]在令人难以置信的极短时间内就把大部分家财挥霍殆尽。[ZH]
14.wav|999|[ZH]这种做派,其根源确实不是别的,正是无聊[ZH]
18.wav|999|[ZH]它源自上述的精神贫乏和空虚。[ZH]
16.wav|999|[ZH]一个外在富有、但内在贫乏的富家子弟来到这个世界,[ZH]
17.wav|999|[ZH]会徒劳地用外在的财富去补偿内在的不足;[ZH]
18.wav|999|[ZH]他渴望从外部得到一切,[ZH]
19.wav|999|[ZH]这情形就好比试图以少女的汗水去强健自己体魄的老朽之人。[ZH]
20.wav|999|[ZH]人自身内在的贫乏由此导致了外在财富的贫乏。[ZH]
21.wav|999|[ZH]至于另外两项人生好处的重要性,[ZH]
22.wav|999|[ZH]不需要我特别强调。[ZH]
23.wav|999|[ZH]财产的价值在当今是人所公认的,[ZH]
24.wav|999|[ZH]用不着为其宣传介绍。[ZH]
25.wav|999|[ZH]比起第二项的好处,[ZH]
26.wav|999|[ZH]第三项的好处具有一种相当飘渺的成分,[ZH]
27.wav|999|[ZH]因为名誉、名望、地位等[ZH]
28.wav|999|[ZH]全由他人的意见构成。[ZH]
29.wav|999|[ZH]每人都可以争取得到名誉,[ZH]
30.wav|999|[ZH]亦即清白的名声;[ZH]
31.wav|999|[ZH]但社会地位,则只有月盼国家政府的人才能染指;[ZH]
32.wav|999|[ZH]至于显赫的名望就只有极少数人才会得到。[ZH]
33.wav|999|[ZH]在所有这些当中,[ZH]
34.wav|999|[ZH]名誉是弥足珍贵的;[ZH]
35.wav|999|[ZH]显赫的名望则是人所希望得到的价值至昂的东西,[ZH]
36.wav|999|[ZH]那是天之骄子才能得到的金羊毛。[ZH]
37.wav|999|[ZH]另一方面,[ZH]
38.wav|999|[ZH]只有傻瓜才会把社会地位放置在财产之前。[ZH]
39.wav|999|[ZH]另外,人拥有的财产、物品和名誉、声望,[ZH]
40.wav|999|[ZH]是处于一种所谓的互为影响、促进的关系。[ZH]
41.wav|999|[ZH]彼得尼斯说过:“一个人所拥有的财产决定了这个人在他人眼中的价值”。[ZH]
42.wav|999|[ZH]如果这句话是正确的话,[ZH]
43.wav|999|[ZH]那么,反过来,他人对自己的良好评价,[ZH]
44.wav|999|[ZH]能以各种形式帮助自己获取财产。[ZH]
-45
@@ -1,45 +0,0 @@
0.wav|999|swo↓↑i↓↑, ɹ`ən↑ t⁼ə neɪ↓ts⁼aɪ↓ jʊŋ→joʊ↓↑ t⁼weɪ↓ɥ↑ ɹ`ən↑ t⁼ə ʃiŋ↓fu↑ tsʰaɪ↑ s`ɹ`↓ ts⁼weɪ↓ k⁼wan→tʃ⁼jɛn↓ t⁼ə.
1.wav|999|ts`⁼əŋ↓ in→weɪ↓ ts⁼aɪ↓ t⁼a↓t⁼wo→s`u↓ tʃʰiŋ↑ʃiŋ↑ ʃja↓ɹ`ən↑ t⁼ə ts⁼ɹ↓s`ən→ neɪ↓ts⁼aɪ↓ ʃiɑŋ→t⁼ɑŋ→ pʰin↑fa↑,
2.wav|999|swo↓↑i↓↑, na↓ʃiɛ→ ts⁼aɪ↓iɛ↓↑ jʊŋ↓p⁼u↓ts`⁼ə ɥ↓↑ s`əŋ→xwo↑ t⁼ə kʰweɪ↓fa↑ ts⁼wo↓ t⁼oʊ↓ts`⁼əŋ→ t⁼ə ɹ`ən↑,
3.wav|999|tʰa→mən ts`⁼ɹ`→ts`⁼ʊŋ→ t⁼ə t⁼a↓t⁼wo→s`u↓ tsʰʊŋ↑k⁼ən→p⁼ən↓↑s`ɑŋ↓ xaɪ↑s`ɹ`↓ k⁼an↓↑tʃ⁼ɥɛ↑ mən↓mən↓p⁼u↓lə↓.
4.wav|999|tʃʰiŋ↑ʃiŋ↑ tʃ⁼joʊ↓ k⁼ən→ na↓ʃiɛ→ xaɪ↑ ts⁼aɪ↓ s`əŋ→xwo↑ t⁼ə kʰwən↓kʰu↓↑ ts`⁼ʊŋ→ p⁼wo↑t⁼oʊ↓ t⁼ə ɹ`ən↑ i↓p⁼an→ u↑i↓.
5.wav|999|tʰa→mən neɪ↓ts⁼aɪ↓ kʰʊŋ→ʃɥ→, k⁼an↓↑tʃ⁼ɥɛ↑ i↓s`ɹ`↑ t⁼aɪ→ts`⁼ɹ`↓, sɹ→ʃiɑŋ↓↑ kʰweɪ↓fa↑,
6.wav|999|ts`⁼ə↓ʃiɛ→ tʃ⁼joʊ↓ tʃʰɥ→s`ɹ`↓↑ tʰa→mən tʰoʊ↑ɹ`u↓ s`ə↓tʃ⁼iɑʊ→ ɹ`ən↑tʃʰɥn↑ ts`⁼ʊŋ→.
7.wav|999|ts⁼u↓↑ts`ʰəŋ↑ na↓ʃiɛ→ s`ə↓tʃ⁼iɑʊ→tʃʰɥæn→ts⁼ɹ t⁼ə ɹ`ən↑ iɛ↓↑ ts`⁼əŋ↓s`ɹ`↓ tʰa→mən ts`⁼ə↓ i→leɪ↓ t⁼ə ɹ`ən↑.
8.wav|999|“ in→weɪ↓ ʃiɑŋ→tʰʊŋ↑ ɥ↓↑mɑʊ↑ t⁼ə niɑʊ↓↑ tʃ⁼ɥ↓ ts⁼aɪ↓ i→kʰwaɪ↓”.
9.wav|999|tʰa→mən tʃ⁼ɥ↓ts⁼aɪ↓ i→kʰwaɪ↓ ts`⁼weɪ→ts`⁼u↑ ʃiɑʊ→tʃʰjɛn↓↑, ɥ↑lə↓.
10.wav|999|tʰa→mən i↓↑ fɑŋ↓ts⁼ʊŋ↓ k⁼an↓↑k⁼wan→ t⁼ə xwan→ɥ↑, tʃ⁼i↑tʃ⁼in↓↑ s`əŋ→sə↓ t⁼ə ʃiɑŋ↓↑s`oʊ↓ kʰaɪ→s`ɹ`↓↑,
11.wav|999|i↓↑ xuɑŋ→tʰɑŋ↑, u↑t⁼u↓ əɹ`↑ k⁼ɑʊ↓ts`⁼ʊŋ→.
12.wav|999|ts`⁼ʊŋ↓t⁼wo→ k⁼ɑŋ→k⁼ɑŋ→ tʰa↓ɹ`u↓ s`əŋ→xwo↑ t⁼ə wan↑kʰu↓ts⁼ɹ↓↑t⁼i↓ tʃʰjʊŋ↑s`ə→tʃ⁼i↑ɥ↓,
13.wav|999|ts⁼aɪ↓ liŋ↓ɹ`ən↑ nan↑i↓↑ts`⁼ɹ`↓ʃin↓ t⁼ə tʃ⁼i↑ t⁼wan↓↑s`ɹ`↑tʃ⁼jɛn→ neɪ↓ tʃ⁼joʊ↓ p⁼a↓↑ t⁼a↓p⁼u↓fən↓ tʃ⁼ja→tsʰaɪ↑ xweɪ→xwo↓ t⁼aɪ↓tʃ⁼in↓.
14.wav|999|ts`⁼ə↓ts`⁼ʊŋ↓↑ ts⁼wo↓pʰaɪ↓, tʃʰi↑ k⁼ən→ɥæn↑ tʃʰɥɛ↓s`ɹ`↑ p⁼u↑s`ɹ`↓ p⁼iɛ↑t⁼ə, ts`⁼əŋ↓s`ɹ`↓ u↑liɑʊ↑.
18.wav|999|tʰa→ ɥæn↑ts⁼ɹ↓ s`ɑŋ↓s`u↓ t⁼ə tʃ⁼iŋ→s`ən↑ pʰin↑fa↑ xə↑ kʰʊŋ→ʃɥ→.
16.wav|999|i↑k⁼ə↓ waɪ↓ ts⁼aɪ↓ fu↓joʊ↓↑, t⁼an↓ neɪ↓ts⁼aɪ↓ pʰin↑fa↑ t⁼ə fu↓tʃ⁼ja→ts⁼ɹ↓↑t⁼i↓ laɪ↑t⁼ɑʊ↓ ts`⁼ə↓k⁼ə↓ s`ɹ`↓tʃ⁼iɛ↓,
17.wav|999|xweɪ↓ tʰu↑lɑʊ↑t⁼i↓ jʊŋ↓waɪ↓ ts⁼aɪ↓ t⁼ə tsʰaɪ↑fu↓ tʃʰɥ↓ p⁼u↓↑ts`ʰɑŋ↑ neɪ↓ts⁼aɪ↓ t⁼ə p⁼u↓ts⁼u↑,
18.wav|999|tʰa→ kʰə↓↑uɑŋ↓ tsʰʊŋ↑ waɪ↓p⁼u↓ t⁼ə↑t⁼ɑʊ↓ i→tʃʰiɛ↓,
19.wav|999|ts`⁼ə↓ tʃʰiŋ↑ʃiŋ↑ tʃ⁼joʊ↓ xɑʊ↓↑p⁼i↓↑ s`ɹ`↓tʰu↑ i↓↑ s`ɑʊ↓nɥ↓↑ t⁼ə xan↓s`weɪ↓↑ tʃʰɥ↓ tʃʰiɑŋ↑tʃ⁼jɛn↓ ts⁼ɹ↓tʃ⁼i↓↑ tʰi↓↑pʰwo↓ t⁼ə lɑʊ↓↑ʃjoʊ↓↑ ts`⁼ɹ`→ ɹ`ən↑.
20.wav|999|ɹ`ən↑ ts⁼ɹ↓s`ən→ neɪ↓ts⁼aɪ↓ t⁼ə pʰin↑fa↑ joʊ↑tsʰɹ↓↑ t⁼ɑʊ↓↑ts`⁼ɹ`↓ lə waɪ↓ ts⁼aɪ↓ tsʰaɪ↑fu↓ t⁼ə pʰin↑fa↑.
21.wav|999|ts`⁼ɹ`↓ɥ↑ liŋ↓waɪ↓ liɑŋ↓↑ʃiɑŋ↓ ɹ`ən↑s`əŋ→ xɑʊ↓↑ts`ʰu↓ t⁼ə ts`⁼ʊŋ↓iɑʊ↓ʃiŋ↓,
22.wav|999|p⁼u↓ ʃɥ→iɑʊ↓ wo↓↑ tʰə↓p⁼iɛ↑tʃʰiɑŋ↑t⁼iɑʊ↓.
23.wav|999|tsʰaɪ↑ts`ʰan↓↑ t⁼ə tʃ⁼ja↓ts`⁼ɹ`↑ ts⁼aɪ↓ t⁼ɑŋ→tʃ⁼in→ s`ɹ`↓ ɹ`ən↑ swo↓↑ k⁼ʊŋ→ɹ`ən↓ t⁼ə,
24.wav|999|jʊŋ↓p⁼u↓ts`⁼ə weɪ↓ tʃʰi↑ ʃɥæn→ts`ʰwan↑ tʃ⁼iɛ↓s`ɑʊ↓.
25.wav|999|p⁼i↓↑tʃʰi↓↑ t⁼i↓əɹ`↓ʃiɑŋ↓ t⁼ə xɑʊ↓↑ts`ʰu↓,
26.wav|999|t⁼i↓san→ʃiɑŋ↓ t⁼ə xɑʊ↓↑ts`ʰu↓ tʃ⁼ɥ↓joʊ↓↑ i→ts`⁼ʊŋ↓↑ ʃiɑŋ→t⁼ɑŋ→ pʰiɑʊ→miɑʊ↓↑ t⁼ə ts`ʰəŋ↑fən↓,
27.wav|999|in→weɪ↓ miŋ↑ɥ↓, miŋ↑uɑŋ↓, t⁼i↓weɪ↓ t⁼əŋ↓↑.
28.wav|999|tʃʰɥæn↑ joʊ↑ tʰa→ɹ`ən↑ t⁼ə i↓tʃ⁼jɛn↓ k⁼oʊ↓ts`ʰəŋ↑.
29.wav|999|meɪ↓↑ɹ`ən↑ t⁼oʊ→ kʰə↓↑i↓↑ ts`⁼əŋ→tʃʰɥ↓↑ t⁼ə↑t⁼ɑʊ↓ miŋ↑ɥ↓,
30.wav|999|i↓ tʃ⁼i↑ tʃʰiŋ→p⁼aɪ↑ t⁼ə miŋ↑s`əŋ→,
31.wav|999|t⁼an↓ s`ə↓xweɪ↓ t⁼i↓weɪ↓, ts⁼ə↑ ts`⁼ɹ`↓↑joʊ↓↑ ɥɛ↓ pʰan↓ k⁼wo↑tʃ⁼ja→ ts`⁼əŋ↓fu↓↑ t⁼ə ɹ`ən↑tsʰaɪ↑ nəŋ↑ ɹ`an↓↑ts`⁼ɹ`↓↑,
32.wav|999|ts`⁼ɹ`↓ɥ↑ ʃjɛn↓↑xə↓ t⁼ə miŋ↑uɑŋ↓ tʃ⁼joʊ↓ ts`⁼ɹ`↓↑joʊ↓↑ tʃ⁼i↑s`ɑʊ↓↑s`u↓ ɹ`ən↑tsʰaɪ↑ xweɪ↓ t⁼ə↑t⁼ɑʊ↓.
33.wav|999|ts⁼aɪ↓ swo↓↑joʊ↓↑ ts`⁼ə↓ʃiɛ→ t⁼ɑŋ→ts`⁼ʊŋ→,
34.wav|999|miŋ↑ɥ↓ s`ɹ`↓ mi↑ts⁼u↑ts`⁼ən→k⁼weɪ↓ t⁼ə,
35.wav|999|ʃjɛn↓↑xə↓ t⁼ə miŋ↑uɑŋ↓ ts⁼ə↑ s`ɹ`↓ ɹ`ən↑ swo↓↑ ʃi→uɑŋ↓ t⁼ə↑t⁼ɑʊ↓ t⁼ə tʃ⁼ja↓ts`⁼ɹ`↑ ts`⁼ɹ`↓ɑŋ↑ t⁼ə t⁼ʊŋ→ʃi→,
36.wav|999|na↓ s`ɹ`↓ tʰjɛn→ts`⁼ɹ`→tʃ⁼iɑʊ→ts⁼ɹ tsʰaɪ↑nəŋ↑ t⁼ə↑t⁼ɑʊ↓ t⁼ə tʃ⁼in→ iɑŋ↑mɑʊ↑.
37.wav|999|liŋ↓i↓fɑŋ→mjɛn↓,
38.wav|999|ts`⁼ɹ`↓↑joʊ↓↑ s`a↓↑k⁼wa→ tsʰaɪ↑ xweɪ↓ p⁼a↓↑ s`ə↓xweɪ↓ t⁼i↓weɪ↓ fɑŋ↓ts`⁼ɹ`↓ ts⁼aɪ↓ tsʰaɪ↑ts`ʰan↓↑ ts`⁼ɹ`→tʃʰjɛn↑.
39.wav|999|liŋ↓waɪ↓, ɹ`ən↑ jʊŋ→joʊ↓↑ t⁼ə tsʰaɪ↑ts`ʰan↓↑, u↓pʰin↓↑ xə↑ miŋ↑ɥ↓, s`əŋ→uɑŋ↓,
40.wav|999|s`ɹ`↓ ts`ʰu↓↑ɥ↑ i→ts`⁼ʊŋ↓↑ swo↓↑weɪ↓ t⁼ə xu↓weɪ↓ iŋ↓↑ʃiɑŋ↓↑, tsʰu↓tʃ⁼in↓ t⁼ə k⁼wan→ʃi↓.
41.wav|999|p⁼i↓↑t⁼ə↑ ni↑sɹ→ s`wo→ k⁼wo↓,“ i↑k⁼ə↓ ɹ`ən↑ swo↓↑ jʊŋ→joʊ↓↑ t⁼ə tsʰaɪ↑ts`ʰan↓↑ tʃ⁼ɥɛ↑t⁼iŋ↓ lə ts`⁼ə↓k⁼ə↓ ɹ`ən↑ ts⁼aɪ↓ tʰa→ɹ`ən↑ jɛn↓↑ts`⁼ʊŋ→ t⁼ə tʃ⁼ja↓ts`⁼ɹ`↑”.
42.wav|999|ɹ`u↑k⁼wo↓↑ ts`⁼ə↓tʃ⁼ɥ↓ xwa↓ s`ɹ`↓ ts`⁼əŋ↓tʃʰɥɛ↓ t⁼əxwa↓,
43.wav|999|na↓mə, fan↓↑k⁼wo↓laɪ↑, tʰa→ɹ`ən↑ t⁼weɪ↓ ts⁼ɹ↓tʃ⁼i↓↑ t⁼ə liɑŋ↑xɑʊ↓↑ pʰiŋ↑tʃ⁼ja↓,
44.wav|999|nəŋ↑i↓↑ k⁼ə↓ts`⁼ʊŋ↓↑ ʃiŋ↑s`ɹ`↓ p⁼ɑŋ→ts`⁼u↓ ts⁼ɹ↓tʃ⁼i↓↑ xwo↓tʃʰɥ↓↑ tsʰaɪ↑ts`ʰan↓↑.
-72
@@ -1,72 +0,0 @@
import numpy as np
import torch
import torchaudio
import gradio as gr
import os
anno_lines = []
with open("./user_voice/user_voice.txt", 'r', encoding='utf-8') as f:
for line in f.readlines():
anno_lines.append(line.strip("\n"))
text_index = 0
def display_text(index):
index = int(index)
global text_index
text_index = index
return f"{text_index}: " + anno_lines[index].split("|")[2].strip("[ZH]")
def display_prev_text():
global text_index
if text_index != 0:
text_index -= 1
return f"{text_index}: " + anno_lines[text_index].split("|")[2].strip("[ZH]")
def display_next_text():
global text_index
if text_index != len(anno_lines)-1:
text_index += 1
return f"{text_index}: " + anno_lines[text_index].split("|")[2].strip("[ZH]")
def save_audio(audio):
global text_index
if audio:
sr, wav = audio
wav = torch.tensor(wav).type(torch.float32) / max(wav.max(), -wav.min())
wav = wav.unsqueeze(0) if len(wav.shape) == 1 else wav
if sr != 22050:
res_wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=22050)(wav)
else:
res_wav = wav
torchaudio.save(f"./user_voice/{str(text_index)}.wav", res_wav, 22050, channels_first=True)
return f"Audio saved to ./user_voice/{str(text_index)}.wav successfully!"
else:
return "Error: Please record your audio!"
if __name__ == "__main__":
app = gr.Blocks()
with app:
with gr.Row():
text = gr.Textbox(value="0: " + anno_lines[0].split("|")[2].strip("[ZH]"), label="Please read the text here")
with gr.Row():
audio_to_collect = gr.Audio(source="microphone")
with gr.Row():
with gr.Column():
prev_btn = gr.Button(value="Previous")
with gr.Column():
next_btn = gr.Button(value="Next")
with gr.Row():
index_dropdown = gr.Dropdown(choices=[str(i) for i in range(len(anno_lines))], value="0",
label="No. of text", interactive=True)
with gr.Row():
with gr.Column():
save_btn = gr.Button(value="Save Audio")
with gr.Column():
audio_save_message = gr.Textbox(label="Message")
index_dropdown.change(display_text, inputs=index_dropdown, outputs=text)
prev_btn.click(display_prev_text, inputs=None, outputs=text)
next_btn.click(display_next_text, inputs=None, outputs=text)
save_btn.click(save_audio, inputs=audio_to_collect, outputs=audio_save_message)
app.launch()
+10 -6
@@ -15,7 +15,7 @@ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 logger = logging
 
 
-def load_checkpoint(checkpoint_path, model, optimizer=None):
+def load_checkpoint(checkpoint_path, model, optimizer=None, drop_speaker_emb=False):
     assert os.path.isfile(checkpoint_path)
     checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
     iteration = checkpoint_dict['iteration']
@@ -31,8 +31,10 @@ def load_checkpoint(checkpoint_path, model, optimizer=None):
     for k, v in state_dict.items():
         try:
             if k == 'emb_g.weight':
+                if drop_speaker_emb:
+                    new_state_dict[k] = v
+                    continue
                 v[:saved_state_dict[k].shape[0], :] = saved_state_dict[k]
-                # v[999, :] = saved_state_dict[k][154, :]
                 new_state_dict[k] = v
             else:
                 new_state_dict[k] = saved_state_dict[k]
@@ -148,12 +150,13 @@ def load_filepaths_and_text(filename, split="|"):
 def get_hparams(init=True):
     parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--config', type=str, default="./configs/finetune_speaker.json",
+    parser.add_argument('-c', '--config', type=str, default="./configs/modified_finetune_speaker.json",
                         help='JSON file for configuration')
     parser.add_argument('-m', '--model', type=str, default="pretrained_models",
                         help='Model name')
-    parser.add_argument('-n', '--n_steps', type=int, default="2000",
-                        help='finetune steps')
+    parser.add_argument('-n', '--max_epochs', type=int, default=50,
+                        help='finetune epochs')
+    parser.add_argument('--drop_speaker_embed', type=bool, default=False, help='whether to drop existing characters')
 
     args = parser.parse_args()
     model_dir = os.path.join("./", args.model)
@@ -175,7 +178,8 @@ def get_hparams(init=True):
     hparams = HParams(**config)
     hparams.model_dir = model_dir
-    hparams.n_steps = args.n_steps
+    hparams.max_epochs = args.max_epochs
+    hparams.drop_speaker_embed = args.drop_speaker_embed
     return hparams
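A hedged sketch of the new knobs in use; `G_0.pth` and `net_g` below are placeholders, not paths shown on this page. Note also that argparse's `type=bool` converts any non-empty string to True, so `--drop_speaker_embed False` on the command line would still enable the flag:

import utils

hps = utils.get_hparams()  # default config is now ./configs/modified_finetune_speaker.json
print(hps.max_epochs, hps.drop_speaker_embed)

# with a model instance net_g built as in the training script, the new flag keeps the
# freshly initialised speaker embedding instead of copying the pretrained one:
# utils.load_checkpoint("./pretrained_models/G_0.pth", net_g,
#                       optimizer=None, drop_speaker_emb=True)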
+10
@@ -0,0 +1,10 @@
from moviepy.editor import AudioFileClip
import os
video_dir = "./video_data/"
audio_dir = "./raw_audio/"
filelist = list(os.walk(video_dir))[0][2]
for file in filelist:
if file.endswith(".mp4"):
my_audio_clip = AudioFileClip(video_dir + file)
my_audio_clip.write_audiofile(audio_dir + file.rstrip(".mp4") + ".wav")
+28
@@ -0,0 +1,28 @@
from google.colab import files
import shutil
import os
import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--type", type=str, required=True, help="type of file to upload")
args = parser.parse_args()
file_type = args.type
basepath = os.getcwd()
uploaded = files.upload() # 上传文件
assert(file_type in ['zip', 'audio', 'video'])
if file_type == "zip":
upload_path = "./custom_character_voice/"
for filename in uploaded.keys():
#将上传的文件移动到指定的位置上
shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, "custom_character_voice.zip"))
elif file_type == "audio":
upload_path = "./raw_audio/"
for filename in uploaded.keys():
#将上传的文件移动到指定的位置上
shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, filename))
elif file_type == "video":
upload_path = "./video_data/"
for filename in uploaded.keys():
# 将上传的文件移动到指定的位置上
shutil.move(os.path.join(basepath, filename), os.path.join(upload_path, filename))
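In a notebook this helper is run as a cell command; the script's file name is not shown on this page, so the path below is a placeholder:

# hypothetical Colab cells; replace <this_script> with the file's real name:
# !python <this_script>.py --type zip    # -> ./custom_character_voice/custom_character_voice.zip
# !python <this_script>.py --type audio  # -> ./raw_audio/
# !python <this_script>.py --type video  # -> ./video_data/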