Add files via upload
This commit is contained in:
+303
@@ -0,0 +1,303 @@
|
||||
import copy
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
import commons
|
||||
import modules
|
||||
from modules import LayerNorm
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Stack of self-attention and feed-forward layers with residual + LayerNorm.

    Each of the ``n_layers`` layers applies, in order: relative-position
    self-attention, dropout, residual add + LayerNorm, feed-forward network,
    dropout, residual add + LayerNorm.

    Args:
        hidden_channels: channel width of the layer inputs and outputs.
        filter_channels: inner channel width of every ``FFN``.
        n_heads: number of attention heads.
        n_layers: number of attention/FFN layer pairs.
        kernel_size: convolution kernel size used inside every ``FFN``.
        p_dropout: dropout probability shared by all sub-modules.
        window_size: relative-attention window passed to ``MultiHeadAttention``.
        **kwargs: accepted and ignored.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer (attention, norm, FFN, norm)
        # so that parameter initialisation consumes the RNG in a fixed order.
        for _ in range(self.n_layers):
            attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                window_size=window_size,
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run the layer stack on ``x`` and return the masked result.

        Args:
            x: input features (presumably ``[b, hidden_channels, t]`` -- confirm
                against callers).
            x_mask: validity mask multiplied into ``x`` (presumably
                ``[b, 1, t]`` -- confirm against callers).
        """
        # Outer product of the mask with itself: a query/key pair is attendable
        # only when both positions are unmasked.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for layer_idx in range(self.n_layers):
            attended = self.drop(self.attn_layers[layer_idx](x, x, attn_mask))
            x = self.norm_layers_1[layer_idx](x + attended)

            transformed = self.drop(self.ffn_layers[layer_idx](x, x_mask))
            x = self.norm_layers_2[layer_idx](x + transformed)
        return x * x_mask
|
||||
|
||||
|
||||
class Decoder(nn.Module):
    """Stack of causal self-attention, encoder-decoder attention and FFN layers.

    Each of the ``n_layers`` layers applies three residual + LayerNorm
    sub-blocks in order: self-attention over the decoder input, attention
    over the encoder output, and a causal feed-forward network.

    Args:
        hidden_channels: channel width of the layer inputs and outputs.
        filter_channels: inner channel width of every ``FFN``.
        n_heads: number of attention heads.
        n_layers: number of decoder layers.
        kernel_size: convolution kernel size used inside every ``FFN``.
        p_dropout: dropout probability shared by all sub-modules.
        proximal_bias: forwarded to the self-attention layers.
        proximal_init: forwarded to the self-attention layers.
        **kwargs: accepted and ignored.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Sub-modules are created layer by layer so that parameter
        # initialisation consumes the RNG in a fixed order.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
                proximal_bias=proximal_bias,
                proximal_init=proximal_init,
            )
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))

            cross_attention = MultiHeadAttention(
                hidden_channels,
                hidden_channels,
                n_heads,
                p_dropout=p_dropout,
            )
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

            feed_forward = FFN(
                hidden_channels,
                hidden_channels,
                filter_channels,
                kernel_size,
                p_dropout=p_dropout,
                causal=True,
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """Decode ``x`` while attending to the encoder output ``h``.

        Args:
            x: decoder input.
            x_mask: mask multiplied into the decoder input and output.
            h: encoder output.
            h_mask: mask for the encoder output.
        """
        # The self-attention mask is built only from the decoder length; it is
        # whatever commons.subsequent_mask returns, moved to x's device/dtype.
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        # A (decoder position, encoder position) pair is attendable only when
        # both positions are unmasked.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for layer_idx in range(self.n_layers):
            self_attended = self.drop(self.self_attn_layers[layer_idx](x, x, self_attn_mask))
            x = self.norm_layers_0[layer_idx](x + self_attended)

            cross_attended = self.drop(self.encdec_attn_layers[layer_idx](x, h, encdec_attn_mask))
            x = self.norm_layers_1[layer_idx](x + cross_attended)

            transformed = self.drop(self.ffn_layers[layer_idx](x, x_mask))
            x = self.norm_layers_2[layer_idx](x + transformed)
        return x * x_mask
|
||||
|
||||
|
||||
class MultiHeadAttention(nn.Module):
    """Multi-head attention over ``[b, d, t]`` tensors using 1x1 convolutions.

    Optional extras, each only valid for self-attention (equal query and key
    lengths): learned relative-position embeddings (``window_size``), a
    log-distance proximal bias (``proximal_bias``) and a banded local
    attention mask (``block_length``).

    Args:
        channels: input channel count; must be divisible by ``n_heads``.
        out_channels: output channel count of the final projection.
        n_heads: number of attention heads.
        p_dropout: dropout probability applied to the attention weights.
        window_size: relative-position window, or ``None`` to disable.
        heads_share: if true, one relative embedding is shared by all heads.
        block_length: half-width of the local attention band, or ``None``.
        proximal_bias: add ``-log1p(|i - j|)`` to the attention scores.
        proximal_init: initialise the key projection as a copy of the query
            projection.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Attention weights of the most recent forward pass.
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            rel_shape = (n_heads_rel, window_size * 2 + 1, self.k_channels)
            self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_stddev)

        for projection in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(projection.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values).

        The attention weights are stored on ``self.attn`` as a side effect.
        """
        query = self.conv_q(x)
        key = self.conv_k(c)
        value = self.conv_v(c)

        attended, self.attn = self.attention(query, key, value, mask=attn_mask)

        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns ``(output, p_attn)``.

        ``output`` is ``[b, d, t_t]`` and ``p_attn`` is ``[b, n_h, t_t, t_s]``.
        """
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s = key.size()
        t_t = query.size(2)
        head_shape = (b, self.n_heads, self.k_channels)
        query = query.view(*head_shape, t_t).transpose(2, 3)
        key = key.view(*head_shape, t_s).transpose(2, 3)
        value = value.view(*head_shape, t_s).transpose(2, 3)

        scale = math.sqrt(self.k_channels)
        scores = torch.matmul(query / scale, key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / scale, key_relative_embeddings)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            proximal = self._attention_bias_proximal(t_s)
            scores = scores + proximal.to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            # A large negative (not -inf) score drives masked weights to ~0.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Keep only the band |i - j| <= block_length.
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        # [b, n_h, t_t, d_k] -> [b, d, t_t]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the ``2 * length - 1`` relative embeddings for ``length``.

        The table is zero-padded on both sides when ``length`` exceeds
        ``window_size + 1`` and sliced around its centre otherwise.
        """
        # Pad first before slice to avoid using cond ops.
        reach = self.window_size + 1
        pad_length = max(length - reach, 0)
        slice_start_position = max(reach - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        return padded_relative_embeddings[:, slice_start_position:slice_end_position]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        widened = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        flat = widened.view([batch, heads, length * 2 * length])
        flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
        return skewed[:, :, :length, length - 1:]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        widened = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        flat = widened.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        flat = F.pad(flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        skewed = flat.view([batch, heads, length, 2 * length])
        return skewed[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.

        Args:
            length: an integer scalar.
        Returns:
            a Tensor with shape [1, 1, length, length]
        """
        positions = torch.arange(length, dtype=torch.float32)
        distance = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1))
        return (-torch.log1p(distance)).unsqueeze(0).unsqueeze(0)
|
||||
|
||||
|
||||
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward network with masking.

    Args:
        in_channels: input channel count.
        out_channels: output channel count.
        filter_channels: hidden channel count between the two convolutions.
        kernel_size: kernel size of both convolutions.
        p_dropout: dropout probability applied after the activation.
        activation: ``"gelu"`` selects the sigmoid approximation
            ``x * sigmoid(1.702 * x)``; any other value selects ReLU.
        causal: pad only on the left so outputs never see later time steps.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Select the padding strategy once; forward() just calls it.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking at each step."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        out = self.conv_2(self.padding(hidden * x_mask))
        return out * x_mask

    def _causal_padding(self, x):
        """Left-pad the last axis by ``kernel_size - 1`` (no-op for size 1)."""
        if self.kernel_size == 1:
            return x
        left, right = self.kernel_size - 1, 0
        return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [left, right]]))

    def _same_padding(self, x):
        """Pad the last axis on both sides so the length is preserved."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [left, right]]))
|
||||
+161
@@ -0,0 +1,161 @@
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from N(mean, std) if ``m`` is a Conv module.

    A module counts as a convolution when its class name contains ``"Conv"``;
    every other module is left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    dilated_span = kernel_size * dilation - dilation
    return int(dilated_span / 2)
|
||||
|
||||
|
||||
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs, last dimension first.

    The result is the flat list order that ``F.pad`` calls in this file
    pass as the padding argument.
    """
    flattened = []
    for pair in pad_shape[::-1]:
        flattened.extend(pair)
    return flattened
|
||||
|
||||
|
||||
def intersperse(lst, item):
    """Return a list with ``item`` before, between and after all of ``lst``."""
    result = [item]
    for element in lst:
        result.append(element)
        result.append(item)
    return result
|
||||
|
||||
|
||||
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)

    Element-wise divergence computed from the means ``m_*`` and the
    log standard deviations ``logs_*`` of P and Q.
    """
    scaled_moments = 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
    kl = (logs_q - logs_p) - 0.5
    kl += scaled_moments
    return kl
|
||||
|
||||
|
||||
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze the uniform draw into [0.00001, 0.99999) so that neither
    # logarithm below is evaluated at 0.
    clipped_uniform = torch.rand(shape) * 0.99998 + 0.00001
    inner = -torch.log(clipped_uniform)
    return -torch.log(inner)
|
||||
|
||||
|
||||
def rand_gumbel_like(x):
    """Gumbel noise with the size, dtype and device of ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
|
||||
|
||||
|
||||
def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of ``segment_size`` steps out of every batch item.

    Args:
        x: 3-D tensor sliced along its last axis.
        ids_str: per-item start index of the window.
        segment_size: window length.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for batch_idx, item in enumerate(x):
        start = ids_str[batch_idx]
        stop = start + segment_size
        segments[batch_idx] = item[:, start:stop]
    return segments
|
||||
|
||||
|
||||
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice a random ``segment_size`` window from every batch item of ``x``.

    Args:
        x: 3-D tensor sliced along its last axis.
        x_lengths: usable length per item; defaults to the full last-axis
            size of ``x``.
        segment_size: window length.

    Returns:
        ``(segments, ids_str)`` where ``ids_str`` holds the start indices used.
    """
    batch_size, _, total_length = x.size()
    if x_lengths is None:
        x_lengths = total_length
    # Exclusive upper bound for the start index of a full-length window.
    ids_str_max = x_lengths - segment_size + 1
    uniform = torch.rand([batch_size]).to(device=x.device)
    ids_str = (uniform * ids_str_max).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
|
||||
|
||||
|
||||
def get_timing_signal_1d(
        length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines of the position scaled by geometrically spaced
    timescales between ``min_timescale`` and ``max_timescale``. When
    ``channels`` is odd, a final all-zero row is appended.

    Args:
        length: number of positions.
        channels: number of output channels.
        min_timescale: smallest timescale.
        max_timescale: largest timescale.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # Clamp the divisor to at least 1: with a single timescale
    # (channels in {2, 3}) ``num_timescales - 1`` is 0 and the unguarded
    # division raised ZeroDivisionError.
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        max(num_timescales - 1, 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Append one zero row when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
|
||||
|
||||
|
||||
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` (a 3-D tensor ``[b, channels, length]``)."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
|
||||
|
||||
|
||||
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal to ``x`` along ``axis``.

    Args:
        x: 3-D tensor ``[b, channels, length]``.
        min_timescale: smallest timescale of the signal.
        max_timescale: largest timescale of the signal.
        axis: dimension along which the signal is concatenated.
    """
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    signal = signal.to(dtype=x.dtype, device=x.device)
    # The signal has batch size 1 and torch.cat does not broadcast, so
    # concatenating along a non-batch axis failed for b > 1. Expanding is a
    # no-op for b == 1; it is skipped when concatenating along the batch axis
    # itself so that case keeps its previous result.
    if axis % x.dim() != 0:
        signal = signal.expand(b, -1, -1)
    return torch.cat([x, signal], axis)
|
||||
|
||||
|
||||
def subsequent_mask(length):
    """Lower-triangular mask of ones with shape ``[1, 1, length, length]``.

    Entry ``[0, 0, i, j]`` is 1 when ``j <= i`` and 0 otherwise.
    """
    lower_triangle = torch.ones(length, length).tril()
    return lower_triangle[None, None]
|
||||
|
||||
|
||||
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: ``tanh`` of the first channels times ``sigmoid`` of the rest.

    The two inputs are summed; channels ``[:n_channels[0]]`` of the sum feed
    ``tanh`` and channels ``[n_channels[0]:]`` feed ``sigmoid``.
    ``n_channels`` is indexed with ``[0]`` (presumably a 1-element integer
    tensor -- confirm against callers).
    """
    split = n_channels[0]
    summed = input_a + input_b
    gate_tanh = torch.tanh(summed[:, :split, :])
    gate_sigmoid = torch.sigmoid(summed[:, split:, :])
    return gate_tanh * gate_sigmoid
|
||||
|
||||
|
||||
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs, last dimension first."""
    return [amount for pair in pad_shape[::-1] for amount in pair]
|
||||
|
||||
|
||||
def shift_1d(x):
    """Shift a 3-D tensor one step right along its last axis.

    A zero is inserted at the front and the final step is dropped, so the
    length is unchanged.
    """
    left_padded = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))
    return left_padded[:, :, :-1]
|
||||
|
||||
|
||||
def sequence_mask(length, max_length=None):
    """Boolean mask where row ``i`` is true for positions ``< length[i]``.

    Args:
        length: 1-D tensor of sequence lengths.
        max_length: number of mask columns; defaults to ``length.max()``.
    """
    if max_length is None:
        max_length = length.max()
    positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
|
||||
|
||||
|
||||
def generate_path(duration, mask):
    """Expand per-token durations into a monotonic alignment path.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a tensor shaped like ``mask`` and in ``mask``'s dtype. Before
    masking, entry ``[b, 0, y, x]`` is 1 when output step ``y`` lies inside
    the span assigned to input token ``x`` by the cumulative durations.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    # One row per (batch, token): ones up to that token's cumulative end.
    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # Subtracting the previous token's row leaves only this token's own span.
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path
|
||||
|
||||
|
||||
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise and return their pre-clamp total norm.

    Args:
        parameters: a tensor or an iterable of tensors; those whose ``grad``
            is ``None`` are skipped.
        clip_value: gradients are clamped in place to
            ``[-clip_value, clip_value]``; ``None`` disables clamping.
        norm_type: order of the norm to report; ``inf`` gives the largest
            absolute gradient entry.

    Returns:
        The ``norm_type``-norm of all gradients as a Python float, measured
        before clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    # The infinity norm is a maximum, not a sum of powers: the generic
    # formula below would compute ``x ** inf`` and ``total ** 0``, returning
    # 1.0 or inf instead of the largest gradient magnitude.
    use_max = norm_type == float("inf")
    total_norm = 0.0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type).item()
        if use_max:
            total_norm = max(total_norm, param_norm)
        else:
            total_norm += param_norm ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    if not use_max:
        total_norm = total_norm ** (1. / norm_type)
    return total_norm
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/juzi_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/juzi_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["chinese_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 8,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u5c0f\u8338", "\u5510\u4e50\u541f", "\u5c0f\u6bb7", "\u82b1\u73b2", "\u8bb8\u8001\u5e08", "\u90b1\u7433", "\u4e03\u4e00", "\u516b\u56db"],
|
||||
"symbols": ["_", "\uff0c", "\u3002", "\uff01", "\uff1f", "\u2014", "\u2026", "\u3105", "\u3106", "\u3107", "\u3108", "\u3109", "\u310a", "\u310b", "\u310c", "\u310d", "\u310e", "\u310f", "\u3110", "\u3111", "\u3112", "\u3113", "\u3114", "\u3115", "\u3116", "\u3117", "\u3118", "\u3119", "\u311a", "\u311b", "\u311c", "\u311d", "\u311e", "\u311f", "\u3120", "\u3121", "\u3122", "\u3123", "\u3124", "\u3125", "\u3126", "\u3127", "\u3128", "\u3129", "\u02c9", "\u02ca", "\u02c7", "\u02cb", "\u02d9", " "]
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/cjke_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/cjke_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["cjke_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 2891,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/cjks_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/cjks_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["cjks_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 24,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u7dbe\u5730\u5be7\u3005", "\u671d\u6b66\u82b3\u4e43", "\u5728\u539f\u4e03\u6d77", "\u30eb\u30a4\u30ba", "\u91d1\u8272\u306e\u95c7", "\u30e2\u30e2", "\u7d50\u57ce\u7f8e\u67d1", "\u5c0f\u8338", "\u5510\u4e50\u541f", "\u5c0f\u6bb7", "\u82b1\u73b2", "\u516b\u56db", "\uc218\uc544", "\ubbf8\ubbf8\ub974", "\uc544\ub9b0", "\uc720\ud654", "\uc5f0\ud654", "SA1", "SA2", "SA3", "SA4", "SA5", "SA6", ""],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0283", "\u02a7", "\u02a5", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u00e7", "\u0278", "\u027e", "\u03b2", "\u014b", "\u0266", "\u02d0", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u2192", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 100,
|
||||
"eval_interval": 200,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 16,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"../CH_JA_EN_mix_voice/rosalia/rosalia.txt.cleaned",
|
||||
"validation_files":"../CH_JA_EN_mix_voice/rosalia/rosalia.txt.cleaned",
|
||||
"text_cleaners":["cjke_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 1001,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/val_filelist.txt.cleaned",
|
||||
"text_cleaners":["japanese_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 7,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u7dbe\u5730\u5be7\u3005", "\u56e0\u5e61\u3081\u3050\u308b", "\u671d\u6b66\u82b3\u4e43", "\u5e38\u9678\u8309\u5b50", "\u30e0\u30e9\u30b5\u30e1", "\u978d\u99ac\u5c0f\u6625", "\u5728\u539f\u4e03\u6d77"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/hamidashi_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/hamidashi_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["japanese_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 8,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u548c\u6cc9\u5983\u611b", "\u5e38\u76e4\u83ef\u4e43", "\u9326\u3042\u3059\u307f", "\u938c\u5009\u8a69\u685c", "\u7adc\u9591\u5929\u68a8", "\u548c\u6cc9\u91cc", "\u65b0\u5ddd\u5e83\u5922", "\u8056\u8389\u3005\u5b50"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 20000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/val_filelist.txt.cleaned",
|
||||
"text_cleaners":["japanese_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 0,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false
|
||||
},
|
||||
"speakers": ["\u30eb\u30a4\u30ba"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/fox_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/fox_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["korean_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 6,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\uc218\uc544", "\ubbf8\ubbf8\ub974", "\uc544\ub9b0", "\uc5f0\ud654", "\uc720\ud654", "\uc120\ubc30"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "\u2026", "~", "\u3131", "\u3134", "\u3137", "\u3139", "\u3141", "\u3142", "\u3145", "\u3147", "\u3148", "\u314a", "\u314b", "\u314c", "\u314d", "\u314e", "\u3132", "\u3138", "\u3143", "\u3146", "\u3149", "\u314f", "\u3153", "\u3157", "\u315c", "\u3161", "\u3163", "\u3150", "\u3154", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/sanskrit_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/sanskrit_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["sanskrit_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 27,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["Male 1", "Male 2", "Male 3", "Male 4 (Malayalam)", "Male 5", "Male 6", "Male 7", "Male 8 (Kannada)", "Female 1 (Tamil)", "Male 9 (Kannada)", "Female 2 (Marathi)", "Female 3 (Marathi)", "Female 4 (Marathi)", "Female 5 (Telugu)", "Female 6 (Telugu)", "Male 10 (Kannada)", "Male 11 (Kannada)", "Male 12", "Male 13", "Male 14", "Male 15", "Female 7", "Male 16 (Malayalam)", "Male 17 (Tamil)", "Male 18 (Hindi)", "Male 19 (Telugu)", "Male 20 (Hindi)"],
|
||||
"symbols": ["_", "\u0964", "\u0901", "\u0902", "\u0903", "\u0905", "\u0906", "\u0907", "\u0908", "\u0909", "\u090a", "\u090b", "\u090f", "\u0910", "\u0913", "\u0914", "\u0915", "\u0916", "\u0917", "\u0918", "\u0919", "\u091a", "\u091b", "\u091c", "\u091d", "\u091e", "\u091f", "\u0920", "\u0921", "\u0922", "\u0923", "\u0924", "\u0925", "\u0926", "\u0927", "\u0928", "\u092a", "\u092b", "\u092c", "\u092d", "\u092e", "\u092f", "\u0930", "\u0932", "\u0933", "\u0935", "\u0936", "\u0937", "\u0938", "\u0939", "\u093d", "\u093e", "\u093f", "\u0940", "\u0941", "\u0942", "\u0943", "\u0944", "\u0947", "\u0948", "\u094b", "\u094c", "\u094d", "\u0960", "\u0962", " "]
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/zaonhe_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/zaonhe_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["shanghainese_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 2,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["1", "2"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "\u2026", "a", "b", "d", "f", "g", "h", "i", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "y", "z", "\u00f8", "\u014b", "\u0235", "\u0251", "\u0254", "\u0255", "\u0259", "\u0264", "\u0266", "\u026a", "\u027f", "\u0291", "\u0294", "\u02b0", "\u0303", "\u0329", "\u1d00", "\u1d07", "1", "5", "6", "7", "8", " "]
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 1,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"E:/uma_voice/output_train.txt.cleaned",
|
||||
"validation_files":"E:/uma_voice/output_val.txt.cleaned",
|
||||
"text_cleaners":["japanese_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 87,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["Special Week",
|
||||
"Silence Suzuka",
|
||||
"Tokai Teio",
|
||||
"Maruzensky",
|
||||
"Fuji Kiseki",
|
||||
"Oguri Cap",
|
||||
"Gold Ship",
|
||||
"Vodka",
|
||||
"Daiwa Scarlet",
|
||||
"Taiki Shuttle",
|
||||
"Grass Wonder",
|
||||
"Hishi Amazon",
|
||||
"Mejiro Mcqueen",
|
||||
"El Condor Pasa",
|
||||
"T.M. Opera O",
|
||||
"Narita Brian",
|
||||
"Symboli Rudolf",
|
||||
"Air Groove",
|
||||
"Agnes Digital",
|
||||
"Seiun Sky",
|
||||
"Tamamo Cross",
|
||||
"Fine Motion",
|
||||
"Biwa Hayahide",
|
||||
"Mayano Topgun",
|
||||
"Manhattan Cafe",
|
||||
"Mihono Bourbon",
|
||||
"Mejiro Ryan",
|
||||
"Hishi Akebono",
|
||||
"Yukino Bijin",
|
||||
"Rice Shower",
|
||||
"Ines Fujin",
|
||||
"Agnes Tachyon",
|
||||
"Admire Vega",
|
||||
"Inari One",
|
||||
"Winning Ticket",
|
||||
"Air Shakur",
|
||||
"Eishin Flash",
|
||||
"Curren Chan",
|
||||
"Kawakami Princess",
|
||||
"Gold City",
|
||||
"Sakura Bakushin O",
|
||||
"Seeking the Pearl",
|
||||
"Shinko Windy",
|
||||
"Sweep Tosho",
|
||||
"Super Creek",
|
||||
"Smart Falcon",
|
||||
"Zenno Rob Roy",
|
||||
"Tosen Jordan",
|
||||
"Nakayama Festa",
|
||||
"Narita Taishin",
|
||||
"Nishino Flower",
|
||||
"Haru Urara",
|
||||
"Bamboo Memory",
|
||||
"Biko Pegasus",
|
||||
"Marvelous Sunday",
|
||||
"Matikane Fukukitaru",
|
||||
"Mr. C.B.",
|
||||
"Meisho Doto",
|
||||
"Mejiro Dober",
|
||||
"Nice Nature",
|
||||
"King Halo",
|
||||
"Matikane Tannhauser",
|
||||
"Ikuno Dictus",
|
||||
"Mejiro Palmer",
|
||||
"Daitaku Helios",
|
||||
"Twin Turbo",
|
||||
"Satono Diamond",
|
||||
"Kitasan Black",
|
||||
"Sakura Chiyono O",
|
||||
"Sirius Symboli",
|
||||
"Mejiro Ardan",
|
||||
"Yaeno Muteki",
|
||||
"Tsurumaru Tsuyoshi",
|
||||
"Mejiro Bright",
|
||||
"Sakura Laurel",
|
||||
"Narita Top Road",
|
||||
"Yamanin Zephyr",
|
||||
"Symboli Kris S",
|
||||
"Tanino Gimlet",
|
||||
"Daiichi Ruby",
|
||||
"Aston Machan",
|
||||
"Hayakawa Tazuna",
|
||||
"KS Miracle",
|
||||
"Kopano Rickey",
|
||||
"Hoko Tarumae",
|
||||
"Wonder Acute",
|
||||
"President Akikawa"
|
||||
],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,142 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 16,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"../uma_voice/output_train.txt.cleaned",
|
||||
"validation_files":"../uma_voice/output_val.txt.cleaned",
|
||||
"text_cleaners":[],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 87
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 256,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"speakers": ["Special Week",
|
||||
"Silence Suzuka",
|
||||
"Tokai Teio",
|
||||
"Maruzensky",
|
||||
"Fuji Kiseki",
|
||||
"Oguri Cap",
|
||||
"Gold Ship",
|
||||
"Vodka",
|
||||
"Daiwa Scarlet",
|
||||
"Taiki Shuttle",
|
||||
"Grass Wonder",
|
||||
"Hishi Amazon",
|
||||
"Mejiro Mcqueen",
|
||||
"El Condor Pasa",
|
||||
"T.M. Opera O",
|
||||
"Narita Brian",
|
||||
"Symboli Rudolf",
|
||||
"Air Groove",
|
||||
"Agnes Digital",
|
||||
"Seiun Sky",
|
||||
"Tamamo Cross",
|
||||
"Fine Motion",
|
||||
"Biwa Hayahide",
|
||||
"Mayano Topgun",
|
||||
"Manhattan Cafe",
|
||||
"Mihono Bourbon",
|
||||
"Mejiro Ryan",
|
||||
"Hishi Akebono",
|
||||
"Yukino Bijin",
|
||||
"Rice Shower",
|
||||
"Ines Fujin",
|
||||
"Agnes Tachyon",
|
||||
"Admire Vega",
|
||||
"Inari One",
|
||||
"Winning Ticket",
|
||||
"Air Shakur",
|
||||
"Eishin Flash",
|
||||
"Curren Chan",
|
||||
"Kawakami Princess",
|
||||
"Gold City",
|
||||
"Sakura Bakushin O",
|
||||
"Seeking the Pearl",
|
||||
"Shinko Windy",
|
||||
"Sweep Tosho",
|
||||
"Super Creek",
|
||||
"Smart Falcon",
|
||||
"Zenno Rob Roy",
|
||||
"Tosen Jordan",
|
||||
"Nakayama Festa",
|
||||
"Narita Taishin",
|
||||
"Nishino Flower",
|
||||
"Haru Urara",
|
||||
"Bamboo Memory",
|
||||
"Biko Pegasus",
|
||||
"Marvelous Sunday",
|
||||
"Matikane Fukukitaru",
|
||||
"Mr. C.B.",
|
||||
"Meisho Doto",
|
||||
"Mejiro Dober",
|
||||
"Nice Nature",
|
||||
"King Halo",
|
||||
"Matikane Tannhauser",
|
||||
"Ikuno Dictus",
|
||||
"Mejiro Palmer",
|
||||
"Daitaku Helios",
|
||||
"Twin Turbo",
|
||||
"Satono Diamond",
|
||||
"Kitasan Black",
|
||||
"Sakura Chiyono O",
|
||||
"Sirius Symboli",
|
||||
"Mejiro Ardan",
|
||||
"Yaeno Muteki",
|
||||
"Tsurumaru Tsuyoshi",
|
||||
"Mejiro Bright",
|
||||
"Sakura Laurel",
|
||||
"Narita Top Road",
|
||||
"Yamanin Zephyr",
|
||||
"Symboli Kris S",
|
||||
"Tanino Gimlet",
|
||||
"Daiichi Ruby",
|
||||
"Aston Machan",
|
||||
"Hayakawa Tazuna",
|
||||
"KS Miracle",
|
||||
"Kopano Rickey",
|
||||
"Hoko Tarumae",
|
||||
"Wonder Acute",
|
||||
"President Akikawa"
|
||||
],
|
||||
"symbols": []
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 16,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.train.txt.cleaned",
|
||||
"validation_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.val.txt.cleaned",
|
||||
"text_cleaners":["cjke_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 999,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"train": {
|
||||
"segment_size": 8192
|
||||
},
|
||||
"data": {
|
||||
"text_cleaners":["japanese_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"add_blank": true,
|
||||
"n_speakers": 7
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u7dbe\u5730\u5be7\u3005", "\u56e0\u5e61\u3081\u3050\u308b", "\u671d\u6b66\u82b3\u4e43", "\u5e38\u9678\u8309\u5b50", "\u30e0\u30e9\u30b5\u30e1", "\u978d\u99ac\u5c0f\u6625", "\u5728\u539f\u4e03\u6d77"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/zero_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/zero_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["japanese_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 26,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u30eb\u30a4\u30ba", "\u30c6\u30a3\u30d5\u30a1\u30cb\u30a2", "\u30a4\u30eb\u30af\u30af\u30a5", "\u30a2\u30f3\u30ea\u30a8\u30c3\u30bf", "\u30bf\u30d0\u30b5", "\u30b7\u30a8\u30b9\u30bf", "\u30cf\u30eb\u30ca", "\u5c11\u5973\u30ea\u30b7\u30e5", "\u30ea\u30b7\u30e5", "\u30a2\u30ad\u30ca", "\u30af\u30ea\u30b9", "\u30ab\u30c8\u30ec\u30a2", "\u30a8\u30ec\u30aa\u30ce\u30fc\u30eb", "\u30e2\u30f3\u30e2\u30e9\u30f3\u30b7\u30fc", "\u30ea\u30fc\u30f4\u30eb", "\u30ad\u30e5\u30eb\u30b1", "\u30a6\u30a7\u30b6\u30ea\u30fc", "\u30b5\u30a4\u30c8", "\u30ae\u30fc\u30b7\u30e5", "\u30b3\u30eb\u30d9\u30fc\u30eb", "\u30aa\u30b9\u30de\u30f3", "\u30c7\u30eb\u30d5\u30ea\u30f3\u30ac\u30fc", "\u30c6\u30af\u30b9\u30c8", "\u30c0\u30f3\u30d7\u30ea\u30e1", "\u30ac\u30ec\u30c3\u30c8", "\u30b9\u30ab\u30ed\u30f3"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 32,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"filelists/mix_train_filelist.txt.cleaned",
|
||||
"validation_files":"filelists/mix_val_filelist.txt.cleaned",
|
||||
"text_cleaners":["zh_ja_mixture_cleaners"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 5,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"speakers": ["\u7dbe\u5730\u5be7\u3005", "\u5728\u539f\u4e03\u6d77", "\u5c0f\u8338", "\u5510\u4e50\u541f"],
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u02a6", "\u026f", "\u0279", "\u0259", "\u0265", "\u207c", "\u02b0", "`", "\u2192", "\u2193", "\u2191", " "]
|
||||
}
|
||||
+399
@@ -0,0 +1,399 @@
|
||||
import time
|
||||
import os
|
||||
import random
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
|
||||
import commons
|
||||
from mel_processing import spectrogram_torch
|
||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||
from text import text_to_sequence, cleaned_text_to_sequence
|
||||
|
||||
|
||||
class TextAudioLoader(torch.utils.data.Dataset):
    """
    Single-speaker dataset yielding (text, spec, wav) items.

    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """

    def __init__(self, audiopaths_and_text, hparams):
        """Load the file list, read settings from *hparams*, shuffle and filter.

        Args:
            audiopaths_and_text: Passed unchanged to ``load_filepaths_and_text``;
                each resulting entry is unpacked as ``(audiopath, text)``.
            hparams: Object exposing ``text_cleaners``, ``max_wav_value``,
                ``sampling_rate``, ``filter_length``, ``hop_length``,
                ``win_length`` and ``add_blank``.  ``cleaned_text``,
                ``min_text_len`` and ``max_text_len`` are optional and default
                to ``False``, ``1`` and ``190``.
        """
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length

        self.cleaned_text = getattr(hparams, "cleaned_text", False)

        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 190)

        # Fixed seed so the shuffled order is reproducible.  Note that this
        # reseeds the process-wide ``random`` module.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
        self._filter()

    def _filter(self):
        """
        Filter text & store spec lengths
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length

        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text in self.audiopaths_and_text:
            # Only entries whose text length is inside the configured range
            # are kept; the file size is read only for those entries.
            if self.min_text_len <= len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_audio_text_pair(self, audiopath_and_text):
        """Return ``(text, spec, wav)`` for one ``[audiopath, text]`` entry."""
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        spec, wav = self.get_audio(audiopath)
        return (text, spec, wav)

    def _check_sampling_rate(self, filename, sampling_rate):
        """Raise ``ValueError`` if *sampling_rate* differs from the configured rate."""
        if sampling_rate != self.sampling_rate:
            # The template has three placeholders; the filename is supplied as
            # the first one so that formatting the message cannot itself fail.
            raise ValueError("{} {} SR doesn't match target {} SR".format(
                filename, sampling_rate, self.sampling_rate))

    def get_audio(self, filename):
        """Load *filename* and return ``(spec, audio_norm)``.

        ``audio_norm`` is the loaded audio divided by ``max_wav_value`` with a
        new leading dimension.  The spectrogram is loaded from a cache file
        next to the audio when one exists; otherwise it is computed with
        ``spectrogram_torch`` and written to that cache file.

        Raises:
            ValueError: If the file's sampling rate differs from
                ``self.sampling_rate``.
        """
        audio, sampling_rate = load_wav_to_torch(filename)
        self._check_sampling_rate(filename, sampling_rate)
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        # NOTE(review): str.replace rewrites every ".wav" occurrence in the
        # path and leaves a path without ".wav" unchanged -- confirm that all
        # inputs end in ".wav" and contain it only once.
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            spec = torch.load(spec_filename)
        else:
            spec = spectrogram_torch(audio_norm, self.filter_length,
                                     self.sampling_rate, self.hop_length, self.win_length,
                                     center=False)
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename)
        return spec, audio_norm

    def get_text(self, text):
        """Convert *text* to a ``LongTensor`` of symbol ids.

        Uses ``cleaned_text_to_sequence`` when ``self.cleaned_text`` is set,
        otherwise ``text_to_sequence`` with ``self.text_cleaners``.  When
        ``self.add_blank`` is set the ids are interspersed with ``0``.
        """
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def __getitem__(self, index):
        """Return the ``(text, spec, wav)`` item at *index*."""
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        """Return the number of entries that survived filtering."""
        return len(self.audiopaths_and_text)
|
||||
|
||||
|
||||
class TextAudioCollate():
    """ Zero-pads model inputs and targets
    """

    def __init__(self, return_ids=False):
        """Store whether ``__call__`` should also return the sort permutation."""
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate a training batch from normalized text and audio.

        PARAMS
        ------
        batch: [text_normalized, spec_normalized, wav_normalized]

        RETURNS
        -------
        ``(text_padded, text_lengths, spec_padded, spec_lengths, wav_padded,
        wav_lengths)``, with ``ids_sorted_decreasing`` appended when
        ``return_ids`` is true.  Rows are ordered by decreasing spectrogram
        length (``spec.size(1)``) and right-padded with zeros.
        """
        # Permutation that orders the items by decreasing spectrogram length.
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        n_items = len(batch)
        max_text_len = max(len(x[0]) for x in batch)
        max_spec_len = max(x[1].size(1) for x in batch)
        max_wav_len = max(x[2].size(1) for x in batch)

        text_lengths = torch.zeros(n_items, dtype=torch.long)
        spec_lengths = torch.zeros(n_items, dtype=torch.long)
        wav_lengths = torch.zeros(n_items, dtype=torch.long)

        # Zero-initialised buffers: whatever is not overwritten below is the
        # right-hand padding.
        text_padded = torch.zeros(n_items, max_text_len, dtype=torch.long)
        spec_padded = torch.zeros(n_items, batch[0][1].size(0), max_spec_len,
                                  dtype=torch.float32)
        wav_padded = torch.zeros(n_items, 1, max_wav_len, dtype=torch.float32)

        for i, src in enumerate(ids_sorted_decreasing.tolist()):
            row = batch[src]

            text = row[0]
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

            spec = row[1]
            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav = row[2]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

        if self.return_ids:
            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths
|
||||
|
||||
|
||||
"""Multi speaker version"""
|
||||
|
||||
|
||||
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
    """
    Multi-speaker dataset yielding (text, spec, wav, sid) items.

    1) loads audio, speaker_id, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """

    # Prefix prepended to every audio path from the file list when the
    # hyper-parameters do not provide ``audio_root``.  This is the value that
    # used to be hard-coded in ``_filter``.
    DEFAULT_AUDIO_ROOT = "E:/uma_voice/"

    def __init__(self, audiopaths_sid_text, hparams):
        """Load the file list, read settings from *hparams*, shuffle and filter.

        Args:
            audiopaths_sid_text: Passed unchanged to
                ``load_filepaths_and_text``; each resulting entry is unpacked
                as ``(audiopath, sid, text)``.
            hparams: Object exposing ``text_cleaners``, ``max_wav_value``,
                ``sampling_rate``, ``filter_length``, ``hop_length``,
                ``win_length`` and ``add_blank``.  ``cleaned_text``,
                ``min_text_len``, ``max_text_len`` and ``audio_root`` are
                optional and default to ``False``, ``1``, ``190`` and
                ``DEFAULT_AUDIO_ROOT``.
        """
        self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length

        self.cleaned_text = getattr(hparams, "cleaned_text", False)

        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 190)

        # Plain string prefix (no separator is inserted), configurable so the
        # dataset is not tied to one machine's drive layout.
        self.audio_root = getattr(hparams, "audio_root", self.DEFAULT_AUDIO_ROOT)

        # Fixed seed so the shuffled order is reproducible.  Note that this
        # reseeds the process-wide ``random`` module.
        random.seed(1234)
        random.shuffle(self.audiopaths_sid_text)
        self._filter()

    def _filter(self):
        """
        Filter text & store spec lengths
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length

        audiopaths_sid_text_new = []
        lengths = []
        for audiopath, sid, text in self.audiopaths_sid_text:
            audiopath = self.audio_root + audiopath
            # Only entries whose text length is inside the configured range
            # are kept; the file size is read only for those entries.
            if self.min_text_len <= len(text) <= self.max_text_len:
                audiopaths_sid_text_new.append([audiopath, sid, text])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_sid_text = audiopaths_sid_text_new
        self.lengths = lengths

    def get_audio_text_speaker_pair(self, audiopath_sid_text):
        """Return ``(text, spec, wav, sid)`` for one ``[audiopath, sid, text]`` entry."""
        # separate filename, speaker_id and text
        audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
        text = self.get_text(text)
        spec, wav = self.get_audio(audiopath)
        sid = self.get_sid(sid)
        return (text, spec, wav, sid)

    def _check_sampling_rate(self, filename, sampling_rate):
        """Raise ``ValueError`` if *sampling_rate* differs from the configured rate."""
        if sampling_rate != self.sampling_rate:
            # The template has three placeholders; the filename is supplied as
            # the first one so that formatting the message cannot itself fail.
            raise ValueError("{} {} SR doesn't match target {} SR".format(
                filename, sampling_rate, self.sampling_rate))

    def get_audio(self, filename):
        """Load *filename* and return ``(spec, audio_norm)``.

        ``audio_norm`` is the loaded audio divided by ``max_wav_value`` with a
        new leading dimension.  The spectrogram is loaded from a cache file
        next to the audio when one exists; otherwise it is computed with
        ``spectrogram_torch`` and written to that cache file.

        Raises:
            ValueError: If the file's sampling rate differs from
                ``self.sampling_rate``.
        """
        audio, sampling_rate = load_wav_to_torch(filename)
        self._check_sampling_rate(filename, sampling_rate)
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        # NOTE(review): str.replace rewrites every ".wav" occurrence in the
        # path and leaves a path without ".wav" unchanged -- confirm that all
        # inputs end in ".wav" and contain it only once.
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            spec = torch.load(spec_filename)
        else:
            spec = spectrogram_torch(audio_norm, self.filter_length,
                                     self.sampling_rate, self.hop_length, self.win_length,
                                     center=False)
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename)
        return spec, audio_norm

    def get_text(self, text):
        """Convert *text* to a ``LongTensor`` of symbol ids.

        Uses ``cleaned_text_to_sequence`` when ``self.cleaned_text`` is set,
        otherwise ``text_to_sequence`` with ``self.text_cleaners``.  When
        ``self.add_blank`` is set the ids are interspersed with ``0``.
        """
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def get_sid(self, sid):
        """Return *sid* converted with ``int`` as a one-element ``LongTensor``."""
        sid = torch.LongTensor([int(sid)])
        return sid

    def __getitem__(self, index):
        """Return the ``(text, spec, wav, sid)`` item at *index*."""
        return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])

    def __len__(self):
        """Return the number of entries that survived filtering."""
        return len(self.audiopaths_sid_text)
|
||||
|
||||
|
||||
class TextAudioSpeakerCollate():
    """ Zero-pads model inputs and targets
    """

    def __init__(self, return_ids=False):
        """Store whether ``__call__`` should also return the sort permutation."""
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collate a training batch from normalized text, audio and speaker identities.

        PARAMS
        ------
        batch: [text_normalized, spec_normalized, wav_normalized, sid]

        RETURNS
        -------
        ``(text_padded, text_lengths, spec_padded, spec_lengths, wav_padded,
        wav_lengths, sid)``, with ``ids_sorted_decreasing`` appended when
        ``return_ids`` is true.  Rows are ordered by decreasing spectrogram
        length (``spec.size(1)``) and right-padded with zeros.
        """
        # Permutation that orders the items by decreasing spectrogram length.
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        n_items = len(batch)
        max_text_len = max(len(x[0]) for x in batch)
        max_spec_len = max(x[1].size(1) for x in batch)
        max_wav_len = max(x[2].size(1) for x in batch)

        text_lengths = torch.zeros(n_items, dtype=torch.long)
        spec_lengths = torch.zeros(n_items, dtype=torch.long)
        wav_lengths = torch.zeros(n_items, dtype=torch.long)
        sid = torch.zeros(n_items, dtype=torch.long)

        # Zero-initialised buffers: whatever is not overwritten below is the
        # right-hand padding.
        text_padded = torch.zeros(n_items, max_text_len, dtype=torch.long)
        spec_padded = torch.zeros(n_items, batch[0][1].size(0), max_spec_len,
                                  dtype=torch.float32)
        wav_padded = torch.zeros(n_items, 1, max_wav_len, dtype=torch.float32)

        for i, src in enumerate(ids_sorted_decreasing.tolist()):
            row = batch[src]

            text = row[0]
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

            spec = row[1]
            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav = row[2]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            sid[i] = row[3]

        if self.return_ids:
            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
|
||||
|
||||
|
||||
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.

    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.

    Buckets that end up empty are dropped and their upper boundary is removed,
    which merges the empty range into the following bucket.
    """

    def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
        """
        Args:
            dataset: dataset exposing a ``lengths`` sequence, one entry per sample.
            batch_size: number of samples per batch on each replica.
            boundaries: increasing length boundaries delimiting the buckets.
            num_replicas, rank, shuffle: forwarded to ``DistributedSampler``.
        """
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        # Copy: _create_buckets() prunes this list in place and must not
        # modify the list object owned by the caller.
        self.boundaries = list(boundaries)

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        """Assign sample indices to buckets and compute padded bucket sizes.

        Returns:
            ``(buckets, num_samples_per_bucket)`` where each bucket size is
            rounded up to a multiple of ``num_replicas * batch_size``.
        """
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for i, length in enumerate(self.lengths):
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        # Walk from the last bucket down to index 0 inclusive. Stopping at
        # index 1 left an empty first bucket in place, and __iter__ then
        # divided by its zero length.
        for i in range(len(buckets) - 1, -1, -1):
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        total_batch_size = self.num_replicas * self.batch_size
        num_samples_per_bucket = []
        for bucket in buckets:
            len_bucket = len(bucket)
            rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket

    def __iter__(self):
        """Return an iterator over this replica's batches (lists of dataset indices)."""
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        if self.shuffle:
            indices = [torch.randperm(len(bucket), generator=g).tolist() for bucket in self.buckets]
        else:
            indices = [list(range(len(bucket))) for bucket in self.buckets]

        batches = []
        for bucket, ids_bucket, num_samples_bucket in zip(
                self.buckets, indices, self.num_samples_per_bucket):
            len_bucket = len(bucket)

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]

            # subsample
            ids_bucket = ids_bucket[self.rank::self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        """Return the index of the bucket with boundaries[i] < x <= boundaries[i + 1], or -1."""
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1

    def __len__(self):
        """Return the number of batches produced per replica."""
        return self.num_samples // self.batch_size
|
||||
@@ -0,0 +1,61 @@
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
import commons
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between feature maps.

    ``fmap_r`` and ``fmap_g`` are iterated pairwise as sequences of
    sequences of tensors. The first argument's tensors are detached, so
    gradients flow only through ``fmap_g``.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            diff = real.float().detach() - gen.float()
            total = total + torch.mean(torch.abs(diff))
    return total * 2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Sum the squared-error discriminator loss over paired outputs.

    For each pair the real term is ``mean((1 - dr) ** 2)`` and the generated
    term is ``mean(dg ** 2)``.

    Returns:
        ``(loss, r_losses, g_losses)``: the accumulated loss and the
        per-pair real / generated terms as Python floats.
    """
    loss = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1 - real_out.float()) ** 2)
        g_loss = torch.mean(gen_out.float() ** 2)
        loss = loss + (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())
    return loss, r_losses, g_losses
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
    """Sum ``mean((1 - dg) ** 2)`` over the discriminator outputs.

    Returns:
        ``(loss, gen_losses)``: the accumulated loss and the list of
        per-output loss tensors.
    """
    loss = 0
    gen_losses = []
    for disc_out in disc_outputs:
        term = torch.mean((1 - disc_out.float()) ** 2)
        gen_losses.append(term)
        loss = loss + term
    return loss, gen_losses
|
||||
|
||||
|
||||
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    Masked mean of logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p)^2 * exp(-2 * logs_p).

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    All inputs are cast to float; the sum over masked elements is divided by
    the sum of ``z_mask``.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (
        t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask)
    )

    log_ratio = logs_p - logs_q - 0.5
    quadratic = 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2. * logs_p)
    masked_total = torch.sum((log_ratio + quadratic) * z_mask)
    return masked_total / torch.sum(z_mask)
|
||||
@@ -0,0 +1,112 @@
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
import torch
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
import librosa.util as librosa_util
|
||||
from librosa.util import normalize, pad_center, tiny
|
||||
from scipy.signal import get_window
|
||||
from scipy.io.wavfile import read
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
# 32768.0 == 2 ** 15; presumably the full-scale magnitude of 16-bit PCM used
# to normalize waveforms -- TODO confirm against the callers.
MAX_WAV_VALUE = 32768.0
|
||||
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    Log-compress ``x`` after clamping it from below.

    PARAMS
    ------
    x: input tensor
    C: compression factor
    clip_val: lower bound applied to ``x`` before scaling, keeping the
        logarithm finite
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)
|
||||
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
    """
    Invert the log compression: ``exp(x) / C``.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
|
||||
|
||||
|
||||
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
    return dynamic_range_compression_torch(magnitudes)
|
||||
|
||||
|
||||
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
    return dynamic_range_decompression_torch(magnitudes)
|
||||
|
||||
|
||||
# Module-level cache of mel filterbank tensors, filled lazily by
# spec_to_mel_torch and mel_spectrogram_torch.
mel_basis = {}
|
||||
# Module-level cache of Hann window tensors, filled lazily by
# spectrogram_torch and mel_spectrogram_torch.
hann_window = {}
|
||||
|
||||
|
||||
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute the linear magnitude spectrogram of ``y``.

    The signal is reflect-padded by ``(n_fft - hop_size) / 2`` samples on
    both sides, transformed with ``torch.stft`` using a cached Hann window,
    and reduced to ``sqrt(re^2 + im^2 + 1e-6)``.

    Args:
        y: waveform tensor; the ``unsqueeze(1)`` + 1-D reflect pad below
            implies a 2-D input, presumably (batch, time) -- TODO confirm.
        n_fft: FFT size.
        sampling_rate: accepted for interface compatibility; not used here.
        hop_size: hop length in samples.
        win_size: window length in samples.
        center: forwarded to ``torch.stft``.

    Returns:
        Magnitude spectrogram tensor.
    """
    # Out-of-range samples are only reported, not clipped.
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # torch.stft needs return_complex stated explicitly for real inputs on
    # current PyTorch; view_as_real restores the trailing (real, imag) axis
    # that the magnitude computation below relies on.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
|
||||
|
||||
|
||||
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto a mel filterbank and log-compress it.

    The filterbank is built once per parameter/dtype/device combination with
    ``librosa_mel_fn`` and cached in the module-level ``mel_basis`` dict.

    Args:
        spec: linear spectrogram; it is left-multiplied by the filterbank.
        n_fft, num_mels, sampling_rate, fmin, fmax: filterbank parameters.

    Returns:
        ``spectral_normalize_torch`` applied to the mel-projected spectrogram.
    """
    global mel_basis
    # The key covers every filterbank parameter: keying on fmax alone would
    # reuse a stale basis when another parameter changes between calls.
    cache_key = '_'.join(
        str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, spec.dtype, spec.device)
    )
    if cache_key not in mel_basis:
        # Keyword arguments: newer librosa releases reject positional ones.
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[cache_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[cache_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec
|
||||
|
||||
|
||||
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram directly from a waveform.

    The signal is reflect-padded by ``(n_fft - hop_size) / 2`` samples on
    both sides, transformed with ``torch.stft`` using a cached Hann window,
    reduced to ``sqrt(re^2 + im^2 + 1e-6)``, projected onto a cached mel
    filterbank and passed through ``spectral_normalize_torch``.

    Args:
        y: waveform tensor; the ``unsqueeze(1)`` + 1-D reflect pad below
            implies a 2-D input, presumably (batch, time) -- TODO confirm.
        n_fft, num_mels, sampling_rate, fmin, fmax: filterbank parameters.
        hop_size: hop length in samples.
        win_size: window length in samples.
        center: forwarded to ``torch.stft``.

    Returns:
        Log-compressed mel spectrogram tensor.
    """
    # Out-of-range samples are only reported, not clipped.
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    # The key covers every filterbank parameter: keying on fmax alone would
    # reuse a stale basis when another parameter changes between calls.
    mel_key = '_'.join(
        str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, dtype_device)
    )
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if mel_key not in mel_basis:
        # Keyword arguments: newer librosa releases reject positional ones.
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # torch.stft needs return_complex stated explicitly for real inputs on
    # current PyTorch; view_as_real restores the trailing (real, imag) axis
    # that the magnitude computation below relies on.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
||||
@@ -0,0 +1,590 @@
|
||||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
import commons
|
||||
import modules
|
||||
import attentions
|
||||
import monotonic_align
|
||||
|
||||
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from commons import init_weights, get_padding
|
||||
|
||||
|
||||
class StochasticDurationPredictor(nn.Module):
    """Flow-based duration predictor.

    With ``reverse=False`` ``forward`` takes durations ``w`` and returns
    ``nll + logq`` per batch item. With ``reverse=True`` it samples noise,
    runs the main flows backwards and returns the first output channel as
    ``logw``.
    """

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        # The filter_channels argument is overridden by in_channels.
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        # Main flow stack: one affine layer, then n_flows (ConvFlow, Flip) pairs.
        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for _ in range(n_flows):
            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.flows.append(modules.Flip())

        # Posterior branch: encodes w and holds a fixed stack of four flow pairs.
        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for _ in range(4):
            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.post_flows.append(modules.Flip())

        # Conditioning branch applied to the input x.
        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        """Return ``nll + logq`` (``reverse=False``) or sampled ``logw`` (``reverse=True``).

        ``x`` and ``g`` are detached, so no gradient reaches their producers.
        ``w`` is required when ``reverse`` is false.
        """
        x = self.pre(torch.detach(x))
        if g is not None:
            x = x + self.cond(torch.detach(g))
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if reverse:
            inverse_flows = list(reversed(self.flows))
            inverse_flows = inverse_flows[:-2] + [inverse_flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            for flow in inverse_flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            z0, z1 = torch.split(z, [1, 1], 1)
            return z0

        assert w is not None

        # Posterior pass.
        logdet_tot_q = 0
        h_w = self.post_pre(w)
        h_w = self.post_convs(h_w, x_mask)
        h_w = self.post_proj(h_w) * x_mask
        e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
        z_q = e_q
        for post_flow in self.post_flows:
            z_q, logdet_q = post_flow(z_q, x_mask, g=(x + h_w))
            logdet_tot_q = logdet_tot_q + logdet_q
        z_u, z1 = torch.split(z_q, [1, 1], 1)
        u = torch.sigmoid(z_u) * x_mask
        z0 = (w - u) * x_mask
        logdet_tot_q = logdet_tot_q + torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
        logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

        # Main flow pass.
        logdet_tot = 0
        z0, logdet = self.log_flow(z0, x_mask)
        logdet_tot = logdet_tot + logdet
        z = torch.cat([z0, z1], 1)
        for flow in self.flows:
            z, logdet = flow(z, x_mask, g=x, reverse=reverse)
            logdet_tot = logdet_tot + logdet
        nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
        return nll + logq  # [b]
|
||||
|
||||
|
||||
class DurationPredictor(nn.Module):
    """Two conv/ReLU/LayerNorm/dropout stages followed by a 1x1 projection to one channel."""

    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        same_pad = kernel_size // 2
        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=same_pad)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=same_pad)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        """Return the masked single-channel prediction; ``x`` and ``g`` are detached first."""
        x = torch.detach(x)
        if g is not None:
            x = x + self.cond(torch.detach(g))

        x = self.drop(self.norm_1(torch.relu(self.conv_1(x * x_mask))))
        x = self.drop(self.norm_2(torch.relu(self.conv_2(x * x_mask))))

        x = self.proj(x * x_mask)
        return x * x_mask
|
||||
|
||||
|
||||
class TextEncoder(nn.Module):
    """Text encoder: optional embedding, attention encoder, and a 1x1 projection to (m, logs)."""

    def __init__(self,
                 n_vocab,
                 out_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 emotion_embedding):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emotion_embedding = emotion_embedding

        # NOTE(review): indentation is missing in the reviewed text; the
        # emotion projection is assumed to sit inside the n_vocab branch.
        if self.n_vocab != 0:
            self.emb = nn.Embedding(n_vocab, hidden_channels)
            if emotion_embedding:
                # 1024 is the fixed input width of the emotion vector.
                self.emo_proj = nn.Linear(1024, hidden_channels)
            nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, emotion_embedding=None):
        """Encode ``x`` and return ``(x, m, logs, x_mask)``.

        When ``emotion_embedding`` is given it is projected by ``emo_proj``
        and added to every position of ``x``.
        """
        if self.n_vocab != 0:
            x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
        if emotion_embedding is not None:
            emotion = self.emo_proj(emotion_embedding.unsqueeze(1))
            x = x + emotion
        x = torch.transpose(x, 1, -1)  # [b, h, t]

        mask_2d = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(mask_2d, 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        # First out_channels channels are m, the rest logs.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs, runnable in both directions."""

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = modules.ResidualCouplingLayer(
                channels, hidden_channels, kernel_size, dilation_rate, n_layers,
                gin_channels=gin_channels, mean_only=True)
            self.flows.append(coupling)
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows in order (``reverse=False``) or in reversed order (``reverse=True``)."""
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        # Forward flows return (output, logdet); the logdet is discarded.
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
|
||||
|
||||
|
||||
class PosteriorEncoder(nn.Module):
    """1x1 conv, WN encoder and 1x1 projection producing (m, logs) and a sampled latent."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` with ``z = (m + noise * exp(logs)) * x_mask``."""
        mask_2d = commons.sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(mask_2d, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
    """Upsampling decoder: transposed-conv stages, each followed by averaged residual blocks."""

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
                 upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        # The string '1' selects ResBlock1; any other value selects ResBlock2.
        if resblock == '1':
            resblock = modules.ResBlock1
        else:
            resblock = modules.ResBlock2

        # Each stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2 ** stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            self.ups.append(weight_norm(
                ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilation in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(resblock(ch, kernel, dilation))

        # ch holds the channel count of the last stage here.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        """Decode ``x`` (optionally conditioned on ``g``) into a tanh-bounded single-channel output."""
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[stage](x)
            first = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        # The final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalization from the upsampling layers and residual blocks."""
        print('Removing weight norm...')
        for up in self.ups:
            # Resolves to the module-level torch.nn.utils function, not this method.
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
|
||||
|
||||
|
||||
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal into (frames, period) and applies 2-D convolutions."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        # Equality with False is kept on purpose: only values equal to False
        # select weight_norm.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        pad = (get_padding(kernel_size, 1), 0)
        strided_channels = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in strided_channels
        ]
        # The last layer of the stack uses stride 1.
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, fmap)`` where ``fmap`` lists every layer's output."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
    """Discriminator built from a stack of (grouped) 1-D convolutions over the raw signal."""

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        # Equality with False is kept on purpose: only values equal to False
        # select weight_norm.
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
            for c_in, c_out, kernel, stride, groups, pad in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, fmap)`` where ``fmap`` lists every layer's output."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by ``DiscriminatorP`` instances for periods 2, 3, 5, 7, 11."""

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            discs.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        """Run every discriminator on ``y`` and ``y_hat``.

        Returns:
            ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)``: per-discriminator outputs
            and feature maps for ``y`` and ``y_hat`` respectively.
        """
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            out_r, feats_r = disc(y)
            out_g, feats_g = disc(y_hat)
            y_d_rs.append(out_r)
            y_d_gs.append(out_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class SynthesizerTrn(nn.Module):
|
||||
"""
|
||||
Synthesizer for Training
|
||||
"""
|
||||
|
||||
def __init__(self,
             n_vocab,
             spec_channels,
             segment_size,
             inter_channels,
             hidden_channels,
             filter_channels,
             n_heads,
             n_layers,
             kernel_size,
             p_dropout,
             resblock,
             resblock_kernel_sizes,
             resblock_dilation_sizes,
             upsample_rates,
             upsample_initial_channel,
             upsample_kernel_sizes,
             n_speakers=0,
             gin_channels=0,
             use_sdp=True,
             emotion_embedding=False,
             **kwargs):
    """Store the hyper-parameters and build the sub-modules.

    Builds the text encoder (``enc_p``), decoder (``dec``), posterior
    encoder (``enc_q``), flow (``flow``) and duration predictor (``dp``,
    stochastic when ``use_sdp`` is true). A speaker embedding ``emb_g`` is
    created only when ``n_speakers > 1``. Extra keyword arguments are
    accepted and ignored.
    """
    super().__init__()

    # Text-encoder hyper-parameters.
    self.n_vocab = n_vocab
    self.spec_channels = spec_channels
    self.inter_channels = inter_channels
    self.hidden_channels = hidden_channels
    self.filter_channels = filter_channels
    self.n_heads = n_heads
    self.n_layers = n_layers
    self.kernel_size = kernel_size
    self.p_dropout = p_dropout

    # Decoder hyper-parameters.
    self.resblock = resblock
    self.resblock_kernel_sizes = resblock_kernel_sizes
    self.resblock_dilation_sizes = resblock_dilation_sizes
    self.upsample_rates = upsample_rates
    self.upsample_initial_channel = upsample_initial_channel
    self.upsample_kernel_sizes = upsample_kernel_sizes

    self.segment_size = segment_size
    self.n_speakers = n_speakers
    self.gin_channels = gin_channels

    self.use_sdp = use_sdp

    # Sub-modules are created in the same order as before so that parameter
    # registration order is unchanged.
    self.enc_p = TextEncoder(n_vocab,
                             inter_channels,
                             hidden_channels,
                             filter_channels,
                             n_heads,
                             n_layers,
                             kernel_size,
                             p_dropout,
                             emotion_embedding)
    self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
                         upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
    self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
                                  gin_channels=gin_channels)
    self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)

    if use_sdp:
        self.dp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
    else:
        self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

    if n_speakers > 1:
        self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
||||
|
||||
def forward(self, x, x_lengths, y, y_lengths, sid=None, emotion_embedding=None):
    """Training pass.

    Encodes ``x`` (prior) and ``y`` (posterior), aligns them with
    ``monotonic_align.maximum_path``, computes the duration loss and decodes
    a random latent slice of ``segment_size`` frames.

    Returns:
        ``(o, l_length, attn, ids_slice, x_mask, y_mask,
        (z, z_p, m_p, logs_p, m_q, logs_q))``.
    """
    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
    # Speaker conditioning exists only for multi-speaker models.
    g = self.emb_g(sid).unsqueeze(-1) if self.n_speakers > 1 else None  # [b, h, 1]

    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
    z_p = self.flow(z, y_mask, g=g)

    with torch.no_grad():
        # negative cross-entropy
        s_p_sq_r = torch.exp(-2 * logs_p)  # [b, d, t]
        z_p_t = z_p.transpose(1, 2)
        neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True)  # [b, 1, t_s]
        # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
        neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r)
        # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
        neg_cent3 = torch.matmul(z_p_t, (m_p * s_p_sq_r))
        neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True)  # [b, 1, t_s]
        neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4

        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()

    # Durations from the alignment: attn summed over dim 2.
    w = attn.sum(2)
    if self.use_sdp:
        l_length = self.dp(x, x_mask, w, g=g)
        l_length = l_length / torch.sum(x_mask)
    else:
        logw_ = torch.log(w + 1e-6) * x_mask
        logw = self.dp(x, x_mask, g=g)
        l_length = torch.sum((logw - logw_) ** 2, [1, 2]) / torch.sum(x_mask)  # for averaging

    # expand prior
    path = attn.squeeze(1)
    m_p = torch.matmul(path, m_p.transpose(1, 2)).transpose(1, 2)
    logs_p = torch.matmul(path, logs_p.transpose(1, 2)).transpose(1, 2)

    z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
    o = self.dec(z_slice, g=g)
    return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
|
||||
|
||||
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
          emotion_embedding=None):
    """Synthesize audio from text with predicted durations.

    ``noise_scale`` scales the prior sampling noise, ``noise_scale_w`` the
    stochastic duration predictor noise, ``length_scale`` multiplies the
    predicted durations, and ``max_len`` truncates the latent before decoding.

    Returns:
        ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``.
    """
    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
    # Speaker conditioning exists only for multi-speaker models.
    g = self.emb_g(sid).unsqueeze(-1) if self.n_speakers > 1 else None  # [b, h, 1]

    if self.use_sdp:
        logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
    else:
        logw = self.dp(x, x_mask, g=g)

    # Durations are rounded up; the output length is at least 1.
    w = torch.exp(logw) * x_mask * length_scale
    w_ceil = torch.ceil(w)
    y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
    y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
    attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
    attn = commons.generate_path(w_ceil, attn_mask)

    # [b, t', t], [b, t, d] -> [b, d, t']
    path = attn.squeeze(1)
    m_p = torch.matmul(path, m_p.transpose(1, 2)).transpose(1, 2)
    logs_p = torch.matmul(path, logs_p.transpose(1, 2)).transpose(1, 2)

    z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
    z = self.flow(z_p, y_mask, g=g, reverse=True)
    o = self.dec((z * y_mask)[:, :, :max_len], g=g)
    return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
||||
|
||||
def predict_duration(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
                     emotion_embedding=None):
    """Predict the rounded-up duration of every input position.

    Only the prior encoder and the duration predictor are run; nothing is
    decoded. ``noise_scale`` and ``max_len`` are accepted but not used here.

    Returns:
        A list built from the squeezed ``w_ceil`` tensor (one entry per
        element along its first remaining dimension).
    """
    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
    g = self.emb_g(sid).unsqueeze(-1) if self.n_speakers > 1 else None  # [b, h, 1]

    if self.use_sdp:
        logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
    else:
        logw = self.dp(x, x_mask, g=g)

    durations = torch.exp(logw) * x_mask * length_scale
    return list(torch.ceil(durations).squeeze())
|
||||
|
||||
def infer_with_duration(self, x, x_lengths, w_ceil, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
                        emotion_embedding=None):
    """Synthesize output using caller-supplied durations instead of predicted ones.

    Args:
        x, x_lengths, emotion_embedding: forwarded to ``self.enc_p``.
        w_ceil: sequence of per-position durations; its length must equal
            the encoder output length ``x.shape[2]``. It is reshaped to
            ``(1, 1, -1)``, so a single utterance is assumed.
        sid: speaker id, used only when ``self.n_speakers > 1``.
        noise_scale: scale of the noise added when sampling ``z_p``.
        length_scale, noise_scale_w: accepted for signature parity with
            ``infer``; not used here.
        max_len: optional cap on the number of frames passed to the decoder.

    Returns:
        ``(o, attn, y_mask, (z, z_p, m_p, logs_p))``

    Raises:
        AssertionError: if ``len(w_ceil)`` does not match ``x.shape[2]``.
    """
    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
    if self.n_speakers > 1:
        g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
    else:
        g = None

    assert len(w_ceil) == x.shape[2], \
        "w_ceil has %d entries but the encoded input has %d positions" % (len(w_ceil), x.shape[2])
    # Build the duration tensor on the same device as the encoder output so
    # the mask products below do not mix CPU and GPU tensors.
    w_ceil = torch.FloatTensor(w_ceil).reshape(1, 1, -1).to(x_mask.device)
    y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
    y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
    attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
    attn = commons.generate_path(w_ceil, attn_mask)

    # [b, t', t], [b, t, d] -> [b, d, t']
    m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
    logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)

    z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
    z = self.flow(z_p, y_mask, g=g, reverse=True)
    o = self.dec((z * y_mask)[:, :, :max_len], g=g)
    return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
||||
|
||||
def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
    """Re-render ``y`` with the target speaker's embedding.

    ``y`` is encoded by ``self.enc_q`` under the source speaker embedding,
    pushed forward through the flow with the source embedding, pulled back
    with the target embedding, and decoded with the target embedding.

    Returns:
        ``(o_hat, y_mask, (z, z_p, z_hat))``
    """
    assert self.n_speakers > 1, "n_speakers have to be larger than 1."
    source_emb = self.emb_g(sid_src).unsqueeze(-1)
    target_emb = self.emb_g(sid_tgt).unsqueeze(-1)

    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=source_emb)
    z_p = self.flow(z, y_mask, g=source_emb)
    z_hat = self.flow(z_p, y_mask, g=target_emb, reverse=True)

    o_hat = self.dec(z_hat * y_mask, g=target_emb)
    return o_hat, y_mask, (z, z_p, z_hat)
|
||||
@@ -0,0 +1,19 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
from .monotonic_align.core import maximum_path_c
|
||||
|
||||
|
||||
def maximum_path(neg_cent, mask):
    """ Cython optimized version.
    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]

    The scores are copied to a float32 NumPy array, the path is computed in
    place by ``maximum_path_c``, and the result is returned as a tensor with
    the device and dtype of ``neg_cent``.
    """
    out_device, out_dtype = neg_cent.device, neg_cent.dtype

    scores = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(scores.shape, dtype=np.int32)

    # Per-item valid extents, read off the first column / first row of the mask sums.
    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)

    maximum_path_c(path, scores, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=out_device, dtype=out_dtype)
|
||||
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+21299
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,42 @@
|
||||
cimport cython
|
||||
from cython.parallel import prange
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    # Dynamic-programming search for the best monotonic path through `value`.
    #
    # Forward pass: value[y, x] is accumulated in place with the better of
    # "stay on x" (value[y-1, x]) and "advance from x-1" (value[y-1, x-1]).
    # Backward pass: starting at column t_x - 1 on the last row, mark one
    # cell per row in `path`, stepping one column left whenever required.
    #
    # `value` is modified in place; `path` is expected to be zero-filled.
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef int index = t_x - 1

    # Nothing to align; also avoids writing path[y, -1] with bounds checks off.
    if t_x <= 0 or t_y <= 0:
        return

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        # On row 0 there is no previous row to compare against; skipping the
        # test avoids reading value[-1, ...] with wraparound disabled. The
        # decrement would have no effect there since the loop ends.
        if y != 0 and index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1
|
||||
|
||||
|
||||
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    # Run maximum_path_each on every item of the leading dimension, in a
    # prange loop. Item k uses extents t_ys[k] and t_xs[k].
    cdef int n_items = paths.shape[0]
    cdef int k
    for k in prange(n_items, nogil=True):
        maximum_path_each(paths[k], values[k], t_ys[k], t_xs[k])
|
||||
Binary file not shown.
@@ -0,0 +1,9 @@
|
||||
from distutils.core import setup
|
||||
from Cython.Build import cythonize
|
||||
import numpy
|
||||
|
||||
# Build configuration for the monotonic_align extension: core.pyx is
# cythonized and compiled with the NumPy headers on the include path.
extensions = cythonize("core.pyx")
numpy_include_dirs = [numpy.get_include()]

setup(
    name='monotonic_align',
    ext_modules=extensions,
    include_dirs=numpy_include_dirs,
)
|
||||
@@ -0,0 +1,23 @@
|
||||
Cython==0.29.21
|
||||
librosa==0.8.0
|
||||
matplotlib==3.3.1
|
||||
numpy==1.21.6
|
||||
scipy==1.5.2
|
||||
tensorboard==2.3.0
|
||||
torch==1.6.0
|
||||
torchvision==0.7.0
|
||||
unidecode==1.3.4
|
||||
pyopenjtalk==0.2.0
|
||||
jamo==0.4.1
|
||||
pypinyin==0.44.0
|
||||
jieba==0.42.1
|
||||
protobuf==3.19.0
|
||||
cn2an==0.5.17
|
||||
inflect==6.0.0
|
||||
eng_to_ipa==0.0.2
|
||||
ko_pron==1.3
|
||||
indic_transliteration==2.3.37
|
||||
num_thai==0.0.5
|
||||
opencc==1.1.1
|
||||
googletrans==4.0.0rc1
|
||||
gradio
|
||||
@@ -0,0 +1,19 @@
|
||||
Copyright (c) 2017 Keith Ito
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
@@ -0,0 +1,59 @@
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
from text import cleaners
|
||||
from text.symbols import symbols
|
||||
|
||||
|
||||
# Lookup tables between symbols and their positions in `symbols`.
# If a symbol occurs more than once, its last position wins in _symbol_to_id.
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {symbol: index for index, symbol in _id_to_symbol.items()}
|
||||
|
||||
|
||||
def text_to_sequence(text, symbols, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
      symbols: ordered collection of symbols; a symbol's ID is its position in it
      cleaner_names: names of the cleaner functions to run the text through
    Returns:
      List of integers corresponding to the symbols in the text.
      Characters of the cleaned text that are not in `symbols` are dropped.
    '''
    import logging
    logger = logging.getLogger(__name__)

    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    clean_text = _clean_text(text, cleaner_names)
    # Diagnostics go through logging (opt-in) instead of printing to stdout.
    logger.debug("%s", clean_text)
    logger.debug(" length:%d", len(clean_text))

    sequence = [symbol_to_id[symbol] for symbol in clean_text if symbol in symbol_to_id]
    logger.debug(" length:%d", len(sequence))
    return sequence
|
||||
|
||||
|
||||
def cleaned_text_to_sequence(cleaned_text):
    '''Converts already-cleaned text to a sequence of symbol IDs.
    Args:
      cleaned_text: string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text;
      symbols missing from the module-level table are skipped
    '''
    ids = []
    for symbol in cleaned_text:
        if symbol in _symbol_to_id:
            ids.append(_symbol_to_id[symbol])
    return ids
|
||||
|
||||
|
||||
def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string.

    Raises KeyError for an ID that is not in the module-level table.
    '''
    return ''.join(_id_to_symbol[symbol_id] for symbol_id in sequence)
|
||||
|
||||
|
||||
def _clean_text(text, cleaner_names):
    '''Runs `text` through each named cleaner from the `cleaners` module, in order.
    Args:
      text: string to clean
      cleaner_names: iterable of attribute names looked up on `cleaners`
    Returns:
      The text after all cleaners have been applied
    Raises:
      Exception: if a name does not resolve to a cleaner
    '''
    for name in cleaner_names:
        # Use a default so a missing name reaches the explicit error below
        # instead of escaping as a bare AttributeError from getattr.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,59 @@
|
||||
import re
|
||||
import cn2an
|
||||
import opencc
|
||||
|
||||
|
||||
converter = opencc.OpenCC('jyutjyu')
|
||||
|
||||
# (compiled pattern, IPA replacement) for each upper-case Latin letter.
_latin_to_ipa = [(re.compile(letter), ipa) for letter, ipa in (
    ('A', 'ei˥'),
    ('B', 'biː˥'),
    ('C', 'siː˥'),
    ('D', 'tiː˥'),
    ('E', 'iː˥'),
    ('F', 'e˥fuː˨˩'),
    ('G', 'tsiː˥'),
    ('H', 'ɪk̚˥tsʰyː˨˩'),
    ('I', 'ɐi˥'),
    ('J', 'tsei˥'),
    ('K', 'kʰei˥'),
    ('L', 'e˥llou˨˩'),
    ('M', 'ɛːm˥'),
    ('N', 'ɛːn˥'),
    ('O', 'ou˥'),
    ('P', 'pʰiː˥'),
    ('Q', 'kʰiːu˥'),
    ('R', 'aː˥lou˨˩'),
    ('S', 'ɛː˥siː˨˩'),
    ('T', 'tʰiː˥'),
    ('U', 'juː˥'),
    ('V', 'wiː˥'),
    ('W', 'tʊk̚˥piː˥juː˥'),
    ('X', 'ɪk̚˥siː˨˩'),
    ('Y', 'waːi˥'),
    ('Z', 'iː˨sɛːt̚˥'),
)]
|
||||
|
||||
|
||||
def number_to_cantonese(text):
    '''Replaces every run of digits (optionally with a decimal part) by the
    result of cn2an.an2cn on that run.'''
    def spell_out(match):
        return cn2an.an2cn(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', spell_out, text)
|
||||
|
||||
|
||||
def latin_to_ipa(text):
    '''Applies every (pattern, replacement) pair of _latin_to_ipa to `text`, in table order.'''
    for pattern, ipa in _latin_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
||||
|
||||
|
||||
def cantonese_to_ipa(text):
    '''Converts Cantonese text to an IPA-style transcription.

    Steps: upper-case the text and spell out numbers, run the OpenCC
    converter, transcribe remaining Latin capitals letter by letter, then
    normalize punctuation and strip trailing whitespace.

    Both ASCII and full-width punctuation are normalized. The full-width
    forms are written as \\uXXXX escapes so they survive Unicode
    normalization of this source file.
    '''
    text = number_to_cantonese(text.upper())
    text = converter.convert(text).replace('-', '').replace('$', ' ')
    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    # The question mark must sit in a character class: a bare `?` after `\s*`
    # is a lazy quantifier and the pattern would match the empty string
    # between every pair of characters.
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    text = re.sub(r'\s*$', '', text)
    return text
|
||||
@@ -0,0 +1,128 @@
|
||||
import re
|
||||
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
|
||||
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul, korean_to_lazy_ipa, korean_to_ipa
|
||||
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
|
||||
from text.sanskrit import devanagari_to_ipa
|
||||
from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
|
||||
from text.thai import num_to_thai, latin_to_thai
|
||||
# from text.shanghainese import shanghainese_to_ipa
|
||||
# from text.cantonese import cantonese_to_ipa
|
||||
# from text.ngu_dialect import ngu_dialect_to_ipa
|
||||
|
||||
|
||||
def japanese_cleaners(text):
    '''Pipeline for Japanese text: accented romaji, with a final "." appended
    when the result ends in an ASCII letter.'''
    romaji = japanese_to_romaji_with_accent(text)
    return re.sub(r'([A-Za-z])$', r'\1.', romaji)
|
||||
|
||||
|
||||
def japanese_cleaners2(text):
    '''Same as japanese_cleaners, then "ts" becomes "ʦ" and "..." becomes "…".'''
    cleaned = japanese_cleaners(text)
    cleaned = cleaned.replace('ts', 'ʦ')
    cleaned = cleaned.replace('...', '…')
    return cleaned
|
||||
|
||||
|
||||
def korean_cleaners(text):
    '''Pipeline for Korean text'''
    # Latin letters -> hangul, numbers -> hangul, then split into jamo.
    jamo_text = divide_hangul(number_to_hangul(latin_to_hangul(text)))
    # Terminate with "." when the text ends in a character from U+3131..U+3163.
    return re.sub(r'([\u3131-\u3163])$', r'\1.', jamo_text)
|
||||
|
||||
|
||||
def chinese_cleaners(text):
    '''Pipeline for Chinese text'''
    # Numbers -> Chinese, Chinese -> bopomofo, Latin letters -> bopomofo.
    bopomofo = latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    # Terminate with "。" when the text ends in a tone mark.
    return re.sub(r'([ˉˊˇˋ˙])$', r'\1。', bopomofo)
|
||||
|
||||
|
||||
def zh_ja_mixture_cleaners(text):
    '''Cleans text whose Chinese and Japanese spans are wrapped in [ZH]…[ZH]
    and [JA]…[JA] tags. Each tagged span is replaced by its transcription
    plus a space; trailing whitespace is removed and a "." is appended
    unless the text already ends in punctuation.'''
    def convert_zh(match):
        return chinese_to_romaji(match.group(1)) + ' '

    def convert_ja(match):
        romaji = japanese_to_romaji_with_accent(match.group(1))
        romaji = romaji.replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
        return romaji + ' '

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', convert_zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', convert_ja, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
||||
|
||||
|
||||
def sanskrit_cleaners(text):
    '''Normalizes "॥" to "।", expands "ॐ" to "ओम्", and appends "।" when the
    text does not already end with it.'''
    for old, new in (('॥', '।'), ('ॐ', 'ओम्')):
        text = text.replace(old, new)
    return re.sub(r'([^।])$', r'\1।', text)
|
||||
|
||||
|
||||
def cjks_cleaners(text):
    '''Cleans text with [ZH], [JA], [KO], [SA] and [EN] tagged spans using the
    "lazy IPA" converters. Each tagged span is replaced by its transcription
    plus a space; trailing whitespace is removed and a "." is appended
    unless the text already ends in punctuation.'''
    # Converters are wrapped in small functions so they are only looked up
    # when a matching tag is actually present.
    def zh(match):
        return chinese_to_lazy_ipa(match.group(1)) + ' '

    def ja(match):
        return japanese_to_ipa(match.group(1)) + ' '

    def ko(match):
        return korean_to_lazy_ipa(match.group(1)) + ' '

    def sa(match):
        return devanagari_to_ipa(match.group(1)) + ' '

    def en(match):
        return english_to_lazy_ipa(match.group(1)) + ' '

    tagged = (
        (r'\[ZH\](.*?)\[ZH\]', zh),
        (r'\[JA\](.*?)\[JA\]', ja),
        (r'\[KO\](.*?)\[KO\]', ko),
        (r'\[SA\](.*?)\[SA\]', sa),
        (r'\[EN\](.*?)\[EN\]', en),
    )
    for pattern, convert in tagged:
        text = re.sub(pattern, convert, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
||||
|
||||
|
||||
def cjke_cleaners(text):
    '''Cleans text with [ZH], [JA], [KO] and [EN] tagged spans, applying
    per-language symbol substitutions after transcription. Each tagged span
    is replaced by its transcription plus a space; trailing whitespace is
    removed and a "." is appended unless the text already ends in punctuation.'''
    def substitute(converted, pairs):
        # Apply the replacements one after another, in the given order.
        for old, new in pairs:
            converted = converted.replace(old, new)
        return converted + ' '

    def zh(match):
        return substitute(chinese_to_lazy_ipa(match.group(1)),
                          (('ʧ', 'tʃ'), ('ʦ', 'ts'), ('ɥan', 'ɥæn')))

    def ja(match):
        return substitute(japanese_to_ipa(match.group(1)),
                          (('ʧ', 'tʃ'), ('ʦ', 'ts'), ('ɥan', 'ɥæn'), ('ʥ', 'dz')))

    def ko(match):
        return korean_to_ipa(match.group(1)) + ' '

    def en(match):
        return substitute(english_to_ipa2(match.group(1)),
                          (('ɑ', 'a'), ('ɔ', 'o'), ('ɛ', 'e'), ('ɪ', 'i'), ('ʊ', 'u')))

    text = re.sub(r'\[ZH\](.*?)\[ZH\]', zh, text)
    text = re.sub(r'\[JA\](.*?)\[JA\]', ja, text)
    text = re.sub(r'\[KO\](.*?)\[KO\]', ko, text)
    text = re.sub(r'\[EN\](.*?)\[EN\]', en, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
||||
|
||||
|
||||
def cjke_cleaners2(text):
    '''Cleans text with [ZH], [JA], [KO] and [EN] tagged spans using
    chinese_to_ipa, japanese_to_ipa2, korean_to_ipa and english_to_ipa2.
    Each tagged span is replaced by its transcription plus a space; trailing
    whitespace is removed and a "." is appended unless the text already ends
    in punctuation.'''
    def zh(match):
        return chinese_to_ipa(match.group(1)) + ' '

    def ja(match):
        return japanese_to_ipa2(match.group(1)) + ' '

    def ko(match):
        return korean_to_ipa(match.group(1)) + ' '

    def en(match):
        return english_to_ipa2(match.group(1)) + ' '

    tagged = (
        (r'\[ZH\](.*?)\[ZH\]', zh),
        (r'\[JA\](.*?)\[JA\]', ja),
        (r'\[KO\](.*?)\[KO\]', ko),
        (r'\[EN\](.*?)\[EN\]', en),
    )
    for pattern, convert in tagged:
        text = re.sub(pattern, convert, text)
    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
||||
|
||||
|
||||
def thai_cleaners(text):
    '''Pipeline for Thai text: num_to_thai followed by latin_to_thai.'''
    return latin_to_thai(num_to_thai(text))
|
||||
|
||||
|
||||
# def shanghainese_cleaners(text):
|
||||
# text = shanghainese_to_ipa(text)
|
||||
# text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
||||
# return text
|
||||
|
||||
|
||||
# def chinese_dialect_cleaners(text):
|
||||
# text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
||||
# lambda x: chinese_to_ipa2(x.group(1))+' ', text)
|
||||
# text = re.sub(r'\[JA\](.*?)\[JA\]',
|
||||
# lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
|
||||
# text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
||||
# '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)
|
||||
# text = re.sub(r'\[GD\](.*?)\[GD\]',
|
||||
# lambda x: cantonese_to_ipa(x.group(1))+' ', text)
|
||||
# text = re.sub(r'\[EN\](.*?)\[EN\]',
|
||||
# lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
|
||||
# text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
||||
# 1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
|
||||
# text = re.sub(r'\s+$', '', text)
|
||||
# text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
||||
# return text
|
||||
+188
@@ -0,0 +1,188 @@
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
|
||||
'''
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
'''
|
||||
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
|
||||
|
||||
import re
|
||||
import inflect
|
||||
from unidecode import unidecode
|
||||
import eng_to_ipa as ipa
|
||||
_inflect = inflect.engine()
|
||||
# Number-like patterns used by normalize_numbers.
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')   # digits with embedded commas
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')     # digits "." digits
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')            # amount after "£"
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')        # amount after "$"
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')         # 1st, 2nd, 3rd, 4th, ...
_number_re = re.compile(r'[0-9]+')                       # any remaining digit run

# List of (regular expression, replacement) pairs for abbreviations.
# Each pattern is case-insensitive and requires the trailing period.
_abbreviations = [(re.compile('\\b%s\\.' % abbr, re.IGNORECASE), expansion) for abbr, expansion in (
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
)]


# List of (ipa, lazy ipa) pairs:
_lazy_ipa = [(re.compile(src), dst) for src, dst in (
    ('r', 'ɹ'),
    ('æ', 'e'),
    ('ɑ', 'a'),
    ('ɔ', 'o'),
    ('ð', 'z'),
    ('θ', 's'),
    ('ɛ', 'e'),
    ('ɪ', 'i'),
    ('ʊ', 'u'),
    ('ʒ', 'ʥ'),
    ('ʤ', 'ʥ'),
    ('ˈ', '↓'),
)]

# List of (ipa, lazy ipa2) pairs:
_lazy_ipa2 = [(re.compile(src), dst) for src, dst in (
    ('r', 'ɹ'),
    ('ð', 'z'),
    ('θ', 's'),
    ('ʒ', 'ʑ'),
    ('ʤ', 'dʑ'),
    ('ˈ', '↓'),
)]

# List of (ipa, ipa2) pairs
_ipa_to_ipa2 = [(re.compile(src), dst) for src, dst in (
    ('r', 'ɹ'),
    ('ʤ', 'dʒ'),
    ('ʧ', 'tʃ'),
)]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
    '''Applies every (pattern, replacement) pair of _abbreviations to `text`, in table order.'''
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
|
||||
|
||||
|
||||
def collapse_whitespace(text):
    '''Replaces every run of whitespace with a single space.'''
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
|
||||
|
||||
|
||||
def _remove_commas(m):
    '''Returns group 1 of the match with all commas removed.'''
    return ''.join(m.group(1).split(','))
|
||||
|
||||
|
||||
def _expand_decimal_point(m):
    '''Returns group 1 of the match with each "." spelled as " point ".'''
    return ' point '.join(m.group(1).split('.'))
|
||||
|
||||
|
||||
def _expand_dollars(m):
    '''Spells out a dollar amount captured in group 1 of the match.

    "1.50" -> "1 dollar, 50 cents"; an amount with more than one "." is
    returned unchanged with " dollars" appended; a zero amount yields
    "zero dollars".
    '''
    amount = m.group(1)
    pieces = amount.split('.')
    if len(pieces) > 2:
        return amount + ' dollars'  # Unexpected format

    dollars = int(pieces[0]) if pieces[0] else 0
    cents = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0

    spoken = []
    if dollars:
        spoken.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        spoken.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
    if not spoken:
        return 'zero dollars'
    return ', '.join(spoken)
|
||||
|
||||
|
||||
def _expand_ordinal(m):
    '''Passes the whole matched ordinal (e.g. "2nd") to inflect's number_to_words.'''
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
|
||||
|
||||
|
||||
def _expand_number(m):
    '''Spells out the matched integer.

    Values strictly between 1000 and 3000 are special-cased (2000, 2001-2009,
    exact hundreds, and otherwise two-digit grouping); everything else goes
    straight to inflect's number_to_words without "and".
    '''
    num = int(m.group(0))
    if not (1000 < num < 3000):
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
|
||||
|
||||
|
||||
def normalize_numbers(text):
    '''Rewrites numeric expressions in `text` as words.

    The passes run in a fixed order: strip thousands commas, pounds,
    dollars, decimals, ordinals, and finally plain integers.
    '''
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
|
||||
|
||||
|
||||
def mark_dark_l(text):
    '''Replaces "l" with "ɫ" when only non-vowel, non-space characters
    separate it from the next space or the end of the text.'''
    dark_l = re.compile(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))')
    return dark_l.sub(r'ɫ\1', text)
|
||||
|
||||
|
||||
def english_to_ipa(text):
    '''Converts English text to IPA via eng_to_ipa.

    The text is transliterated with unidecode and lower-cased, abbreviations
    and numbers are expanded, and whitespace in the result is collapsed.
    '''
    normalized = normalize_numbers(expand_abbreviations(unidecode(text).lower()))
    return collapse_whitespace(ipa.convert(normalized))
|
||||
|
||||
|
||||
def english_to_lazy_ipa(text):
    '''english_to_ipa followed by the _lazy_ipa substitutions, in table order.'''
    phonemes = english_to_ipa(text)
    for pattern, lazy in _lazy_ipa:
        phonemes = pattern.sub(lazy, phonemes)
    return phonemes
|
||||
|
||||
|
||||
def english_to_ipa2(text):
    '''english_to_ipa, then dark-l marking, the _ipa_to_ipa2 substitutions,
    and finally "..." collapsed to "…".'''
    phonemes = mark_dark_l(english_to_ipa(text))
    for pattern, ipa2 in _ipa_to_ipa2:
        phonemes = pattern.sub(ipa2, phonemes)
    return phonemes.replace('...', '…')
|
||||
|
||||
|
||||
def english_to_lazy_ipa2(text):
    '''english_to_ipa followed by the _lazy_ipa2 substitutions, in table order.'''
    phonemes = english_to_ipa(text)
    for pattern, lazy in _lazy_ipa2:
        phonemes = pattern.sub(lazy, phonemes)
    return phonemes
|
||||
@@ -0,0 +1,153 @@
|
||||
import re
|
||||
from unidecode import unidecode
|
||||
import pyopenjtalk
|
||||
|
||||
|
||||
# Character-class body shared by the two regexes below.
_japanese_charset = r'A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d'

# Regular expression matching Japanese without punctuation marks:
_japanese_characters = re.compile('[' + _japanese_charset + ']')

# Regular expression matching non-Japanese characters or punctuation marks:
_japanese_marks = re.compile('[^' + _japanese_charset + ']')

# List of (symbol, Japanese) pairs for marks:
_symbols_to_japanese = [(re.compile(src), dst) for src, dst in (
    ('%', 'パーセント'),
)]

# List of (romaji, ipa) pairs for marks:
_romaji_to_ipa = [(re.compile(src), dst) for src, dst in (
    ('ts', 'ʦ'),
    ('u', 'ɯ'),
    ('j', 'ʥ'),
    ('y', 'j'),
    ('ni', 'n^i'),
    ('nj', 'n^'),
    ('hi', 'çi'),
    ('hj', 'ç'),
    ('f', 'ɸ'),
    ('I', 'i*'),
    ('U', 'ɯ*'),
    ('r', 'ɾ'),
)]

# List of (romaji, ipa2) pairs for marks:
_romaji_to_ipa2 = [(re.compile(src), dst) for src, dst in (
    ('u', 'ɯ'),
    ('ʧ', 'tʃ'),
    ('j', 'dʑ'),
    ('y', 'j'),
    ('ni', 'n^i'),
    ('nj', 'n^'),
    ('hi', 'çi'),
    ('hj', 'ç'),
    ('f', 'ɸ'),
    ('I', 'i*'),
    ('U', 'ɯ*'),
    ('r', 'ɾ'),
)]

# List of (consonant, sokuon) pairs:
_real_sokuon = [(re.compile(src), dst) for src, dst in (
    (r'Q([↑↓]*[kg])', r'k#\1'),
    (r'Q([↑↓]*[tdjʧ])', r't#\1'),
    (r'Q([↑↓]*[sʃ])', r's\1'),
    (r'Q([↑↓]*[pb])', r'p#\1'),
)]

# List of (consonant, hatsuon) pairs:
_real_hatsuon = [(re.compile(src), dst) for src, dst in (
    (r'N([↑↓]*[pbm])', r'm\1'),
    (r'N([↑↓]*[ʧʥj])', r'n^\1'),
    (r'N([↑↓]*[tdn])', r'n\1'),
    (r'N([↑↓]*[kg])', r'ŋ\1'),
)]
|
||||
|
||||
|
||||
def symbols_to_japanese(text):
    '''Applies every (pattern, replacement) pair of _symbols_to_japanese to `text`.'''
    for pattern, japanese in _symbols_to_japanese:
        text = pattern.sub(japanese, text)
    return text
|
||||
|
||||
|
||||
def japanese_to_romaji_with_accent(text):
    '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html

    Splits the text at non-Japanese characters, converts each Japanese chunk
    to phonemes with pyopenjtalk full-context labels, and inserts " ", "↓"
    or "↑" between phonemes depending on the A1/A2/A3 label fields. The
    separating marks are re-inserted (transliterated by unidecode, spaces
    removed) after each chunk.
    '''
    silence = ('sil', 'pau')

    def phoneme_of(label):
        # Current phoneme: the text between "-" and "+" in the label.
        return re.search(r'\-([^\+]*)\+', label).group(1)

    def int_field(pattern, label):
        return int(re.search(pattern, label).group(1))

    text = symbols_to_japanese(text)
    chunks = re.split(_japanese_marks, text)
    marks = re.findall(_japanese_marks, text)

    romaji = ''
    for pos, chunk in enumerate(chunks):
        if re.match(_japanese_characters, chunk):
            if romaji != '':
                romaji += ' '
            labels = pyopenjtalk.extract_fullcontext(chunk)
            for n, label in enumerate(labels):
                phoneme = phoneme_of(label)
                if phoneme in silence:
                    continue
                romaji += phoneme.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')

                a1 = int_field(r"/A:(\-?[0-9]+)\+", label)
                a2 = int_field(r"\+(\d+)\+", label)
                a3 = int_field(r"\+(\d+)/", label)

                following = labels[n + 1]
                if phoneme_of(following) in silence:
                    a2_next = -1
                else:
                    a2_next = int_field(r"\+(\d+)\+", following)

                if a3 == 1 and a2_next == 1:
                    # Accent phrase boundary
                    romaji += ' '
                elif a1 == 0 and a2_next == a2 + 1:
                    # Falling
                    romaji += '↓'
                elif a2 == 1 and a2_next == 2:
                    # Rising
                    romaji += '↑'
        if pos < len(marks):
            romaji += unidecode(marks[pos]).replace(' ', '')
    return romaji
|
||||
|
||||
|
||||
def get_real_sokuon(text):
    '''Applies the _real_sokuon substitutions (rewrites of "Q" before a consonant), in table order.'''
    for pattern, sokuon in _real_sokuon:
        text = pattern.sub(sokuon, text)
    return text
|
||||
|
||||
|
||||
def get_real_hatsuon(text):
    '''Applies the _real_hatsuon substitutions (rewrites of "N" before a consonant), in table order.'''
    for pattern, hatsuon in _real_hatsuon:
        text = pattern.sub(hatsuon, text)
    return text
|
||||
|
||||
|
||||
def japanese_to_ipa(text):
    '''Converts Japanese text to IPA: accented romaji, long-vowel marking,
    sokuon/hatsuon resolution, then the _romaji_to_ipa substitutions.'''
    def lengthen(match):
        # "aaa" -> "aːː": keep the first vowel, one length mark per repeat.
        run = match.group(0)
        return run[0] + 'ː' * (len(run) - 1)

    text = japanese_to_romaji_with_accent(text).replace('...', '…')
    text = re.sub(r'([aiueo])\1+', lengthen, text)
    text = get_real_hatsuon(get_real_sokuon(text))
    for pattern, ipa in _romaji_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
||||
|
||||
|
||||
def japanese_to_ipa2(text):
    '''Converts Japanese text to IPA using the _romaji_to_ipa2 table
    (no long-vowel marking at this stage).'''
    romaji = japanese_to_romaji_with_accent(text).replace('...', '…')
    romaji = get_real_hatsuon(get_real_sokuon(romaji))
    for pattern, ipa in _romaji_to_ipa2:
        romaji = pattern.sub(ipa, romaji)
    return romaji
|
||||
|
||||
|
||||
def japanese_to_ipa3(text):
    '''japanese_to_ipa2 with further symbol substitutions, long-vowel marking,
    and "ʰ" added after an initial ts / tɕ / k / p / t at the start of the
    text or after whitespace.'''
    def lengthen(match):
        run = match.group(0)
        return run[0] + 'ː' * (len(run) - 1)

    text = japanese_to_ipa2(text)
    for old, new in (('n^', 'ȵ'), ('ʃ', 'ɕ'), ('*', '\u0325'), ('#', '\u031a')):
        text = text.replace(old, new)
    text = re.sub(r'([aiɯeo])\1+', lengthen, text)
    return re.sub(r'((?:^|\s)(?:ts|tɕ|[kpt]))', r'\1ʰ', text)
|
||||
+210
@@ -0,0 +1,210 @@
|
||||
import re
|
||||
from jamo import h2j, j2hcj
|
||||
import ko_pron
|
||||
|
||||
|
||||
# This is a list of Korean classifiers preceded by pure Korean numerals.
_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'

# List of (hangul, hangul divided) pairs:
_hangul_divided = [(re.compile(src), dst) for src, dst in (
    ('ㄳ', 'ㄱㅅ'),
    ('ㄵ', 'ㄴㅈ'),
    ('ㄶ', 'ㄴㅎ'),
    ('ㄺ', 'ㄹㄱ'),
    ('ㄻ', 'ㄹㅁ'),
    ('ㄼ', 'ㄹㅂ'),
    ('ㄽ', 'ㄹㅅ'),
    ('ㄾ', 'ㄹㅌ'),
    ('ㄿ', 'ㄹㅍ'),
    ('ㅀ', 'ㄹㅎ'),
    ('ㅄ', 'ㅂㅅ'),
    ('ㅘ', 'ㅗㅏ'),
    ('ㅙ', 'ㅗㅐ'),
    ('ㅚ', 'ㅗㅣ'),
    ('ㅝ', 'ㅜㅓ'),
    ('ㅞ', 'ㅜㅔ'),
    ('ㅟ', 'ㅜㅣ'),
    ('ㅢ', 'ㅡㅣ'),
    ('ㅑ', 'ㅣㅏ'),
    ('ㅒ', 'ㅣㅐ'),
    ('ㅕ', 'ㅣㅓ'),
    ('ㅖ', 'ㅣㅔ'),
    ('ㅛ', 'ㅣㅗ'),
    ('ㅠ', 'ㅣㅜ'),
)]

# List of (Latin alphabet, hangul) pairs (case-insensitive):
_latin_to_hangul = [(re.compile(src, re.IGNORECASE), dst) for src, dst in (
    ('a', '에이'),
    ('b', '비'),
    ('c', '시'),
    ('d', '디'),
    ('e', '이'),
    ('f', '에프'),
    ('g', '지'),
    ('h', '에이치'),
    ('i', '아이'),
    ('j', '제이'),
    ('k', '케이'),
    ('l', '엘'),
    ('m', '엠'),
    ('n', '엔'),
    ('o', '오'),
    ('p', '피'),
    ('q', '큐'),
    ('r', '아르'),
    ('s', '에스'),
    ('t', '티'),
    ('u', '유'),
    ('v', '브이'),
    ('w', '더블유'),
    ('x', '엑스'),
    ('y', '와이'),
    ('z', '제트'),
)]

# List of (ipa, lazy ipa) pairs (case-insensitive):
_ipa_to_lazy_ipa = [(re.compile(src, re.IGNORECASE), dst) for src, dst in (
    ('t͡ɕ', 'ʧ'),
    ('d͡ʑ', 'ʥ'),
    ('ɲ', 'n^'),
    ('ɕ', 'ʃ'),
    ('ʷ', 'w'),
    ('ɭ', 'l`'),
    ('ʎ', 'ɾ'),
    ('ɣ', 'ŋ'),
    ('ɰ', 'ɯ'),
    ('ʝ', 'j'),
    ('ʌ', 'ə'),
    ('ɡ', 'g'),
    ('\u031a', '#'),
    ('\u0348', '='),
    ('\u031e', ''),
    ('\u0320', ''),
    ('\u0339', ''),
)]
|
||||
|
||||
|
||||
def latin_to_hangul(text):
    '''Applies every (pattern, replacement) pair of _latin_to_hangul to `text`, in table order.'''
    for pattern, hangul in _latin_to_hangul:
        text = pattern.sub(hangul, text)
    return text
|
||||
|
||||
|
||||
def divide_hangul(text):
    '''Runs the text through jamo's h2j and j2hcj, then applies the
    _hangul_divided substitutions in table order.'''
    jamo_text = j2hcj(h2j(text))
    for pattern, divided in _hangul_divided:
        jamo_text = pattern.sub(divided, jamo_text)
    return jamo_text
|
||||
|
||||
|
||||
def hangul_number(num, sino=True):
    '''Reference https://github.com/Kyubyong/g2pK

    Spells out a string of decimal digits (commas allowed) in hangul.
    With sino=True the ones and tens use Sino-Korean numerals; with
    sino=False they use native Korean forms. Higher places always use
    Sino-Korean numerals with their unit (백, 천, 만, ...).
    '''
    num = re.sub(',', '', num)

    if num == '0':
        return '영'
    if not sino and num == '20':
        return '스무'

    digits = '123456789'
    digit2name = dict(zip(digits, '일이삼사오육칠팔구'))
    digit2mod = dict(zip(digits, '한 두 세 네 다섯 여섯 일곱 여덟 아홉'.split()))
    digit2dec = dict(zip(digits, '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'.split()))

    # Unit appended at each decimal place (place 0 = ones).
    unit_by_place = {
        2: '백', 3: '천', 4: '만', 5: '십', 6: '백', 7: '천', 8: '억',
        9: '십', 10: '백', 11: '천', 12: '조', 13: '십', 14: '백', 15: '천',
    }
    # Places where a leading '일' is dropped (e.g. '일백' -> '백').
    drops_leading_one = range(2, 8)

    spelledout = []
    n_digits = len(num)
    for offset, digit in enumerate(num):
        place = n_digits - offset - 1

        if place == 0:
            name = (digit2name if sino else digit2mod).get(digit, '')
        elif place == 1:
            if sino:
                name = (digit2name.get(digit, '') + '십').replace('일십', '십')
            else:
                name = digit2dec.get(digit, '')

        if digit == '0':
            # A zero is silent, except at a four-digit group boundary that
            # still needs its unit because the group had non-zero digits.
            if place % 4 != 0 or ''.join(spelledout[-3:]) == '':
                spelledout.append('')
                continue

        if place in unit_by_place:
            unit = unit_by_place[place]
            name = digit2name.get(digit, '') + unit
            if place in drops_leading_one:
                name = name.replace('일' + unit, unit)
        spelledout.append(name)
    return ''.join(spelledout)
|
||||
|
||||
|
||||
def number_to_hangul(text):
    '''Spell out the digits in `text` in Hangul.

    Reference https://github.com/Kyubyong/g2pK

    Numbers directly followed by Hangul are spelled as a whole via
    `hangul_number`; the `sino` flag is False when the following text starts
    with an entry of `_korean_classifiers`, True otherwise. Any digit still
    left afterwards is read out one by one.
    '''
    for num, classifier in set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text)):
        is_native = (classifier[:2] in _korean_classifiers
                     or classifier[0] in _korean_classifiers)
        spelledout = hangul_number(num, sino=not is_native)
        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')

    # digit by digit for remaining digits
    return text.translate(str.maketrans('0123456789', '영일이삼사오육칠팔구'))
|
||||
|
||||
|
||||
def korean_to_lazy_ipa(text):
    """Convert Korean text to the simplified ("lazy") IPA notation.

    Latin letters and digits are first rewritten in Hangul, each Hangul run is
    romanised with `ko_pron` (keeping only the first variant when several are
    returned), and the `_ipa_to_lazy_ipa` substitutions are applied last.
    """
    def _first_ipa_variant(match):
        return ko_pron.romanise(match.group(0), 'ipa').split('] ~ [')[0]

    text = number_to_hangul(latin_to_hangul(text))
    text = re.sub('[\uac00-\ud7af]+', _first_ipa_variant, text)
    for pattern, lazy in _ipa_to_lazy_ipa:
        text = re.sub(pattern, lazy, text)
    return text
|
||||
|
||||
|
||||
def korean_to_ipa(text):
    """Convert Korean text to IPA, expanding the ligatures used by lazy IPA."""
    lazy = korean_to_lazy_ipa(text)
    # Each ligature expands to a two-character affricate.
    return lazy.translate(str.maketrans({'ʧ': 'tʃ', 'ʥ': 'dʑ'}))
|
||||
@@ -0,0 +1,326 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
from pypinyin import lazy_pinyin, BOPOMOFO
|
||||
import jieba
|
||||
import cn2an
|
||||
import logging
|
||||
|
||||
|
||||
# (compiled case-insensitive Latin letter, bopomofo spelling of its name) pairs.
_latin_to_bopomofo = [
    (re.compile(letter, re.IGNORECASE), bopomofo)
    for letter, bopomofo in (
        ('a', 'ㄟˉ'),
        ('b', 'ㄅㄧˋ'),
        ('c', 'ㄙㄧˉ'),
        ('d', 'ㄉㄧˋ'),
        ('e', 'ㄧˋ'),
        ('f', 'ㄝˊㄈㄨˋ'),
        ('g', 'ㄐㄧˋ'),
        ('h', 'ㄝˇㄑㄩˋ'),
        ('i', 'ㄞˋ'),
        ('j', 'ㄐㄟˋ'),
        ('k', 'ㄎㄟˋ'),
        ('l', 'ㄝˊㄛˋ'),
        ('m', 'ㄝˊㄇㄨˋ'),
        ('n', 'ㄣˉ'),
        ('o', 'ㄡˉ'),
        ('p', 'ㄆㄧˉ'),
        ('q', 'ㄎㄧㄡˉ'),
        ('r', 'ㄚˋ'),
        ('s', 'ㄝˊㄙˋ'),
        ('t', 'ㄊㄧˋ'),
        ('u', 'ㄧㄡˉ'),
        ('v', 'ㄨㄧˉ'),
        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
        ('y', 'ㄨㄞˋ'),
        ('z', 'ㄗㄟˋ'),
    )
]
|
||||
|
||||
# List of (bopomofo, romaji) pairs, applied in order (multi-symbol finals
# come before their single-symbol parts).
# The punctuation patterns are the fullwidth CJK forms, written as escapes so
# they cannot be folded to ASCII: a bare '?' is not a valid regex (it raises
# "nothing to repeat") and ASCII ','/'!' patterns would be no-ops.
_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄅㄛ', 'p⁼wo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p⁼'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't⁼'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k⁼'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'ʧ⁼'),
    ('ㄑ', 'ʧʰ'),
    ('ㄒ', 'ʃ'),
    ('ㄓ', 'ʦ`⁼'),
    ('ㄔ', 'ʦ`ʰ'),
    ('ㄕ', 's`'),
    ('ㄖ', 'ɹ`'),
    ('ㄗ', 'ʦ⁼'),
    ('ㄘ', 'ʦʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ə'),
    ('ㄝ', 'e'),
    ('ㄞ', 'ai'),
    ('ㄟ', 'ei'),
    ('ㄠ', 'au'),
    ('ㄡ', 'ou'),
    ('ㄧㄢ', 'yeNN'),
    ('ㄢ', 'aNN'),
    ('ㄧㄣ', 'iNN'),
    ('ㄣ', 'əNN'),
    ('ㄤ', 'aNg'),
    ('ㄧㄥ', 'iNg'),
    ('ㄨㄥ', 'uNg'),
    ('ㄩㄥ', 'yuNg'),
    ('ㄥ', 'əNg'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'ɥ'),
    ('ˉ', '→'),
    ('ˊ', '↑'),
    ('ˇ', '↓↑'),
    ('ˋ', '↓'),
    ('˙', ''),
    ('\uff0c', ','),  # fullwidth comma
    ('。', '.'),
    ('\uff01', '!'),  # fullwidth exclamation mark
    ('\uff1f', '?'),  # fullwidth question mark
    ('—', '-')
]]
|
||||
|
||||
# (compiled case-insensitive romaji pattern, IPA replacement) pairs, applied in order.
_romaji_to_ipa = [
    (re.compile(romaji, re.IGNORECASE), ipa)
    for romaji, ipa in (
        ('ʃy', 'ʃ'),
        ('ʧʰy', 'ʧʰ'),
        ('ʧ⁼y', 'ʧ⁼'),
        ('NN', 'n'),
        ('Ng', 'ŋ'),
        ('y', 'j'),
        ('h', 'x'),
    )
]
|
||||
|
||||
# List of (bopomofo, ipa) pairs, applied in order (multi-symbol finals come
# before their single-symbol parts).
# The punctuation patterns are the fullwidth CJK forms, written as escapes so
# they cannot be folded to ASCII: a bare '?' is not a valid regex (it raises
# "nothing to repeat") and ASCII ','/'!' patterns would be no-ops.
_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄅㄛ', 'p⁼wo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p⁼'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't⁼'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k⁼'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'x'),
    ('ㄐ', 'tʃ⁼'),
    ('ㄑ', 'tʃʰ'),
    ('ㄒ', 'ʃ'),
    ('ㄓ', 'ts`⁼'),
    ('ㄔ', 'ts`ʰ'),
    ('ㄕ', 's`'),
    ('ㄖ', 'ɹ`'),
    ('ㄗ', 'ts⁼'),
    ('ㄘ', 'tsʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ə'),
    ('ㄝ', 'ɛ'),
    ('ㄞ', 'aɪ'),
    ('ㄟ', 'eɪ'),
    ('ㄠ', 'ɑʊ'),
    ('ㄡ', 'oʊ'),
    ('ㄧㄢ', 'jɛn'),
    ('ㄩㄢ', 'ɥæn'),
    ('ㄢ', 'an'),
    ('ㄧㄣ', 'in'),
    ('ㄩㄣ', 'ɥn'),
    ('ㄣ', 'ən'),
    ('ㄤ', 'ɑŋ'),
    ('ㄧㄥ', 'iŋ'),
    ('ㄨㄥ', 'ʊŋ'),
    ('ㄩㄥ', 'jʊŋ'),
    ('ㄥ', 'əŋ'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'ɥ'),
    ('ˉ', '→'),
    ('ˊ', '↑'),
    ('ˇ', '↓↑'),
    ('ˋ', '↓'),
    ('˙', ''),
    ('\uff0c', ','),  # fullwidth comma
    ('。', '.'),
    ('\uff01', '!'),  # fullwidth exclamation mark
    ('\uff1f', '?'),  # fullwidth question mark
    ('—', '-')
]]
|
||||
|
||||
# List of (bopomofo, ipa2) pairs, applied in order (multi-symbol finals come
# before their single-symbol parts); tones become Chao tone letters.
# The punctuation patterns are the fullwidth CJK forms, written as escapes so
# they cannot be folded to ASCII: a bare '?' is not a valid regex (it raises
# "nothing to repeat") and ASCII ','/'!' patterns would be no-ops.
_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄅㄛ', 'pwo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'tɕ'),
    ('ㄑ', 'tɕʰ'),
    ('ㄒ', 'ɕ'),
    ('ㄓ', 'tʂ'),
    ('ㄔ', 'tʂʰ'),
    ('ㄕ', 'ʂ'),
    ('ㄖ', 'ɻ'),
    ('ㄗ', 'ts'),
    ('ㄘ', 'tsʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ɤ'),
    ('ㄝ', 'ɛ'),
    ('ㄞ', 'aɪ'),
    ('ㄟ', 'eɪ'),
    ('ㄠ', 'ɑʊ'),
    ('ㄡ', 'oʊ'),
    ('ㄧㄢ', 'jɛn'),
    ('ㄩㄢ', 'yæn'),
    ('ㄢ', 'an'),
    ('ㄧㄣ', 'in'),
    ('ㄩㄣ', 'yn'),
    ('ㄣ', 'ən'),
    ('ㄤ', 'ɑŋ'),
    ('ㄧㄥ', 'iŋ'),
    ('ㄨㄥ', 'ʊŋ'),
    ('ㄩㄥ', 'jʊŋ'),
    ('ㄥ', 'ɤŋ'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'y'),
    ('ˉ', '˥'),
    ('ˊ', '˧˥'),
    ('ˇ', '˨˩˦'),
    ('ˋ', '˥˩'),
    ('˙', ''),
    ('\uff0c', ','),  # fullwidth comma
    ('。', '.'),
    ('\uff01', '!'),  # fullwidth exclamation mark
    ('\uff1f', '?'),  # fullwidth question mark
    ('—', '-')
]]
|
||||
|
||||
|
||||
def number_to_chinese(text):
    """Replace each Arabic number in `text` with its `cn2an.an2cn` reading.

    Numbers are handled left to right, each substitution touching only the
    first remaining occurrence of that number.
    """
    for arabic in re.findall(r'\d+(?:\.?\d+)?', text):
        spoken = cn2an.an2cn(arabic)
        text = text.replace(arabic, spoken, 1)
    return text
|
||||
|
||||
|
||||
def chinese_to_bopomofo(text):
    """Convert the Chinese words of `text` to bopomofo with explicit tone marks.

    Enumeration commas, semicolons and colons (ASCII or fullwidth) are turned
    into ','. The text is segmented with jieba; words without any CJK
    ideograph are copied through unchanged, while Chinese words are converted
    syllable by syllable and separated from preceding output by a space.
    A syllable that ends in a bare bopomofo letter (no tone mark) receives the
    first-tone mark 'ˉ'.
    """
    text = text.translate(str.maketrans({
        '、': ',',
        ';': ',', '\uff1b': ',',  # ASCII and fullwidth semicolon
        ':': ',', '\uff1a': ',',  # ASCII and fullwidth colon
    }))
    result = ''
    for word in jieba.lcut(text, cut_all=False):
        if not re.search('[\u4e00-\u9fff]', word):
            result += word
            continue
        # Only Chinese words reach the (comparatively costly) pinyin lookup.
        syllables = [re.sub(r'([\u3105-\u3129])$', r'\1ˉ', syllable)
                     for syllable in lazy_pinyin(word, BOPOMOFO)]
        if result != '':
            result += ' '
        result += ''.join(syllables)
    return result
|
||||
|
||||
|
||||
def latin_to_bopomofo(text):
    """Spell every Latin letter in `text` by its bopomofo letter name."""
    for pattern, bopomofo in _latin_to_bopomofo:
        text = pattern.sub(bopomofo, text)
    return text
|
||||
|
||||
|
||||
def bopomofo_to_romaji(text):
    """Apply the `_bopomofo_to_romaji` substitutions to `text`, in table order."""
    for pattern, romaji in _bopomofo_to_romaji:
        text = pattern.sub(romaji, text)
    return text
|
||||
|
||||
|
||||
def bopomofo_to_ipa(text):
    """Apply the `_bopomofo_to_ipa` substitutions to `text`, in table order."""
    for pattern, ipa in _bopomofo_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
||||
|
||||
|
||||
def bopomofo_to_ipa2(text):
    """Apply the `_bopomofo_to_ipa2` substitutions to `text`, in table order."""
    for pattern, ipa in _bopomofo_to_ipa2:
        text = pattern.sub(ipa, text)
    return text
|
||||
|
||||
|
||||
def chinese_to_romaji(text):
    """Convert Chinese text (with digits and Latin letters) to tone-marked romaji."""
    bopomofo = latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    romaji = bopomofo_to_romaji(bopomofo)
    # Medial i/u before a vowel become glides.
    romaji = re.sub('i([aoe])', r'y\1', romaji)
    romaji = re.sub('u([aoəe])', r'w\1', romaji)
    # A retroflex initial directly followed by a tone/space (or end) gets 'ɹ`'.
    romaji = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', romaji)
    romaji = romaji.replace('ɻ', 'ɹ`')
    # Same for a plain sibilant initial, which gets 'ɹ'.
    return re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', romaji)
|
||||
|
||||
|
||||
def chinese_to_lazy_ipa(text):
    """Convert Chinese text to romaji, then apply the `_romaji_to_ipa` table."""
    lazy = chinese_to_romaji(text)
    for pattern, ipa in _romaji_to_ipa:
        lazy = pattern.sub(ipa, lazy)
    return lazy
|
||||
|
||||
|
||||
def chinese_to_ipa(text):
    """Convert Chinese text (with digits and Latin letters) to arrow-toned IPA."""
    bopomofo = latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    ipa = bopomofo_to_ipa(bopomofo)
    # Medial i/u before a vowel become glides.
    ipa = re.sub('i([aoe])', r'j\1', ipa)
    ipa = re.sub('u([aoəe])', r'w\1', ipa)
    # A retroflex initial directly followed by a tone/space (or end) gets 'ɹ`'.
    ipa = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ`\2', ipa)
    ipa = ipa.replace('ɻ', 'ɹ`')
    # Same for a plain sibilant initial, which gets 'ɹ'.
    return re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', ipa)
|
||||
|
||||
|
||||
def chinese_to_ipa2(text):
    """Convert Chinese text (with digits and Latin letters) to IPA with tone letters."""
    bopomofo = latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text)))
    ipa = bopomofo_to_ipa2(bopomofo)
    # Medial i/u before a vowel become glides.
    ipa = re.sub(r'i([aoe])', r'j\1', ipa)
    ipa = re.sub(r'u([aoəe])', r'w\1', ipa)
    # An initial directly followed by a tone/space (or end) gets 'ʅ' or 'ɿ'.
    ipa = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', ipa)
    return re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', ipa)
|
||||
@@ -0,0 +1,30 @@
|
||||
import re
|
||||
import opencc
|
||||
|
||||
|
||||
# Two-letter dialect code -> OpenCC configuration name.
dialects = {'SZ': 'suzhou', 'WX': 'wuxi', 'CZ': 'changzhou', 'HZ': 'hangzhou',
            'SX': 'shaoxing', 'NB': 'ningbo', 'JJ': 'jingjiang', 'YX': 'yixing',
            'JD': 'jiading', 'ZR': 'zhenru', 'PH': 'pinghu', 'TX': 'tongxiang',
            'JS': 'jiashan', 'HN': 'xiashi', 'LP': 'linping', 'XS': 'xiaoshan',
            'FY': 'fuyang', 'RA': 'ruao', 'CX': 'cixi', 'SM': 'sanmen',
            'TT': 'tiantai', 'WZ': 'wenzhou', 'SC': 'suichang', 'YB': 'youbu'}

# OpenCC configuration name -> converter, for every configuration that loads.
converters = {}

for dialect in dialects.values():
    try:
        converters[dialect] = opencc.OpenCC(dialect)
    except Exception:
        # Best effort: a configuration that cannot be loaded is simply left
        # out. `Exception` (rather than a bare `except`) keeps
        # KeyboardInterrupt and SystemExit propagating.
        pass
|
||||
|
||||
|
||||
def ngu_dialect_to_ipa(text, dialect):
    """Convert `text` to IPA for a Wu dialect and normalise its punctuation.

    Args:
        text: Input text.
        dialect: Two-letter key of `dialects` (for example 'SZ').

    Returns:
        The converted text with '-' removed, '$' turned into a space,
        punctuation rewritten as ASCII followed by one space, and trailing
        whitespace stripped.

    Raises:
        KeyError: If `dialect` is unknown or its converter was not loaded.
    """
    dialect = dialects[dialect]
    text = converters[dialect].convert(text).replace('-','').replace('$',' ')
    # Each class accepts the ASCII and the fullwidth form. The question mark
    # must sit inside a class: a bare `\s*?\s*` is a lazy quantifier that
    # matches the empty string everywhere.
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    text = re.sub(r'\s*$', '', text)
    return text
|
||||
@@ -0,0 +1,62 @@
|
||||
import re
|
||||
from indic_transliteration import sanscript
|
||||
|
||||
|
||||
# (compiled IAST pattern, IPA replacement) pairs, applied in order; the
# aspirated "...h" entries match the output of the entry just before them.
_iast_to_ipa = [
    (re.compile(iast), ipa)
    for iast, ipa in (
        ('a', 'ə'),
        ('ā', 'aː'),
        ('ī', 'iː'),
        ('ū', 'uː'),
        ('ṛ', 'ɹ`'),
        ('ṝ', 'ɹ`ː'),
        ('ḷ', 'l`'),
        ('ḹ', 'l`ː'),
        ('e', 'eː'),
        ('o', 'oː'),
        ('k', 'k⁼'),
        ('k⁼h', 'kʰ'),
        ('g', 'g⁼'),
        ('g⁼h', 'gʰ'),
        ('ṅ', 'ŋ'),
        ('c', 'ʧ⁼'),
        ('ʧ⁼h', 'ʧʰ'),
        ('j', 'ʥ⁼'),
        ('ʥ⁼h', 'ʥʰ'),
        ('ñ', 'n^'),
        ('ṭ', 't`⁼'),
        ('t`⁼h', 't`ʰ'),
        ('ḍ', 'd`⁼'),
        ('d`⁼h', 'd`ʰ'),
        ('ṇ', 'n`'),
        ('t', 't⁼'),
        ('t⁼h', 'tʰ'),
        ('d', 'd⁼'),
        ('d⁼h', 'dʰ'),
        ('p', 'p⁼'),
        ('p⁼h', 'pʰ'),
        ('b', 'b⁼'),
        ('b⁼h', 'bʰ'),
        ('y', 'j'),
        ('ś', 'ʃ'),
        ('ṣ', 's`'),
        ('r', 'ɾ'),
        ('l̤', 'l`'),
        ('h', 'ɦ'),
        ("'", ''),
        ('~', '^'),
        ('ṃ', '^'),
    )
]
|
||||
|
||||
|
||||
def devanagari_to_ipa(text):
    """Convert Devanagari text to IPA via IAST transliteration.

    The Om sign is spelled out and dandas are turned into ASCII punctuation
    before transliteration; the `_iast_to_ipa` table is then applied and each
    visarga 'ḥ' is rewritten as 'h' + the captured preceding character + '*'.
    """
    def _expand_visarga(match):
        # Drop the trailing 'ḥ', then append 'h', the captured character and '*'.
        return match.group(0)[:-1] + 'h' + match.group(1) + '*'

    text = text.replace('ॐ', 'ओम्')
    text = re.sub(r'\s*।\s*$', '.', text)
    text = re.sub(r'\s*।\s*', ', ', text)
    text = re.sub(r'\s*॥', '.', text)
    text = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST)
    for pattern, ipa in _iast_to_ipa:
        text = pattern.sub(ipa, text)
    return re.sub('(.)[`ː]*ḥ', _expand_visarga, text)
|
||||
@@ -0,0 +1,64 @@
|
||||
import re
|
||||
import cn2an
|
||||
import opencc
|
||||
|
||||
|
||||
converter = opencc.OpenCC('zaonhe')
|
||||
|
||||
# (compiled upper-case Latin letter, IPA spelling of its name) pairs.
_latin_to_ipa = [
    (re.compile(letter), ipa)
    for letter, ipa in (
        ('A', 'ᴇ'),
        ('B', 'bi'),
        ('C', 'si'),
        ('D', 'di'),
        ('E', 'i'),
        ('F', 'ᴇf'),
        ('G', 'dʑi'),
        ('H', 'ᴇtɕʰ'),
        ('I', 'ᴀi'),
        ('J', 'dʑᴇ'),
        ('K', 'kʰᴇ'),
        ('L', 'ᴇl'),
        ('M', 'ᴇm'),
        ('N', 'ᴇn'),
        ('O', 'o'),
        ('P', 'pʰi'),
        ('Q', 'kʰiu'),
        ('R', 'ᴀl'),
        ('S', 'ᴇs'),
        ('T', 'tʰi'),
        ('U', 'ɦiu'),
        ('V', 'vi'),
        ('W', 'dᴀbɤliu'),
        ('X', 'ᴇks'),
        ('Y', 'uᴀi'),
        ('Z', 'zᴇ'),
    )
]
|
||||
|
||||
|
||||
def _number_to_shanghainese(num):
    """Return the `cn2an.an2cn` reading of `num` with Shanghainese numeral forms."""
    spoken = cn2an.an2cn(num)
    # Order matters: the two-character rewrites must run before 二 -> 两.
    for standard, local in (('一十', '十'), ('二十', '廿'), ('二', '两')):
        spoken = spoken.replace(standard, local)
    # A unit digit after a bare 十 or after 廿 stays 二.
    return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', spoken)
|
||||
|
||||
|
||||
def number_to_shanghainese(text):
    """Replace every Arabic number in `text` with its Shanghainese reading."""
    def _spell(match):
        return _number_to_shanghainese(match.group())

    return re.sub(r'\d+(?:\.?\d+)?', _spell, text)
|
||||
|
||||
|
||||
def latin_to_ipa(text):
    """Apply the `_latin_to_ipa` letter-name substitutions to `text`, in table order."""
    for pattern, ipa in _latin_to_ipa:
        text = pattern.sub(ipa, text)
    return text
|
||||
|
||||
|
||||
def shanghainese_to_ipa(text):
    """Convert `text` to Shanghainese IPA and normalise its punctuation.

    The text is upper-cased, numbers are spelled out, the module-level OpenCC
    `converter` is applied ('-' removed, '$' turned into a space), remaining
    capital Latin letters are spelled by name, and punctuation is rewritten as
    ASCII followed by one space. Trailing whitespace is stripped.
    """
    text = number_to_shanghainese(text.upper())
    text = converter.convert(text).replace('-','').replace('$',' ')
    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
    # Each class accepts the ASCII and the fullwidth form. The question mark
    # must sit inside a class: a bare `\s*?\s*` is a lazy quantifier that
    # matches the empty string everywhere.
    text = re.sub(r'[、;:\uff1b\uff1a]', ',', text)
    text = re.sub(r'\s*[,\uff0c]\s*', ', ', text)
    text = re.sub(r'\s*。\s*', '. ', text)
    text = re.sub(r'\s*[?\uff1f]\s*', '? ', text)
    text = re.sub(r'\s*[!\uff01]\s*', '! ', text)
    text = re.sub(r'\s*$', '', text)
    return text
|
||||
@@ -0,0 +1,76 @@
|
||||
'''
|
||||
Defines the set of symbols used in text input to the model.
|
||||
'''
|
||||
|
||||
# japanese_cleaners
|
||||
# _pad = '_'
|
||||
# _punctuation = ',.!?-'
|
||||
# _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
|
||||
|
||||
|
||||
'''# japanese_cleaners2
|
||||
_pad = '_'
|
||||
_punctuation = ',.!?-~…'
|
||||
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
|
||||
'''
|
||||
|
||||
|
||||
'''# korean_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = ',.!?…~'
|
||||
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
|
||||
'''
|
||||
|
||||
'''# chinese_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = ',。!?—…'
|
||||
_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
|
||||
'''
|
||||
|
||||
# # zh_ja_mixture_cleaners
|
||||
# _pad = '_'
|
||||
# _punctuation = ',.!?-~…'
|
||||
# _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
|
||||
|
||||
|
||||
'''# sanskrit_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = '।'
|
||||
_letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
|
||||
'''
|
||||
|
||||
'''# cjks_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = ',.!?-~…'
|
||||
_letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
|
||||
'''
|
||||
|
||||
'''# thai_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = '.!? '
|
||||
_letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
|
||||
'''
|
||||
|
||||
# # cjke_cleaners2
|
||||
_pad = '_'
|
||||
_punctuation = ',.!?-~…'
|
||||
_letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
|
||||
|
||||
|
||||
'''# shanghainese_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = ',.!?…'
|
||||
_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
|
||||
'''
|
||||
|
||||
'''# chinese_dialect_cleaners
|
||||
_pad = '_'
|
||||
_punctuation = ',.!?~…─'
|
||||
_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
|
||||
'''
|
||||
|
||||
# Export all symbols: the pad symbol, then each punctuation mark, then each letter.
symbols = [_pad, *_punctuation, *_letters]

# Special symbol ids
SPACE_ID = symbols.index(" ")
|
||||
@@ -0,0 +1,44 @@
|
||||
import re
|
||||
from num_thai.thainumbers import NumThai
|
||||
|
||||
|
||||
num = NumThai()
|
||||
|
||||
# (compiled case-insensitive Latin letter, Thai spelling of its name) pairs.
_latin_to_thai = [
    (re.compile(letter, re.IGNORECASE), thai)
    for letter, thai in (
        ('a', 'เอ'),
        ('b', 'บี'),
        ('c', 'ซี'),
        ('d', 'ดี'),
        ('e', 'อี'),
        ('f', 'เอฟ'),
        ('g', 'จี'),
        ('h', 'เอช'),
        ('i', 'ไอ'),
        ('j', 'เจ'),
        ('k', 'เค'),
        ('l', 'แอล'),
        ('m', 'เอ็ม'),
        ('n', 'เอ็น'),
        ('o', 'โอ'),
        ('p', 'พี'),
        ('q', 'คิว'),
        ('r', 'แอร์'),
        ('s', 'เอส'),
        ('t', 'ที'),
        ('u', 'ยู'),
        ('v', 'วี'),
        ('w', 'ดับเบิลยู'),
        ('x', 'เอ็กซ์'),
        ('y', 'วาย'),
        ('z', 'ซี'),
    )
]
|
||||
|
||||
|
||||
def num_to_thai(text):
    """Replace every number in `text` (commas allowed) with its Thai reading."""
    def _spell(match):
        value = float(match.group(0).replace(',', ''))
        return ''.join(num.NumberToTextThai(value))

    return re.sub(r'(?:\d+(?:,?\d+)?)+(?:\.\d+(?:,?\d+)?)?', _spell, text)
|
||||
|
||||
def latin_to_thai(text):
    """Spell every Latin letter in `text` by its Thai letter name."""
    for pattern, thai in _latin_to_thai:
        text = pattern.sub(thai, text)
    return text
|
||||
+193
@@ -0,0 +1,193 @@
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
DEFAULT_MIN_BIN_WIDTH = 1e-3
|
||||
DEFAULT_MIN_BIN_HEIGHT = 1e-3
|
||||
DEFAULT_MIN_DERIVATIVE = 1e-3
|
||||
|
||||
|
||||
def piecewise_rational_quadratic_transform(inputs,
                                           unnormalized_widths,
                                           unnormalized_heights,
                                           unnormalized_derivatives,
                                           inverse=False,
                                           tails=None,
                                           tail_bound=1.,
                                           min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                                           min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                                           min_derivative=DEFAULT_MIN_DERIVATIVE):
    """Apply a rational-quadratic spline transform to `inputs`.

    With `tails=None` the bounded `rational_quadratic_spline` is used;
    otherwise `unconstrained_rational_quadratic_spline` is called with the
    given `tails` and `tail_bound`.

    Returns:
        `(outputs, logabsdet)` exactly as produced by the selected spline.
    """
    if tails is None:
        spline_fn, spline_kwargs = rational_quadratic_spline, {}
    else:
        spline_fn = unconstrained_rational_quadratic_spline
        spline_kwargs = {'tails': tails, 'tail_bound': tail_bound}

    outputs, logabsdet = spline_fn(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **spline_kwargs
    )
    return outputs, logabsdet
|
||||
|
||||
|
||||
def searchsorted(bin_locations, inputs, eps=1e-6):
    """Return, for each input, the index of the bin that contains it.

    Args:
        bin_locations: Tensor whose last dimension holds ascending bin edges.
        inputs: Tensor of values to locate; compared against the edges with a
            trailing broadcast dimension.
        eps: Amount added to the last edge so that an input equal to it still
            falls into the final bin.

    Returns:
        Integer tensor of bin indices: the number of edges that are less than
        or equal to the input, minus one.
    """
    # Work on a copy: nudging the last edge in place would silently modify
    # the caller's tensor.
    edges = bin_locations.clone()
    edges[..., -1] += eps
    return torch.sum(
        inputs[..., None] >= edges,
        dim=-1
    ) - 1
|
||||
|
||||
|
||||
def unconstrained_rational_quadratic_spline(inputs,
                                            unnormalized_widths,
                                            unnormalized_heights,
                                            unnormalized_derivatives,
                                            inverse=False,
                                            tails='linear',
                                            tail_bound=1.,
                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
    """Rational-quadratic spline on [-tail_bound, tail_bound] with identity tails.

    Inputs inside the interval go through `rational_quadratic_spline`; inputs
    outside it are passed through unchanged with a log-determinant of 0.

    Returns:
        `(outputs, logabsdet)`, both shaped like `inputs`.

    Raises:
        RuntimeError: If `tails` is anything other than 'linear'.
    """
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == 'linear':
        # Pad one boundary derivative on each side; this constant makes
        # min_derivative + softplus(constant) equal to 1 at both ends.
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError('{} tails are not implemented.'.format(tails))

    # Skip the spline when nothing lies inside the interval: its domain check
    # reduces over `inputs` with torch.min/torch.max, which fails on an empty
    # tensor.
    if inside_interval_mask.any():
        outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
            inputs=inputs[inside_interval_mask],
            unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
            unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
            unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
            inverse=inverse,
            left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
            min_bin_width=min_bin_width,
            min_bin_height=min_bin_height,
            min_derivative=min_derivative
        )

    return outputs, logabsdet
|
||||
|
||||
def rational_quadratic_spline(inputs,
                              unnormalized_widths,
                              unnormalized_heights,
                              unnormalized_derivatives,
                              inverse=False,
                              left=0., right=1., bottom=0., top=1.,
                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                              min_derivative=DEFAULT_MIN_DERIVATIVE):
    """Evaluate a rational-quadratic spline mapping [left, right] to [bottom, top].

    Bin widths and heights come from a softmax over the unnormalised
    parameters (each bin at least `min_bin_width` / `min_bin_height` of the
    unit interval); knot derivatives are `min_derivative` plus a softplus.

    Returns:
        `(outputs, logabsdet)`; with `inverse=True` the inverse map is
        evaluated and the log-determinant is negated.

    Raises:
        ValueError: If an input lies outside [left, right], or the minimum bin
            width/height is too large for the number of bins.
    """
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError('Input to a transform is not within its domain')

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError('Minimal bin width too large for the number of bins')
    if min_bin_height * num_bins > 1.0:
        raise ValueError('Minimal bin height too large for the number of bins')

    def _bin_edges(unnormalized, min_size, low, high):
        # Softmax sizes -> cumulative edges scaled to [low, high], with the
        # end points pinned exactly; sizes are re-derived from the edges.
        sizes = F.softmax(unnormalized, dim=-1)
        sizes = min_size + (1 - min_size * num_bins) * sizes
        edges = torch.cumsum(sizes, dim=-1)
        edges = F.pad(edges, pad=(1, 0), mode='constant', value=0.0)
        edges = (high - low) * edges + low
        edges[..., 0] = low
        edges[..., -1] = high
        return edges, edges[..., 1:] - edges[..., :-1]

    cumwidths, widths = _bin_edges(unnormalized_widths, min_bin_width, left, right)
    derivatives = min_derivative + F.softplus(unnormalized_derivatives)
    cumheights, heights = _bin_edges(unnormalized_heights, min_bin_height, bottom, top)

    # The inverse looks the input up along the output axis.
    bin_idx = searchsorted(cumheights if inverse else cumwidths, inputs)[..., None]

    def _at_bin(values):
        return values.gather(-1, bin_idx)[..., 0]

    input_cumwidths = _at_bin(cumwidths)
    input_bin_widths = _at_bin(widths)

    input_cumheights = _at_bin(cumheights)
    input_delta = _at_bin(heights / widths)

    input_derivatives = _at_bin(derivatives)
    input_derivatives_plus_one = _at_bin(derivatives[..., 1:])

    input_heights = _at_bin(heights)

    # Term shared by the quadratic coefficients and the denominator.
    slope_sum = input_derivatives + input_derivatives_plus_one - 2 * input_delta

    if inverse:
        offset = inputs - input_cumheights
        a = offset * slope_sum + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - offset * slope_sum
        c = - input_delta * offset

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        theta = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = theta * input_bin_widths + input_cumwidths
    else:
        theta = (inputs - input_cumwidths) / input_bin_widths

    theta_one_minus_theta = theta * (1 - theta)
    denominator = input_delta + (slope_sum * theta_one_minus_theta)

    if not inverse:
        numerator = input_heights * (input_delta * theta.pow(2)
                                     + input_derivatives * theta_one_minus_theta)
        outputs = input_cumheights + numerator / denominator

    derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
                                                 + 2 * input_delta * theta_one_minus_theta
                                                 + input_derivatives * (1 - theta).pow(2))
    logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

    if inverse:
        return outputs, -logabsdet
    return outputs, logabsdet
|
||||
@@ -0,0 +1,54 @@
|
||||
{
|
||||
"train": {
|
||||
"log_interval": 200,
|
||||
"eval_interval": 1000,
|
||||
"seed": 1234,
|
||||
"epochs": 10000,
|
||||
"learning_rate": 2e-4,
|
||||
"betas": [0.8, 0.99],
|
||||
"eps": 1e-9,
|
||||
"batch_size": 16,
|
||||
"fp16_run": true,
|
||||
"lr_decay": 0.999875,
|
||||
"segment_size": 8192,
|
||||
"init_lr_ratio": 1,
|
||||
"warmup_epochs": 0,
|
||||
"c_mel": 45,
|
||||
"c_kl": 1.0
|
||||
},
|
||||
"data": {
|
||||
"training_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.train.txt.cleaned",
|
||||
"validation_files":"../CH_JA_EN_mix_voice/clipped_3_vits_trilingual_annotations.val.txt.cleaned",
|
||||
"text_cleaners":["cjke_cleaners2"],
|
||||
"max_wav_value": 32768.0,
|
||||
"sampling_rate": 22050,
|
||||
"filter_length": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mel_channels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": null,
|
||||
"add_blank": true,
|
||||
"n_speakers": 999,
|
||||
"cleaned_text": true
|
||||
},
|
||||
"model": {
|
||||
"inter_channels": 192,
|
||||
"hidden_channels": 192,
|
||||
"filter_channels": 768,
|
||||
"n_heads": 2,
|
||||
"n_layers": 6,
|
||||
"kernel_size": 3,
|
||||
"p_dropout": 0.1,
|
||||
"resblock": "1",
|
||||
"resblock_kernel_sizes": [3,7,11],
|
||||
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
||||
"upsample_rates": [8,8,2,2],
|
||||
"upsample_initial_channel": 512,
|
||||
"upsample_kernel_sizes": [16,16,4,4],
|
||||
"n_layers_q": 3,
|
||||
"use_spectral_norm": false,
|
||||
"gin_channels": 256
|
||||
},
|
||||
"symbols": ["_", ",", ".", "!", "?", "-", "~", "\u2026", "N", "Q", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "s", "t", "u", "v", "w", "x", "y", "z", "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f", "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259", "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292", "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0", "`", "^", "#", "*", "=", "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191", " "]
|
||||
}
|
||||
@@ -0,0 +1,258 @@
|
||||
import os
|
||||
import glob
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import subprocess
|
||||
import numpy as np
|
||||
from scipy.io.wavfile import read
|
||||
import torch
|
||||
|
||||
# Module-level flag, initially False.
MATPLOTLIB_FLAG = False

# Send log records to stdout at DEBUG level.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# The logging module itself serves as the logger object for this file.
logger = logging
|
||||
|
||||
|
||||
def load_checkpoint(checkpoint_path, model, optimizer=None):
    """Load model (and optionally optimizer) state from a checkpoint file.

    Parameters missing from the checkpoint keep the model's current values and
    are reported through the logger.

    Args:
        checkpoint_path: Path of an existing checkpoint file.
        model: Model to load into; a wrapper exposing `.module` is unwrapped.
        optimizer: Optional optimizer whose state is restored as well.

    Returns:
        `(model, optimizer, learning_rate, iteration)`.
    """
    assert os.path.isfile(checkpoint_path)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
    target = model.module if hasattr(model, 'module') else model
    new_state_dict = {}
    for k, v in target.state_dict().items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            # Only a missing key is expected here; anything else should surface.
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    target.load_state_dict(new_state_dict)
    logger.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration
|
||||
|
||||
|
||||
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    """Write model and optimizer state, iteration and learning rate to a file.

    A model exposing `.module` is unwrapped before its state dict is taken.
    """
    logger.info("Saving model and optimizer state at iteration {} to {}".format(
        iteration, checkpoint_path))
    unwrapped = model.module if hasattr(model, 'module') else model
    payload = {
        'model': unwrapped.state_dict(),
        'iteration': iteration,
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(payload, checkpoint_path)
|
||||
|
||||
|
||||
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    """Forward each tagged value to the matching `writer.add_*` method.

    Images are written with `dataformats='HWC'`; audio clips are written with
    `audio_sampling_rate`. The dictionaries are only read, never modified.
    """
    for tag, value in scalars.items():
        writer.add_scalar(tag, value, global_step)
    for tag, values in histograms.items():
        writer.add_histogram(tag, values, global_step)
    for tag, image in images.items():
        writer.add_image(tag, image, global_step, dataformats='HWC')
    for tag, audio in audios.items():
        writer.add_audio(tag, audio, global_step, audio_sampling_rate)
|
||||
|
||||
|
||||
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    """Return (and print) the matching file whose path digits form the largest number.

    `regex` is a glob pattern evaluated inside `dir_path`. All digits of each
    full path are concatenated and compared as one integer.
    """
    def _digits_as_int(path):
        return int("".join(filter(str.isdigit, path)))

    candidates = sorted(glob.glob(os.path.join(dir_path, regex)), key=_digits_as_int)
    newest = candidates[-1]
    print(newest)
    return newest
|
||||
|
||||
|
||||
def plot_spectrogram_to_numpy(spectrogram):
    """Render `spectrogram` with matplotlib and return the figure as an RGB array.

    On the first call matplotlib is switched to the "Agg" backend and its
    logger is set to WARNING.

    Returns:
        A uint8 array of shape (height, width, 3).
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # np.frombuffer replaces the deprecated binary mode of
    # np.fromstring(..., sep=''); .copy() keeps the result writable.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)).copy()
    plt.close()
    return data
|
||||
|
||||
|
||||
def plot_alignment_to_numpy(alignment, info=None):
    """Render the transposed `alignment` with matplotlib and return it as an RGB array.

    On the first call matplotlib is switched to the "Agg" backend and its
    logger is set to WARNING.

    Args:
        alignment: 2-D array; plotted transposed.
        info: Optional text appended below the x-axis label.

    Returns:
        A uint8 array of shape (height, width, 3).
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    # np.frombuffer replaces the deprecated binary mode of
    # np.fromstring(..., sep=''); .copy() keeps the result writable.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)).copy()
    plt.close()
    return data
|
||||
|
||||
|
||||
def load_wav_to_torch(full_path):
    """Read a WAV file and return `(float32 sample tensor, sampling rate)`."""
    sampling_rate, samples = read(full_path)
    waveform = torch.FloatTensor(samples.astype(np.float32))
    return waveform, sampling_rate
|
||||
|
||||
|
||||
def load_filepaths_and_text(filename, split="|"):
    """Read a UTF-8 file and split each stripped line on `split`.

    Returns:
        One list of fields per line, in file order.
    """
    rows = []
    with open(filename, encoding='utf-8') as handle:
        for line in handle:
            rows.append(line.strip().split(split))
    return rows
|
||||
|
||||
|
||||
def get_hparams(init=True):
    """Build hyper-parameters from command-line arguments.

    Parses ``-c/--config`` and ``-m/--model`` from ``sys.argv``, makes sure
    the model directory (``../drive/MyDrive/<model>``) exists, and loads
    the JSON configuration.

    Args:
        init: When true, the file given by ``--config`` is copied to
            ``<model_dir>/config.json`` and used. When false, the
            previously saved ``<model_dir>/config.json`` is loaded instead.

    Returns:
        An ``HParams`` built from the JSON object, with ``model_dir`` set
        to the resolved model directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/uma87.json",
                        help='JSON file for configuration')
    parser.add_argument('-m', '--model', type=str, default="./pretrained_models/uma_G_0.pth",
                        help='Model name')

    args = parser.parse_args()
    model_dir = os.path.join("../drive/MyDrive", args.model)

    # exist_ok avoids the check-then-create race when several processes
    # start at the same time.
    os.makedirs(model_dir, exist_ok=True)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    # JSON is read and written as UTF-8 explicitly so the result does not
    # depend on the platform's locale encoding.
    if init:
        with open(config_path, "r", encoding="utf-8") as f:
            data = f.read()
        with open(config_save_path, "w", encoding="utf-8") as f:
            f.write(data)
    else:
        with open(config_save_path, "r", encoding="utf-8") as f:
            data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams
|
||||
|
||||
|
||||
def get_hparams_from_dir(model_dir):
    """Load hyper-parameters from ``<model_dir>/config.json``.

    Args:
        model_dir: Directory containing ``config.json``.

    Returns:
        An ``HParams`` built from the JSON object, with ``model_dir`` set
        to the given directory.
    """
    config_save_path = os.path.join(model_dir, "config.json")
    # Decode as UTF-8 explicitly; the default would follow the platform
    # locale and can fail or mis-decode non-ASCII configuration values.
    with open(config_save_path, "r", encoding="utf-8") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams
|
||||
|
||||
|
||||
def get_hparams_from_file(config_path):
    """Load hyper-parameters from a JSON configuration file.

    Args:
        config_path: Path of the JSON file to read.

    Returns:
        An ``HParams`` built from the top-level JSON object.
    """
    # Decode as UTF-8 explicitly; the default would follow the platform
    # locale and can fail or mis-decode non-ASCII configuration values.
    with open(config_path, "r", encoding="utf-8") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams
|
||||
|
||||
|
||||
def check_git_hash(model_dir):
    """Compare the current git HEAD with the hash stored in *model_dir*.

    If the directory containing this source file has no ``.git`` entry a
    warning is logged and nothing else happens. Otherwise the output of
    ``git rev-parse HEAD`` is compared with ``<model_dir>/githash``: a
    mismatch is logged as a warning, and a missing file is created with
    the current hash.

    Args:
        model_dir: Directory holding (or receiving) the ``githash`` file.
    """
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        # ``warning`` replaces the deprecated ``warn`` alias.
        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
            source_dir
        ))
        return

    # NOTE(review): git runs in the process's working directory, not in
    # source_dir -- confirm that is intended.
    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        # Context managers close the file handles the original left open.
        with open(path) as f:
            saved_hash = f.read()
        if saved_hash != cur_hash:
            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
                saved_hash[:8], cur_hash[:8]))
    else:
        with open(path, "w") as f:
            f.write(cur_hash)
|
||||
|
||||
|
||||
def get_logger(model_dir, filename="train.log"):
    """Create (or fetch) a file-backed logger for *model_dir*.

    The logger is named after the last path component of *model_dir*, set
    to ``DEBUG`` and bound to the module-level ``logger`` global. A
    ``FileHandler`` writing to ``<model_dir>/<filename>`` is attached
    unless one for that same file is already present, so repeated calls do
    not duplicate every log line.

    Args:
        model_dir: Directory for the log file; created if missing.
        filename: Name of the log file inside *model_dir*.

    Returns:
        The configured ``logging.Logger``.
    """
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    # exist_ok avoids the check-then-create race between processes.
    os.makedirs(model_dir, exist_ok=True)

    # FileHandler stores its target as an absolute path in ``baseFilename``,
    # so compare against the absolute form.
    log_path = os.path.abspath(os.path.join(model_dir, filename))
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == log_path
        for existing in logger.handlers
    )
    if not already_attached:
        h = logging.FileHandler(log_path)
        h.setLevel(logging.DEBUG)
        h.setFormatter(formatter)
        logger.addHandler(h)
    return logger
|
||||
|
||||
|
||||
class HParams():
    """Attribute-style container for (nested) hyper-parameters.

    Every keyword argument becomes an instance attribute; mapping values
    are converted recursively into nested ``HParams``. The object also
    offers a small dict-like interface (``keys``/``items``/``values``,
    ``len``, ``in``, ``[]``) backed by the instance ``__dict__``.
    """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            # isinstance (rather than an exact type comparison) also
            # converts dict subclasses such as OrderedDict.
            if isinstance(v, dict):
                v = HParams(**v)
            self[k] = v

    def keys(self):
        """Return a view of the stored parameter names."""
        return self.__dict__.keys()

    def items(self):
        """Return a view of ``(name, value)`` pairs."""
        return self.__dict__.items()

    def values(self):
        """Return a view of the stored parameter values."""
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
|
||||
Reference in New Issue
Block a user