|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
|
|
|
|
|
|
class Conv1d1x1(nn.Conv1d):
    """Pointwise (kernel-size-1) 1D convolution."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Initialize the 1x1 convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bias (bool): Whether to include a learnable bias term.
        """
        super().__init__(in_channels, out_channels, kernel_size=1, bias=bias)
|
|
|
|
|
|
|
|
class Conv1d(nn.Module):
    """Conv1d wrapper that derives "same"-style padding when none is given."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = -1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
    ):
        """Initialize the convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Size of the convolving kernel.
            stride (int): Stride of the convolution.
            padding (int): Zero-padding added to both sides; a negative value
                requests automatic "same" padding (exact for odd kernel sizes
                at stride 1).
            dilation (int): Spacing between kernel elements.
            groups (int): Number of blocked connections.
            bias (bool): Whether to include a learnable bias term.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.dilation = dilation
        # Negative padding means: derive length-preserving padding automatically.
        if padding < 0:
            padding = (kernel_size - 1) // 2 * dilation
        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        """
        Args:
            x (Tensor): Float tensor variable with the shape (B, C, T).

        Returns:
            Tensor: Float tensor variable with the shape (B, C, T).

        """
        return self.conv(x)
|
|
|
|
|
|
|
|
class ResidualUnit(nn.Module):
    """Residual unit: activation -> dilated conv -> activation -> 1x1 conv, plus skip.

    NOTE: the skip connection `x + y` requires in_channels == out_channels.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size=3,
        dilation=1,
        bias=False,
        nonlinear_activation="ELU",
        nonlinear_activation_params=None,
    ):
        """Initialize the residual unit.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels (must equal
                in_channels for the residual addition).
            kernel_size (int): Kernel size of the dilated convolution.
            dilation (int): Dilation of the first convolution.
            bias (bool): Whether the convolutions use a bias term.
            nonlinear_activation (str): Name of an activation class in torch.nn.
            nonlinear_activation_params (dict | None): Keyword arguments for the
                activation; None means no extra arguments.
        """
        super().__init__()
        # Fix: use a None sentinel instead of a mutable `{}` default argument,
        # which would be shared across all instances.
        if nonlinear_activation_params is None:
            nonlinear_activation_params = {}
        self.activation = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
        self.conv1 = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=1,
            dilation=dilation,
            bias=bias,
        )
        self.conv2 = Conv1d1x1(out_channels, out_channels, bias)

    def forward(self, x):
        """
        Args:
            x (Tensor): Float tensor variable with the shape (B, C, T).

        Returns:
            Tensor: Residual output with the shape (B, C, T).
        """
        y = self.conv1(self.activation(x))
        y = self.conv2(self.activation(y))
        return x + y
|
|
|
|
|
|
|
|
class ConvTranspose1d(nn.Module):
    """Transposed Conv1d wrapper with stride-derived default padding."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        padding=-1,
        output_padding=-1,
        groups=1,
        bias=True,
    ):
        """Initialize the transposed convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Size of the convolving kernel.
            stride (int): Up-sampling factor.
            padding (int): Input padding; a negative value derives it from the
                stride. With kernel_size == 2 * stride the derived defaults
                up-sample the length exactly by `stride`.
            output_padding (int): Extra size added to the output; a negative
                value derives it from the stride parity.
            groups (int): Number of blocked connections.
            bias (bool): Whether to include a learnable bias term.
        """
        super().__init__()
        if padding < 0:
            padding = (stride + 1) // 2
        if output_padding < 0:
            output_padding = 1 if stride % 2 else 0
        self.deconv = nn.ConvTranspose1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            output_padding=output_padding,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        """
        Args:
            x (Tensor): Float tensor variable with the shape (B, C, T).

        Returns:
            Tensor: Float tensor variable with the shape (B, C', T').

        """
        return self.deconv(x)
|
|
|
|
|
|
|
|
class EncoderBlock(nn.Module):
    """Stack of residual units followed by a strided down-sampling convolution."""

    def __init__(
        self, in_channels: int, out_channels: int, stride: int, dilations=(1, 1), unit_kernel_size=3, bias=True
    ):
        """Initialize the encoder block.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int): Down-sampling factor of the final convolution.
            dilations (tuple): Dilation of each residual unit.
            unit_kernel_size (int): Kernel size inside the residual units.
            bias (bool): Whether the down-sampling conv uses a bias term.
        """
        super().__init__()
        self.res_units = torch.nn.ModuleList(
            ResidualUnit(in_channels, in_channels, kernel_size=unit_kernel_size, dilation=d) for d in dilations
        )
        self.num_res = len(self.res_units)

        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            # Length-preserving kernel when stride == 1, otherwise 2x the stride.
            kernel_size=3 if stride == 1 else (2 * stride),
            stride=stride,
            bias=bias,
        )

    def forward(self, x):
        """Apply the residual units, then down-sample."""
        for unit in self.res_units:
            x = unit(x)
        return self.conv(x)
|
|
|
|
|
|
|
|
class Encoder(nn.Module):
    """Input convolution followed by a sequence of down-sampling encoder blocks."""

    def __init__(
        self,
        input_channels: int,
        encode_channels: int,
        channel_ratios=(1, 1),
        strides=(1, 1),
        kernel_size=3,
        bias=True,
        block_dilations=(1, 1),
        unit_kernel_size=3,
    ):
        """Initialize the encoder.

        Args:
            input_channels (int): Number of channels of the input signal.
            encode_channels (int): Base channel count; per-block widths are
                encode_channels * channel_ratios[idx].
            channel_ratios (tuple): Channel multiplier per encoder block.
            strides (tuple): Down-sampling stride per encoder block; must have
                the same length as channel_ratios.
            kernel_size (int): Kernel size of the input convolution.
            bias (bool): Whether the encoder blocks use bias terms.
            block_dilations (tuple): Dilations of the residual units per block.
            unit_kernel_size (int): Kernel size inside the residual units.
        """
        super().__init__()
        assert len(channel_ratios) == len(strides)

        self.conv = Conv1d(
            in_channels=input_channels, out_channels=encode_channels, kernel_size=kernel_size, stride=1, bias=False
        )
        self.conv_blocks = torch.nn.ModuleList()
        in_channels = encode_channels
        # Fix: pre-initialize so `self.out_channels` is well-defined even when
        # `strides` is empty (previously raised NameError after the loop).
        out_channels = encode_channels
        for idx, stride in enumerate(strides):
            out_channels = int(encode_channels * channel_ratios[idx])
            self.conv_blocks += [
                EncoderBlock(
                    in_channels,
                    out_channels,
                    stride,
                    dilations=block_dilations,
                    unit_kernel_size=unit_kernel_size,
                    bias=bias,
                )
            ]
            in_channels = out_channels
        self.num_blocks = len(self.conv_blocks)
        self.out_channels = out_channels

    def forward(self, x):
        """
        Args:
            x (Tensor): Float tensor variable with the shape (B, C, T).

        Returns:
            Tensor: Encoded feature with the shape (B, out_channels, T').
        """
        x = self.conv(x)
        for i in range(self.num_blocks):
            x = self.conv_blocks[i](x)
        return x
|
|
|
|
|
|
|
|
class DecoderBlock(nn.Module):
    """Decoder block: up-sampling (or plain) conv followed by residual units."""

    def __init__(
        self, in_channels: int, out_channels: int, stride: int, dilations=(1, 1), unit_kernel_size=3, bias=True
    ):
        """Initialize the decoder block.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            stride (int): Up-sampling factor; 1 keeps the temporal resolution.
            dilations (tuple): Dilation of each residual unit.
            unit_kernel_size (int): Kernel size inside the residual units.
            bias (bool): Whether the (de)convolution uses a bias term.
        """
        super().__init__()

        # stride == 1: no up-sampling, use a plain length-preserving conv.
        if stride == 1:
            self.conv = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=stride,
                bias=bias,
            )
        else:
            self.conv = ConvTranspose1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(2 * stride),
                stride=stride,
                bias=bias,
            )

        self.res_units = torch.nn.ModuleList(
            ResidualUnit(out_channels, out_channels, kernel_size=unit_kernel_size, dilation=d) for d in dilations
        )
        self.num_res = len(self.res_units)

    def forward(self, x):
        """Up-sample (if stride > 1), then apply the residual units."""
        x = self.conv(x)
        for unit in self.res_units:
            x = unit(x)
        return x
|
|
|
|
|
|
|
|
class Decoder(nn.Module):
    """Mirror of the encoder: code projection, up-sampling blocks, output conv."""

    def __init__(
        self,
        code_dim: int,
        output_channels: int,
        decode_channels: int,
        channel_ratios=(1, 1),
        strides=(1, 1),
        kernel_size=3,
        bias=True,
        block_dilations=(1, 1),
        unit_kernel_size=3,
    ):
        """Initialize the decoder.

        Args:
            code_dim (int): Dimension of the input code.
            output_channels (int): Number of channels of the output signal.
            decode_channels (int): Base channel count; per-block widths are
                decode_channels * channel_ratios[idx].
            channel_ratios (tuple): Channel multiplier per decoder block.
            strides (tuple): Up-sampling stride per decoder block; must have
                the same length as channel_ratios.
            kernel_size (int): Kernel size of the first and last convolutions.
            bias (bool): Whether the decoder blocks use bias terms.
            block_dilations (tuple): Dilations of the residual units per block.
            unit_kernel_size (int): Kernel size inside the residual units.
        """
        super().__init__()
        assert len(channel_ratios) == len(strides)

        self.conv1 = Conv1d(
            in_channels=code_dim,
            out_channels=int(decode_channels * channel_ratios[0]),
            kernel_size=kernel_size,
            stride=1,
            bias=False,
        )

        self.conv_blocks = torch.nn.ModuleList()
        for idx, stride in enumerate(strides):
            in_channels = int(decode_channels * channel_ratios[idx])
            # The last block narrows to the base channel count.
            is_last = idx == len(channel_ratios) - 1
            out_channels = decode_channels if is_last else int(decode_channels * channel_ratios[idx + 1])
            self.conv_blocks.append(
                DecoderBlock(
                    in_channels,
                    out_channels,
                    stride,
                    dilations=block_dilations,
                    unit_kernel_size=unit_kernel_size,
                    bias=bias,
                )
            )
        self.num_blocks = len(self.conv_blocks)

        self.conv2 = Conv1d(out_channels, output_channels, kernel_size, 1, bias=False)

    def forward(self, z):
        """
        Args:
            z (Tensor): Code tensor with the shape (B, code_dim, T).

        Returns:
            Tensor: Decoded signal with the shape (B, output_channels, T').
        """
        x = self.conv1(z)
        for block in self.conv_blocks:
            x = block(x)
        return self.conv2(x)
|
|
|