python – Conv-Variational-Autoencoder Loss is NaN

I am training a Variational Autoencoder. Suddenly my Loss Explodes and then becomes NaN
and I dont know why.
When evaluating the trained Vae on an Image the output data has Inf value, so I guess its happening in the sampling Method of the VAE, but why does it suddenly explode and how can i prevent it?


class VAE(nn.Module):
   def __init__(self, input_shape, z_dim):
       super().__init__()
       self.z_dim = z_dim
       self.input_shape = input_shape

       # encoder
       self.encoder_conv = nn.Sequential(
           nn.Conv2d(1, 32, 3, stride=2, padding=1),
           nn.BatchNorm2d(32),
           nn.LeakyReLU(),
           nn.Conv2d(32, 64, 3, stride=2, padding=1),
           nn.BatchNorm2d(64),
           nn.LeakyReLU(),
           nn.Conv2d(64, 64, 3, stride=2, padding=1),
           nn.BatchNorm2d(64),
           nn.LeakyReLU(),
           nn.Conv2d(64, 64, 3, stride=2, padding=1),
           nn.BatchNorm2d(64),
           nn.LeakyReLU()
       )
       self.conv_out_size = self._get_conv_out_size(input_shape)
       self.mu = nn.Sequential(
           nn.Linear(self.conv_out_size, z_dim),
           nn.LeakyReLU(),
           nn.Dropout(0.2)
       )
       self.log_var = nn.Sequential(
           nn.Linear(self.conv_out_size, z_dim),
           nn.LeakyReLU(),
           nn.Dropout(0.2)
       )

       # decoder
       self.decoder_linear = nn.Sequential(
           nn.Linear(z_dim, self.conv_out_size),
           nn.LeakyReLU(),
           nn.Dropout(0.2)
       )
       
       self.decoder_conv = nn.Sequential(
           nn.UpsamplingNearest2d(scale_factor=2),
           nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
           nn.BatchNorm2d(64),
           nn.LeakyReLU(),
           nn.UpsamplingNearest2d(scale_factor=2),
           nn.ConvTranspose2d(64, 64, 3, stride=1, padding=1),
           nn.BatchNorm2d(64),
           nn.LeakyReLU(),
           nn.UpsamplingNearest2d(scale_factor=2),
           nn.ConvTranspose2d(64, 32, 3, stride=1, padding=1),
           nn.BatchNorm2d(32),
           nn.LeakyReLU(),
           nn.UpsamplingNearest2d(scale_factor=2),
           nn.ConvTranspose2d(32, 1, 3, stride=1, padding=(5,3)),
           nn.Sigmoid()
       )

   def sampling(self, mu, log_var):
       ## TODO: epsilon should be at the model's device (not CUDA)
       epsilon = torch.Tensor(np.random.normal(size=(self.z_dim), scale=1.0)).cuda()
       return mu + epsilon * torch.exp(log_var / 2)

   def forward_encoder(self, x):
       x = self.encoder_conv(x)
       x = x.view(x.size()(0), -1)
       mu_p = self.mu(x)
       log_var_p = self.log_var(x)
       return (mu_p, log_var_p)

   def forward_decoder(self, x):
       x = self.decoder_linear(x)
       x = x.view(x.size()(0), *self.conv_out_shape(1:))
       x = self.decoder_conv(x)
       return x

   def forward(self, x):
       mu_p, log_var_p = self.forward_encoder(x)
       x = self.sampling(mu_p, log_var_p)
       images_p = self.forward_decoder(x)
       return (mu_p, log_var_p, images_p)

   def _get_conv_out_size(self, shape):
       out = self.encoder_conv(torch.zeros(1, *shape))
       self.conv_out_shape = out.size()
       return int(np.prod(self.conv_out_shape))


   def forward_no_epsilon(self, x):
       mu_p, log_var_p = self.forward_encoder(x)
       x = mu_p
       images_p = self.forward_decoder(x)
       return images_p

Loss:

def kl_loss(mu, log_var):
    # TODO: dividir entre el numero de batches? 
    return -0.5 * torch.mean(1 + log_var - mu.pow(2) - torch.exp(log_var))

def r_loss(y_train, y_pred):
    r_loss = torch.mean((y_train - y_pred) ** 2)
    return r_loss

train:

mu_v, log_var_v, images_out_v = vae(images_v)
r_loss_v = r_loss(images_out_v, labels_v)
kl_loss_v = kl_loss(mu_v, log_var_v)
loss = kl_loss_v + r_loss_v * 10000.0
loss.backward()
optimizer.step()

Losses: