
[AI] Autoencoder Implementation

A Simple Implementation of a (Vanilla) Autoencoder

1. (Convolutional) Autoencoder with the MNIST dataset

1) Import Libraries

# AE.ipynb
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

import torch
from torchvision import datasets
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F

2) CUDA Setting

# CUDA setting
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"CUDA GPU : {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
CUDA GPU : NVIDIA GeForce RTX 4070 Ti SUPER

3) Dataset Download & Parse

# Dataset download (MNIST)

BATCH_SIZE = 512

train_data = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
test_data =  datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
test_loader =  torch.utils.data.DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=8)
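
As a quick sanity check (not part of the original notebook), you can peek at one batch to confirm the tensor shapes the model will see:

# Sanity check (illustrative): inspect one batch from the loader
x_sample, y_sample = next(iter(train_loader))
print(x_sample.shape)  # torch.Size([512, 1, 28, 28])
print(y_sample.shape)  # torch.Size([512])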

4) Model Architecture & Structure

# Model implementation : Autoencoder
# Input tensor : (batch_size, 1, 28, 28) -> Output : reconstruction (batch_size, 1, 28, 28) and latent (batch_size, 64, 1, 1)

# Architecture : 1x28x28 -> 16x14x14 -> 32x7x7 -> 64x1x1 (latent) -> 32x7x7 -> 16x14x14 -> 1x28x28

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()

        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1), # 1x28x28 -> 16x14x14
            nn.ReLU(),

            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1), # 16x14x14 -> 32x7x7
            nn.ReLU(),

            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=7), # 32x7x7 -> 64x1x1
            nn.ReLU(),
        )

        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=7), # 64x1x1 -> 32x7x7
            nn.ReLU(),

            nn.ConvTranspose2d(in_channels=32, out_channels=16, kernel_size=3, stride=2, padding=1, output_padding=1), # 32x7x7 -> 16x14x14
            nn.ReLU(),

            nn.ConvTranspose2d(in_channels=16, out_channels=1, kernel_size=3, stride=2, padding=1, output_padding=1) # 16x14x14 -> 1x28x28
        )
    
    def forward(self, x):
        x = self.encoder(x)
        # x.shape == (batch, 64, 1, 1)
        latent_x = x
        x = self.decoder(x)

        return x, latent_x
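
Each conv layer follows out = floor((in + 2*padding - kernel) / stride) + 1, so the spatial size goes 28 -> 14 -> 7 -> 1 through the encoder. A minimal shape check (a sketch, assuming the class above):

# Shape sanity check (illustrative)
dummy = torch.randn(2, 1, 28, 28)
recon, latent = Autoencoder()(dummy)
print(latent.shape)  # torch.Size([2, 64, 1, 1])
print(recon.shape)   # torch.Size([2, 1, 28, 28])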

5) Training the Defined Model

# Model Training
from IPython import display

model = Autoencoder().to(device)

epochs = 300

learning_rate = 2E-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

losses = []

for epoch_idx in trange(epochs):
    model.train()

    running_loss = 0.0
    total_batches = 0

    for (x_train, _) in train_loader:
        x_train = x_train.to(device)
        output, __ = model(x_train)
        
        loss = loss_fn(output, x_train)
        running_loss += loss.item()
        total_batches += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    losses.append(running_loss / total_batches)

    display.clear_output(wait=True)
    plt.figure(figsize=(8, 5))

    xrange = [i for i in range(1, epoch_idx+2)]
    if len(xrange) > 30:
        xrange = xrange[-30:]
        yrange = losses[xrange[0]-1:xrange[-1]]
    else:
        yrange = losses[:]
    
    plt.plot(xrange,yrange, linestyle='--', linewidth=2, c='r')
    plt.scatter(xrange, yrange, c='red', s=40)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.show()

[Figure : training loss curve over the most recent epochs]

100%|█████████████████████████████████████████████████████████████████████████████████| 300/300 [03:11<00:00,  1.57it/s]
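
The loop above only tracks the training loss; a minimal sketch for checking the reconstruction MSE on the test set (not in the original notebook) could look like this:

# Test-set reconstruction MSE (illustrative)
model.eval()
test_loss, n_batches = 0.0, 0
with torch.no_grad():
    for x_test, _ in test_loader:
        x_test = x_test.to(device)
        recon, _ = model(x_test)
        test_loss += loss_fn(recon, x_test).item()
        n_batches += 1
print(f"Test MSE : {test_loss / n_batches:.6f}")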

6) Visualization of the Model's Reconstructions

import matplotlib.pyplot as plt
import torch

model.eval() # switch to inference mode before visualizing reconstructions

start_idx = 200
num_pairs = 100
pairs_per_row = 10

plt.figure(figsize=(20, 10))

for i in range(num_pairs):
    row = i // pairs_per_row
    col = i % pairs_per_row

    # Original Image
    plt.subplot(10, 20, 2 * col + 1 + 20 * row)
    data = test_data[start_idx + i][0]
    img = data.reshape(28, 28)
    plt.imshow(img, cmap='gray')
    plt.axis('off')

    # Reconstructed Image
    plt.subplot(10, 20, 2 * col + 2 + 20 * row)
    with torch.no_grad():
        reconstructed_images, _ = model(data.unsqueeze(0).to(device))
    reconstructed_images = reconstructed_images.cpu()
    plt.imshow(reconstructed_images.squeeze().reshape(28, 28), cmap='gray')
    plt.axis('off')

plt.subplots_adjust(wspace=0.1, hspace=0.1)
plt.show()

[Figure : original (left) and reconstructed (right) test digits]

7) Saving the Model

# print(f"Result loss : {losses[-1]}")
torch.save(model, "./1. Autoencoder/Autoencoder_MNIST.pt") # Why not ./Autoencoder_MNIST.pt??
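
Note that torch.save(model, ...) pickles the whole module, which ties the checkpoint to this exact class definition. The more portable pattern is to save only the weights via state_dict; a sketch (the file name Autoencoder_MNIST_weights.pt is hypothetical):

# Alternative (illustrative): save / load only the weights
torch.save(model.state_dict(), "./Autoencoder_MNIST_weights.pt")

restored = Autoencoder().to(device)
restored.load_state_dict(torch.load("./Autoencoder_MNIST_weights.pt", map_location=device))
restored.eval()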

2. 2D Decomposition of the Latent Space

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches

model = torch.load("./Autoencoder_MNIST.pt", map_location=device)
model.eval()

latent_vectors = []
label = []
colors = ['#fe7c73', '#2471A3', '#3498DB', '#27AE60', '#82E0AA', '#D35400', '#5D6D7E', '#E74C3C', '#21618C', '#B7950B']

for (X, y) in tqdm(test_data):
    with torch.no_grad():
        _, latent_x = model(X.to(device))
        latent_vectors.append(latent_x.reshape(64).cpu().numpy())
    label.append(y)

color_labeled = [colors[label[i]] for i in range(len(label))]
all_latent_vectors = np.vstack(latent_vectors)

# decomposition_model = PCA(n_components=2) # Not that good performance..
decomposition_model = TSNE(n_components=2)
latent_2d = decomposition_model.fit_transform(all_latent_vectors)

patches = [mpatches.Patch(color=colors[i], label=f'{i}') for i in range(len(colors))]
plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.scatter(latent_2d[:, 0], latent_2d[:, 1], s=1, c = color_labeled)
plt.title('2D decomposition of Latent Vectors')
plt.show()
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:05<00:00, 1859.40it/s]

[Figure : 2D t-SNE decomposition of the test-set latent vectors, colored by digit label]

Actually, I wanted to show the shortcomings of the vanilla Autoencoder.

The vanilla Autoencoder has the following shortcomings:

  • It is hard to analyze the latent space of an AE.

  • The learned latent space is not continuous (see the sketch after this list).

  • The quality of the generated results is low.

These shortcomings lead us to the VAE (Variational Autoencoder).

See this blog for more details.
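
One quick way to see the discontinuity (a sketch, assuming the model trained above): decode points linearly interpolated between the latent vectors of two test images. With a vanilla AE, the intermediate decodings often do not look like valid digits.

# Latent interpolation sketch (illustrative)
model.eval()
with torch.no_grad():
    _, z0 = model(test_data[0][0].unsqueeze(0).to(device))
    _, z1 = model(test_data[1][0].unsqueeze(0).to(device))

    fig, axes = plt.subplots(1, 8, figsize=(12, 2))
    for i, a in enumerate(np.linspace(0, 1, 8)):
        z = (1 - a) * z0 + a * z1            # interpolate in latent space
        img = model.decoder(z).cpu().view(28, 28)
        axes[i].imshow(img, cmap='gray')
        axes[i].axis('off')
plt.show()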

3. Can we use the Autoencoder as a generative model?

model = torch.load("./Autoencoder_MNIST.pt", map_location=device)
model.eval()

label_latent_vectors = [torch.zeros(64).to(device) for _ in range(10)]
label_num = [0 for _ in range(10)]

for (X, y) in tqdm(test_data):
    with torch.no_grad():
        _, latent_x = model(X.to(device))
        label_latent_vectors[y] += latent_x.view(64)
        label_num[y] += 1


noise_std = 1
label_latent_vectors = [(label_latent_vectors[i] / label_num[i] + noise_std * torch.randn(64).to(device)).view(64, 1, 1) for i in range(10)]
label_mean_img = []

with torch.no_grad():
    for i in range(10):
        label_mean_img.append(model.decoder(label_latent_vectors[i]).cpu())

fig, axes = plt.subplots(1, 10, figsize=(15, 2))

for i in range(10):
    axes[i].imshow(label_mean_img[i].view(28, 28), cmap='gray')
    axes[i].axis('off')
    axes[i].set_title(str(i))

plt.show()
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:04<00:00, 2015.46it/s]

[Figure : digits 0-9 decoded from the per-class mean latent vectors plus noise]

assert len(all_latent_vectors) == 10000

reshaped_label_latent_vectors = np.stack([llv.view(64).cpu().numpy() for llv in label_latent_vectors])
new_latent_vectors = np.vstack([all_latent_vectors, reshaped_label_latent_vectors])


decomposition_model = TSNE(n_components=2)
latent_2d = decomposition_model.fit_transform(new_latent_vectors)

patches = [mpatches.Patch(color=colors[i], label=f'{i}') for i in range(len(colors))]
plt.legend(handles=patches, bbox_to_anchor=(1.05, 1), loc='upper left')

plt.scatter(latent_2d[:, 0], latent_2d[:, 1], s=1, c = color_labeled)
plt.scatter(latent_2d[:, 0][-10:], latent_2d[:, 1][-10:], s = 10, c = 'black')

plt.title('2D decomposition of Latent Vectors')
plt.show()

[Figure : t-SNE decomposition with the ten constructed latent vectors marked in black]

These results show that the AE is not good enough to serve as a generative model.

I constructed each new latent vector from the mean of that label's latent vectors, plus a little noise.

But the reconstructed images are not that clear, even though the latent vectors obviously lie inside the 2D clusters.

So the Autoencoder cannot be used as a generative model, and this stems from the discontinuity of its latent space.

(If the model's input becomes LARGE & COMPLICATED, these shortcomings will stand out even more.)

This post is licensed under CC BY 4.0 by the author.
