
145 PyTorch Tricks

This is a series of useful PyTorch tricks inspired by vainaijr's YouTube channel.
This notebook implements all of these techniques and is designed to best demonstrate their usefulness.

Trick #1

Visualizing a model using torchsummaryX

In [0]:
import torch
import torchvision.models as models
from Utils import *

Here we will build a Single Shot Detection (SSD) model with just 20 classes.

In [0]:
# Create SSD300 with pretrained weights in the base-architecture
n_classes = 20
model = SSD300(n_classes)
Downloading: "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth" to /root/.cache/torch/checkpoints/vgg16_bn-6c64b313.pth
100%|██████████| 528M/528M [00:05<00:00, 107MB/s]
Loaded base model with pre-trained weights

In [0]:
# install torchsummaryX
!pip install torchsummaryX
Collecting torchsummaryX
  Downloading https://files.pythonhosted.org/packages/36/23/87eeaaf70daa61aa21495ece0969c50c446b8fd42c4b8905af264b40fe7f/torchsummaryX-1.3.0-py3-none-any.whl
Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from torchsummaryX) (0.25.3)
Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchsummaryX) (1.3.1+cu100)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchsummaryX) (1.17.3)
Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas->torchsummaryX) (2.6.1)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->torchsummaryX) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil>=2.6.1->pandas->torchsummaryX) (1.12.0)
Installing collected packages: torchsummaryX
Successfully installed torchsummaryX-1.3.0
In [0]:
from torchsummaryX import summary

summary(model, input) takes our instantiated model and a dummy input with the correct shape

In [0]:
# pseudo input of batch size = 3, num_channel = 3, pixel: 300x300
summary(model, torch.zeros((3,3,300,300)))
==================================================================================================
                                         Kernel Shape        Output Shape  \
Layer                                                                       
0_base.Conv2d_conv1_1                   [3, 64, 3, 3]   [3, 64, 300, 300]   
1_base.BatchNorm2d_bn_1_1                        [64]   [3, 64, 300, 300]   
2_base.Conv2d_conv1_2                  [64, 64, 3, 3]   [3, 64, 300, 300]   
3_base.BatchNorm2d_bn_1_2                        [64]   [3, 64, 300, 300]   
4_base.MaxPool2d_pool1                              -   [3, 64, 150, 150]   
5_base.Conv2d_conv2_1                 [64, 128, 3, 3]  [3, 128, 150, 150]   
6_base.BatchNorm2d_bn_2_1                       [128]  [3, 128, 150, 150]   
7_base.Conv2d_conv2_2                [128, 128, 3, 3]  [3, 128, 150, 150]   
8_base.BatchNorm2d_bn_2_2                       [128]  [3, 128, 150, 150]   
9_base.MaxPool2d_pool2                              -    [3, 128, 75, 75]   
10_base.Conv2d_conv3_1               [128, 256, 3, 3]    [3, 256, 75, 75]   
11_base.BatchNorm2d_bn_3_1                      [256]    [3, 256, 75, 75]   
12_base.Conv2d_conv3_2               [256, 256, 3, 3]    [3, 256, 75, 75]   
13_base.BatchNorm2d_bn_3_2                      [256]    [3, 256, 75, 75]   
14_base.Conv2d_conv3_3               [256, 256, 3, 3]    [3, 256, 75, 75]   
15_base.BatchNorm2d_bn_3_3                      [256]    [3, 256, 75, 75]   
16_base.MaxPool2d_pool3                             -    [3, 256, 38, 38]   
17_base.Conv2d_conv4_1               [256, 512, 3, 3]    [3, 512, 38, 38]   
18_base.BatchNorm2d_bn_4_1                      [512]    [3, 512, 38, 38]   
19_base.Conv2d_conv4_2               [512, 512, 3, 3]    [3, 512, 38, 38]   
20_base.BatchNorm2d_bn_4_2                      [512]    [3, 512, 38, 38]   
21_base.Conv2d_conv4_3               [512, 512, 3, 3]    [3, 512, 38, 38]   
22_base.BatchNorm2d_bn_4_3                      [512]    [3, 512, 38, 38]   
23_base.MaxPool2d_pool4                             -    [3, 512, 19, 19]   
24_base.Conv2d_conv5_1               [512, 512, 3, 3]    [3, 512, 19, 19]   
25_base.BatchNorm2d_bn_5_1                      [512]    [3, 512, 19, 19]   
26_base.Conv2d_conv5_2               [512, 512, 3, 3]    [3, 512, 19, 19]   
27_base.BatchNorm2d_bn_5_2                      [512]    [3, 512, 19, 19]   
28_base.Conv2d_conv5_3               [512, 512, 3, 3]    [3, 512, 19, 19]   
29_base.BatchNorm2d_bn_5_3                      [512]    [3, 512, 19, 19]   
30_base.MaxPool2d_pool5                             -    [3, 512, 19, 19]   
31_base.Conv2d_conv6                [512, 1024, 3, 3]   [3, 1024, 19, 19]   
32_base.Conv2d_conv7               [1024, 1024, 1, 1]   [3, 1024, 19, 19]   
33_aux_convs.Conv2d_conv8_1         [1024, 256, 1, 1]    [3, 256, 19, 19]   
34_aux_convs.Conv2d_conv8_2          [256, 512, 3, 3]    [3, 512, 10, 10]   
35_aux_convs.Conv2d_conv9_1          [512, 128, 1, 1]    [3, 128, 10, 10]   
36_aux_convs.Conv2d_conv9_2          [128, 256, 3, 3]      [3, 256, 5, 5]   
37_aux_convs.Conv2d_conv10_1         [256, 128, 1, 1]      [3, 128, 5, 5]   
38_aux_convs.Conv2d_conv10_2         [128, 256, 3, 3]      [3, 256, 3, 3]   
39_aux_convs.Conv2d_conv11_1         [256, 128, 1, 1]      [3, 128, 3, 3]   
40_aux_convs.Conv2d_conv11_2         [128, 256, 3, 3]      [3, 256, 1, 1]   
41_pred_convs.Conv2d_loc_conv4_3      [512, 16, 3, 3]     [3, 16, 38, 38]   
42_pred_convs.Conv2d_loc_conv7       [1024, 24, 3, 3]     [3, 24, 19, 19]   
43_pred_convs.Conv2d_loc_conv8_2      [512, 24, 3, 3]     [3, 24, 10, 10]   
44_pred_convs.Conv2d_loc_conv9_2      [256, 24, 3, 3]       [3, 24, 5, 5]   
45_pred_convs.Conv2d_loc_conv10_2     [256, 16, 3, 3]       [3, 16, 3, 3]   
46_pred_convs.Conv2d_loc_conv11_2     [256, 16, 3, 3]       [3, 16, 1, 1]   
47_pred_convs.Conv2d_cl_conv4_3       [512, 80, 3, 3]     [3, 80, 38, 38]   
48_pred_convs.Conv2d_cl_conv7       [1024, 120, 3, 3]    [3, 120, 19, 19]   
49_pred_convs.Conv2d_cl_conv8_2      [512, 120, 3, 3]    [3, 120, 10, 10]   
50_pred_convs.Conv2d_cl_conv9_2      [256, 120, 3, 3]      [3, 120, 5, 5]   
51_pred_convs.Conv2d_cl_conv10_2      [256, 80, 3, 3]       [3, 80, 3, 3]   
52_pred_convs.Conv2d_cl_conv11_2      [256, 80, 3, 3]       [3, 80, 1, 1]   

                                      Params     Mult-Adds  
Layer                                                       
0_base.Conv2d_conv1_1                 1.792k       155.52M  
1_base.BatchNorm2d_bn_1_1              128.0          64.0  
2_base.Conv2d_conv1_2                36.928k      3.31776G  
3_base.BatchNorm2d_bn_1_2              128.0          64.0  
4_base.MaxPool2d_pool1                     -             -  
5_base.Conv2d_conv2_1                73.856k      1.65888G  
6_base.BatchNorm2d_bn_2_1              256.0         128.0  
7_base.Conv2d_conv2_2               147.584k      3.31776G  
8_base.BatchNorm2d_bn_2_2              256.0         128.0  
9_base.MaxPool2d_pool2                     -             -  
10_base.Conv2d_conv3_1              295.168k      1.65888G  
11_base.BatchNorm2d_bn_3_1             512.0         256.0  
12_base.Conv2d_conv3_2               590.08k      3.31776G  
13_base.BatchNorm2d_bn_3_2             512.0         256.0  
14_base.Conv2d_conv3_3               590.08k      3.31776G  
15_base.BatchNorm2d_bn_3_3             512.0         256.0  
16_base.MaxPool2d_pool3                    -             -  
17_base.Conv2d_conv4_1              1.18016M  1.703411712G  
18_base.BatchNorm2d_bn_4_1            1.024k         512.0  
19_base.Conv2d_conv4_2             2.359808M  3.406823424G  
20_base.BatchNorm2d_bn_4_2            1.024k         512.0  
21_base.Conv2d_conv4_3             2.359808M  3.406823424G  
22_base.BatchNorm2d_bn_4_3            1.024k         512.0  
23_base.MaxPool2d_pool4                    -             -  
24_base.Conv2d_conv5_1             2.359808M   851.705856M  
25_base.BatchNorm2d_bn_5_1            1.024k         512.0  
26_base.Conv2d_conv5_2             2.359808M   851.705856M  
27_base.BatchNorm2d_bn_5_2            1.024k         512.0  
28_base.Conv2d_conv5_3             2.359808M   851.705856M  
29_base.BatchNorm2d_bn_5_3            1.024k         512.0  
30_base.MaxPool2d_pool5                    -             -  
31_base.Conv2d_conv6               4.719616M  1.703411712G  
32_base.Conv2d_conv7                 1.0496M   378.535936M  
33_aux_convs.Conv2d_conv8_1           262.4k    94.633984M  
34_aux_convs.Conv2d_conv8_2         1.18016M     117.9648M  
35_aux_convs.Conv2d_conv9_1          65.664k       6.5536M  
36_aux_convs.Conv2d_conv9_2         295.168k       7.3728M  
37_aux_convs.Conv2d_conv10_1         32.896k        819.2k  
38_aux_convs.Conv2d_conv10_2        295.168k     2.654208M  
39_aux_convs.Conv2d_conv11_1         32.896k      294.912k  
40_aux_convs.Conv2d_conv11_2        295.168k      294.912k  
41_pred_convs.Conv2d_loc_conv4_3     73.744k   106.463232M  
42_pred_convs.Conv2d_loc_conv7      221.208k    79.847424M  
43_pred_convs.Conv2d_loc_conv8_2    110.616k      11.0592M  
44_pred_convs.Conv2d_loc_conv9_2      55.32k       1.3824M  
45_pred_convs.Conv2d_loc_conv10_2     36.88k      331.776k  
46_pred_convs.Conv2d_loc_conv11_2     36.88k       36.864k  
47_pred_convs.Conv2d_cl_conv4_3      368.72k    532.31616M  
48_pred_convs.Conv2d_cl_conv7       1.10604M    399.23712M  
49_pred_convs.Conv2d_cl_conv8_2      553.08k       55.296M  
50_pred_convs.Conv2d_cl_conv9_2       276.6k        6.912M  
51_pred_convs.Conv2d_cl_conv10_2      184.4k      1.65888M  
52_pred_convs.Conv2d_cl_conv11_2      184.4k       184.32k  
--------------------------------------------------------------------------------------------------
                             Totals
Total params              26.15976M
Trainable params          26.15976M
Non-trainable params            0.0
Mult-Adds             31.323761792G
==================================================================================================
Out[0]:
Kernel Shape Output Shape Params Mult-Adds
Layer
0_base.Conv2d_conv1_1 [3, 64, 3, 3] [3, 64, 300, 300] 1792.0 1.555200e+08
1_base.BatchNorm2d_bn_1_1 [64] [3, 64, 300, 300] 128.0 6.400000e+01
2_base.Conv2d_conv1_2 [64, 64, 3, 3] [3, 64, 300, 300] 36928.0 3.317760e+09
3_base.BatchNorm2d_bn_1_2 [64] [3, 64, 300, 300] 128.0 6.400000e+01
4_base.MaxPool2d_pool1 - [3, 64, 150, 150] NaN NaN
5_base.Conv2d_conv2_1 [64, 128, 3, 3] [3, 128, 150, 150] 73856.0 1.658880e+09
6_base.BatchNorm2d_bn_2_1 [128] [3, 128, 150, 150] 256.0 1.280000e+02
7_base.Conv2d_conv2_2 [128, 128, 3, 3] [3, 128, 150, 150] 147584.0 3.317760e+09
8_base.BatchNorm2d_bn_2_2 [128] [3, 128, 150, 150] 256.0 1.280000e+02
9_base.MaxPool2d_pool2 - [3, 128, 75, 75] NaN NaN
10_base.Conv2d_conv3_1 [128, 256, 3, 3] [3, 256, 75, 75] 295168.0 1.658880e+09
11_base.BatchNorm2d_bn_3_1 [256] [3, 256, 75, 75] 512.0 2.560000e+02
12_base.Conv2d_conv3_2 [256, 256, 3, 3] [3, 256, 75, 75] 590080.0 3.317760e+09
13_base.BatchNorm2d_bn_3_2 [256] [3, 256, 75, 75] 512.0 2.560000e+02
14_base.Conv2d_conv3_3 [256, 256, 3, 3] [3, 256, 75, 75] 590080.0 3.317760e+09
15_base.BatchNorm2d_bn_3_3 [256] [3, 256, 75, 75] 512.0 2.560000e+02
16_base.MaxPool2d_pool3 - [3, 256, 38, 38] NaN NaN
17_base.Conv2d_conv4_1 [256, 512, 3, 3] [3, 512, 38, 38] 1180160.0 1.703412e+09
18_base.BatchNorm2d_bn_4_1 [512] [3, 512, 38, 38] 1024.0 5.120000e+02
19_base.Conv2d_conv4_2 [512, 512, 3, 3] [3, 512, 38, 38] 2359808.0 3.406823e+09
20_base.BatchNorm2d_bn_4_2 [512] [3, 512, 38, 38] 1024.0 5.120000e+02
21_base.Conv2d_conv4_3 [512, 512, 3, 3] [3, 512, 38, 38] 2359808.0 3.406823e+09
22_base.BatchNorm2d_bn_4_3 [512] [3, 512, 38, 38] 1024.0 5.120000e+02
23_base.MaxPool2d_pool4 - [3, 512, 19, 19] NaN NaN
24_base.Conv2d_conv5_1 [512, 512, 3, 3] [3, 512, 19, 19] 2359808.0 8.517059e+08
25_base.BatchNorm2d_bn_5_1 [512] [3, 512, 19, 19] 1024.0 5.120000e+02
26_base.Conv2d_conv5_2 [512, 512, 3, 3] [3, 512, 19, 19] 2359808.0 8.517059e+08
27_base.BatchNorm2d_bn_5_2 [512] [3, 512, 19, 19] 1024.0 5.120000e+02
28_base.Conv2d_conv5_3 [512, 512, 3, 3] [3, 512, 19, 19] 2359808.0 8.517059e+08
29_base.BatchNorm2d_bn_5_3 [512] [3, 512, 19, 19] 1024.0 5.120000e+02
30_base.MaxPool2d_pool5 - [3, 512, 19, 19] NaN NaN
31_base.Conv2d_conv6 [512, 1024, 3, 3] [3, 1024, 19, 19] 4719616.0 1.703412e+09
32_base.Conv2d_conv7 [1024, 1024, 1, 1] [3, 1024, 19, 19] 1049600.0 3.785359e+08
33_aux_convs.Conv2d_conv8_1 [1024, 256, 1, 1] [3, 256, 19, 19] 262400.0 9.463398e+07
34_aux_convs.Conv2d_conv8_2 [256, 512, 3, 3] [3, 512, 10, 10] 1180160.0 1.179648e+08
35_aux_convs.Conv2d_conv9_1 [512, 128, 1, 1] [3, 128, 10, 10] 65664.0 6.553600e+06
36_aux_convs.Conv2d_conv9_2 [128, 256, 3, 3] [3, 256, 5, 5] 295168.0 7.372800e+06
37_aux_convs.Conv2d_conv10_1 [256, 128, 1, 1] [3, 128, 5, 5] 32896.0 8.192000e+05
38_aux_convs.Conv2d_conv10_2 [128, 256, 3, 3] [3, 256, 3, 3] 295168.0 2.654208e+06
39_aux_convs.Conv2d_conv11_1 [256, 128, 1, 1] [3, 128, 3, 3] 32896.0 2.949120e+05
40_aux_convs.Conv2d_conv11_2 [128, 256, 3, 3] [3, 256, 1, 1] 295168.0 2.949120e+05
41_pred_convs.Conv2d_loc_conv4_3 [512, 16, 3, 3] [3, 16, 38, 38] 73744.0 1.064632e+08
42_pred_convs.Conv2d_loc_conv7 [1024, 24, 3, 3] [3, 24, 19, 19] 221208.0 7.984742e+07
43_pred_convs.Conv2d_loc_conv8_2 [512, 24, 3, 3] [3, 24, 10, 10] 110616.0 1.105920e+07
44_pred_convs.Conv2d_loc_conv9_2 [256, 24, 3, 3] [3, 24, 5, 5] 55320.0 1.382400e+06
45_pred_convs.Conv2d_loc_conv10_2 [256, 16, 3, 3] [3, 16, 3, 3] 36880.0 3.317760e+05
46_pred_convs.Conv2d_loc_conv11_2 [256, 16, 3, 3] [3, 16, 1, 1] 36880.0 3.686400e+04
47_pred_convs.Conv2d_cl_conv4_3 [512, 80, 3, 3] [3, 80, 38, 38] 368720.0 5.323162e+08
48_pred_convs.Conv2d_cl_conv7 [1024, 120, 3, 3] [3, 120, 19, 19] 1106040.0 3.992371e+08
49_pred_convs.Conv2d_cl_conv8_2 [512, 120, 3, 3] [3, 120, 10, 10] 553080.0 5.529600e+07
50_pred_convs.Conv2d_cl_conv9_2 [256, 120, 3, 3] [3, 120, 5, 5] 276600.0 6.912000e+06
51_pred_convs.Conv2d_cl_conv10_2 [256, 80, 3, 3] [3, 80, 3, 3] 184400.0 1.658880e+06
52_pred_convs.Conv2d_cl_conv11_2 [256, 80, 3, 3] [3, 80, 1, 1] 184400.0 1.843200e+05

Final note: normally, if we use architectures directly from TorchVision or Keras, we get a nice model summary just like this.
This library is particularly useful when we want to inspect someone else's model, or a version that we have modified based on a commonly used model, like the example above.
In addition, we get a nice overview of the number of parameters and the output dimensions of each layer, which is handy for debugging your own model or simply for reference.
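
The same one-liner works on stock torchvision models as well; a minimal sketch (resnet18 is just an arbitrary example here, output omitted):

In [0]:
resnet = models.resnet18()                    # any off-the-shelf torchvision model
summary(resnet, torch.zeros(1, 3, 224, 224))  # dummy batch of one 224x224 RGB image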

Trick #2 (important)

PyTorch Hooks

A PyTorch hook is a function that we can register on any tensor or nn.Module during our computation, so that we can monitor what is going on in the forward and backward passes.
Here "forward" does not refer to nn.Module.forward but to the torch.autograd.Function object that is the grad_fn of a tensor.
Notice that an nn.Module like nn.Linear can have multiple forward invocations: its output is created by two operations, $Y = W*X+B$ (multiplication and addition), so there will be two forward calls.

Hook types

  1. The Forward Hook
  2. The Backward Hook

A forward hook is executed during the forward pass, while the backward hook is executed when the backward function is called; both are functions of the autograd.Function object.

A hook in PyTorch is basically a function with a very specific signature. When we say a hook is executed, in reality we are talking about this function being executed.
For a tensor hook, grad is basically the value contained in the grad attribute of the tensor after backward is called. The function is not supposed to modify its argument. It must either return None or a Tensor which will be used in place of grad for further gradient computations.
The below example clarifies this point:

In [0]:
import torch
a = torch.ones(10)
a.requires_grad
Out[0]:
False
In [0]:
a.requires_grad = True
a.requires_grad
Out[0]:
True
In [0]:
b = 2*a
b.requires_grad
Out[0]:
True
In [0]:
print(a.is_leaf)
print(b.is_leaf)
True
False

Since b is not a leaf Variable, its grad will by default be freed during the backward computation.
We can use b.retain_grad() to ask PyTorch to retain its grad.

In [0]:
b.retain_grad()
In [0]:
c = b.mean()
print(f"requires_grad: {c.requires_grad}")
print(f"is_lead: {c.is_leaf}")
requires_grad: True
is_leaf: False
In [0]:
# pretend c is the loss being computed
c.backward()
print(a.grad, b.grad)
tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000,
        0.2000]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])

Now we redo the experiment, but with a hook registered on b that prints its gradient when backward is called.

In [0]:
a = torch.ones(10)
a.requires_grad = True
b = 2*a
b.retain_grad()
b.register_hook(lambda x:print(x))
b.mean().backward() # pretend the mean of b is the loss we want to back-prop
tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])

Here we can see that the printout triggered by the hook on b is exactly b's gradient; the lambda automatically receives b's grad as its input.
This gives us a sense of what the hook is tracking.

In [0]:
print(a.grad, b.grad)
tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000,
        0.2000]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])

There are several uses for this functionality:

  1. We can print the value of the gradient for debugging. We can also log it. This is especially useful with non-leaf variables, whose gradients are freed up unless we call retain_grad upon them; doing the latter leads to increased memory retention. Hooks provide a much cleaner way to aggregate these values.
  2. We can modify gradients during the backward pass. This is very important. While we can still access the grad attribute of a tensor in a network, we can only access it after the entire backward pass has finished. For example, if we registered a hook that multiplied b's gradient by 2, the subsequent gradient calculations, such as those of a (or any tensor whose gradient depends on b), would use 2*grad(b) instead of grad(b) (see the sketch right after this list). In contrast, had we individually updated the parameters after the backward pass, we'd have to multiply b.grad as well as a.grad manually.
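
Here is a minimal sketch of that hook-based approach (re-creating the same tensors as above); with a hook that doubles b's gradient, a's gradient comes out already doubled:

In [0]:
a = torch.ones(10)
a.requires_grad = True
b = 2*a
b.register_hook(lambda grad: grad * 2)  # scale b's gradient before it propagates further
b.mean().backward()
print(a.grad)  # we would expect a tensor of 0.4s here instead of the 0.2s seen earlier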
In [0]:
# to demonstrate the manual alternative (without hooks)
a = torch.ones(10)
a.requires_grad = True
b = 2*a
b.retain_grad()
b.mean().backward()

print(a.grad, b.grad)
tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000,
        0.2000]) tensor([0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
        0.1000])
In [0]:
b.grad *= 2
print(a.grad, b.grad) # Note that in this case, a's grad needs to be updated mannually
tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000,
        0.2000]) tensor([0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.2000,
        0.2000])

Hooks for nn.Module objects

For backward hook: hook(module, grad_input, grad_output)


For forward hook: hook(module, input, output)


In [0]:
import torch.nn as nn
class myNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3,10,2, stride=2) # (8-2+0)/2+1 = 4
        self.relu = nn.ReLU()
        self.flatten = lambda x: x.view(-1)
        self.fc1  = nn.Linear(160,5)

    def forward(self, x):
        x = self.relu(self.conv(x))
        return self.fc1(self.flatten(x))
In [0]:
Net = myNet()
summary(Net,torch.zeros(1,3,8,8))
=======================================================
         Kernel Shape   Output Shape Params Mult-Adds
Layer                                                
0_conv  [3, 10, 2, 2]  [1, 10, 4, 4]  130.0     1.92k
1_relu              -  [1, 10, 4, 4]      -         -
2_fc1        [160, 5]            [5]  805.0     800.0
-------------------------------------------------------
                      Totals
Total params           935.0
Trainable params       935.0
Non-trainable params     0.0
Mult-Adds              2.72k
=======================================================
Out[0]:
Kernel Shape Output Shape Params Mult-Adds
Layer
0_conv [3, 10, 2, 2] [1, 10, 4, 4] 130.0 1920.0
1_relu - [1, 10, 4, 4] NaN NaN
2_fc1 [160, 5] [5] 805.0 800.0
In [0]:
def hook_fn(m,i,o):
    print(m)
    print("---------Input Grad----------")

    for grad in i:
        try:
            print(grad.shape)
        except AttributeError:
            print("None found for input Gradient")
    
    print("--------Output Grad----------")
    for grad in o:
        try:
            print(grad.shape)
        except AttributeError:
            print("None found for output Gradient")
    print("\n")
In [0]:
Net.named_modules
Out[0]:
<bound method Module.named_modules of myNet(
  (conv): Conv2d(3, 10, kernel_size=(2, 2), stride=(2, 2))
  (relu): ReLU()
  (fc1): Linear(in_features=160, out_features=5, bias=True)
)>
In [0]:
Net.conv.register_backward_hook(hook_fn)
Out[0]:
<torch.utils.hooks.RemovableHandle at 0x7fc70953ce80>
In [0]:
Net.fc1.register_backward_hook(hook_fn)
Out[0]:
<torch.utils.hooks.RemovableHandle at 0x7fc709536b00>
In [0]:
inp = torch.rand(1,3,8,8)
out = Net(inp)
out
Out[0]:
tensor([ 0.1495, -0.0683,  0.1981,  0.0851, -0.0905], grad_fn=<AddBackward0>)
In [0]:
# pretend we have the following as loss
(1-out.mean()).backward()
Linear(in_features=160, out_features=5, bias=True)
---------Input Grad----------
torch.Size([5])
torch.Size([5])
--------Output Grad----------
torch.Size([5])


Conv2d(3, 10, kernel_size=(2, 2), stride=(2, 2))
---------Input Grad----------
None found for input Gradient
torch.Size([10, 3, 2, 2])
torch.Size([10])
--------Output Grad----------
torch.Size([1, 10, 4, 4])


Note that the Linear layer's hook fires first, because the backward pass actually goes through it first and only then backprops to the conv layer.

Proper way of implementing hooks (in back-prop)

We have:

  1. torch.autograd.Variable.register_hook (Python method, in Automatic differentiation package)
  2. torch.nn.Module.register_backward_hook (Python method, in torch.nn)
  3. torch.nn.Module.register_forward_hook

The first, register_hook, is for any Variable (tensor). It is essentially a callback function that is executed every time a gradient with respect to that tensor is computed by autograd.
Module.register_backward_hook and Module.register_forward_hook, on the other hand, are for nn.Module objects, and their hook_fn should take three arguments:
def hook_fn(m, i, o), where m is the module, i refers to the input and o refers to the output.
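
A forward hook fires during the forward pass and receives the module, its input and its output. A minimal sketch (assuming the Net instance defined above) that records each layer's output shape:

In [0]:
activations = {}

def fwd_hook(m, i, o):
    # store the output shape of whichever module just ran forward
    activations[m.__class__.__name__] = o.shape

handles = [Net.conv.register_forward_hook(fwd_hook),
           Net.fc1.register_forward_hook(fwd_hook)]

_ = Net(torch.rand(1, 3, 8, 8))
print(activations)   # output shapes of the Conv2d and Linear layers

for h in handles:    # a hook can always be removed via the handle it returned
    h.remove()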

An example

Using the named_parameters function, we can accomplish gradient modification/clipping.
The following example does two things:

  1. Turns the gradients of the linear layer's biases to zero during back-prop (no updates for the biases)
  2. Makes sure that no gradient flowing into the conv layer is less than 0 (all positive)
In [0]:
class myNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3,10,2,stride=2)
        self.relu = nn.ReLU()
        self.flatten = lambda x: x.view(-1)
        self.fc1  = nn.Linear(160,5)
    def forward(self,x):
        x = self.relu(self.conv(x))
        x.register_hook(lambda grad: torch.clamp(grad, min=0)) # clamp back-prop gradients to a minimum value of 0

        # print whether there is any negative grad
        x.register_hook(lambda grad: print("Gradients less than zero:", bool((grad<0).any())))
        
        return self.fc1(self.flatten(x))
In [0]:
net = myNet()
In [0]:
for name, param in net.named_parameters():
    print(name)
conv.weight
conv.bias
fc1.weight
fc1.bias
In [0]:
for name, param in net.named_parameters():
    if 'fc' in name and 'bias' in name:
        print(name, param, sep='\n')
fc1.bias
Parameter containing:
tensor([-0.0190, -0.0193, -0.0728,  0.0082,  0.0160], requires_grad=True)
In [0]:
for name, param in net.named_parameters():
    if 'fc' in name and 'bias' in name:
        # assign zero to bias grad with identical dimensions
        param.register_hook(lambda grad: torch.zeros_like(grad))
In [0]:
out = net(torch.randn(1,3,8,8))
In [0]:
(1-out).mean().backward()
Gradients less than zero: False
In [0]:
print(f'the bias for linear layer is: {net.fc1.bias.grad}')
the bias for linear layer is: tensor([0., 0., 0., 0., 0.])

Trick #3 (important)

pack_padded_sequence & pad_packed_sequence often used together dynamic RNNs

In [0]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
In [0]:
# Create a tensor with variable length sequences and pads (25)
seqs = torch.LongTensor([[0, 1, 2, 3, 25, 25, 25],
                         [4, 5, 25, 25, 25, 25, 25],
                         [6, 7, 8, 9, 10, 11, 25]])
In [0]:
# Store lengths of the actual sequences, ignoring padding(25)
# These are the points up to which we want the RNN to process the sequence
seq_lens = torch.LongTensor([4,3,6]) # number of non-trivial elements in each row
In [0]:
seq_lens, sort_ind = seq_lens.sort(dim=0, descending=True)
seq_lens, sort_ind
Out[0]:
(tensor([6, 4, 3]), tensor([2, 0, 1]))
In [0]:
seqs = seqs[sort_ind]
seqs
Out[0]:
tensor([[ 6,  7,  8,  9, 10, 11, 25],
        [ 0,  1,  2,  3, 25, 25, 25],
        [ 4,  5, 25, 25, 25, 25, 25]])
In [0]:
# Create an embedding layer, with 0 vectors for the pads
embeds = nn.Embedding(num_embeddings=26,
                      embedding_dim=10,
                      padding_idx=25)
In [0]:
lstm = nn.LSTM(10, 50, bidirectional=False, batch_first=True)
In [0]:
# WITHOUT dynamic batching
embeddings = embeds(seqs)
print(embeddings.size())
out_static, _ = lstm(embeddings)
out_static.size()
torch.Size([3, 7, 10])
Out[0]:
torch.Size([3, 7, 50])
In [0]:
# The number of timesteps in the output will be the same as the total padded timesteps in the input,
# since the LSTM computed over the pads
assert out_static.size(1) == embeddings.size(1)
In [0]:
# Look at the output at a timestep that we know is a pad
print(out_static[1,-1])
tensor([ 0.0821, -0.0592, -0.0492,  0.0023, -0.0777, -0.0576, -0.0531,  0.1315,
         0.0021,  0.0473,  0.0195,  0.0764, -0.1129, -0.0334,  0.0724,  0.1498,
        -0.0529, -0.0625, -0.0379,  0.0425,  0.0015,  0.1318,  0.0448, -0.0354,
         0.1645, -0.0835,  0.0134,  0.0614,  0.0697,  0.0223,  0.0131,  0.0646,
        -0.0725, -0.0345,  0.0158, -0.1179,  0.0900,  0.0378, -0.1458,  0.0356,
         0.0207, -0.0591, -0.0921, -0.0226,  0.0078,  0.0157,  0.0773,  0.0889,
        -0.0189,  0.0382], grad_fn=<SelectBackward>)

Now let's try the same process with Dynamic Batching

In [0]:
# Pack the sequence
packed_seqs = pack_padded_sequence(embeddings, seq_lens.tolist(), batch_first=True)
print(f'the values in the seq_lens: {seq_lens.tolist()}, with the effective sum of {sum(seq_lens.tolist())}')
embeddings.shape,packed_seqs.data.size()
the values in the seq_lens: [6, 4, 3], with the effective sum of 13
Out[0]:
(torch.Size([3, 7, 10]), torch.Size([13, 10]))
In [0]:
out_dynamic, _ = lstm(packed_seqs)
out_dynamic.data.size()
Out[0]:
torch.Size([13, 50])
In [0]:
out_dynamic, lens = pad_packed_sequence(out_dynamic, batch_first=True)
out_dynamic.size(), lens
Out[0]:
(torch.Size([3, 6, 50]), tensor([6, 4, 3]))

Note that here, out_dynamic is padded to shape [3, 6, 50] instead of [3, 7, 50], because the extra all-pad timestep can be discarded from every row to make the output even more compact.
In short, 6 is the longest actual sequence length in the batch.

In [0]:
assert out_dynamic.size(1) != embeddings.size(1)
print(out_dynamic.shape)
torch.Size([3, 6, 50])
In [0]:
# Look at the output at a timestep that we know is a pad
print(out_dynamic[1, -1])
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.], grad_fn=<SelectBackward>)

Final note:

  • pack_padded_sequence removes the pads, flattens the sequences by timestep, and keeps track of the effective batch size at each timestep (see the sketch after this list)
  • The RNN then computes only on the effective batch size "b_t" at each timestep, saving the computation that would otherwise be spent on pads
  • This is why we sort: so that the top "b_t" rows at timestep "t" are aligned with the top "b_t" outputs from timestep "t-1"
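
Those effective batch sizes are stored on the packed object itself; a quick sketch, assuming packed_seqs from the cells above (lengths [6, 4, 3]):

In [0]:
# one entry per timestep: how many sequences are still "alive" at that step
print(packed_seqs.batch_sizes)  # expected: tensor([3, 3, 3, 2, 1, 1]), which sums to the 13 effective timesteps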

Trick #4

Torchviz to visualize PyTorch execution graphs and traces

In [0]:
!pip install torchviz
Collecting torchviz
  Downloading https://files.pythonhosted.org/packages/8f/8e/a9630c7786b846d08b47714dd363a051f5e37b4ea0e534460d8cdfc1644b/torchviz-0.0.1.tar.gz (41kB)
     |████████████████████████████████| 51kB 5.4MB/s 
Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchviz) (1.3.1+cu100)
Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from torchviz) (0.10.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch->torchviz) (1.17.3)
Building wheels for collected packages: torchviz
  Building wheel for torchviz (setup.py) ... done
  Created wheel for torchviz: filename=torchviz-0.0.1-cp36-none-any.whl size=3520 sha256=0affc6da8f5f01332395a52be0b79c445c33958b783e4ccfcd50fedb29b878bc
  Stored in directory: /root/.cache/pip/wheels/2a/c2/c5/b8b4d0f7992c735f6db5bfa3c5f354cf36502037ca2b585667
Successfully built torchviz
Installing collected packages: torchviz
Successfully installed torchviz-0.0.1

Let's start with a basic example (a small MLP model)

In [0]:
import torch
from torch import nn
from torchviz import make_dot, make_dot_from_trace
In [0]:
model = nn.Sequential()
model.add_module('W0', nn.Linear(8,16))
model.add_module('tanh', nn.Tanh())
model.add_module('W1', nn.Linear(16,1))

inp = torch.randn(1,8)

make_dot(model(inp), params = dict(model.named_parameters()))
Out[0]:
[torchviz graph: AddmmBackward (W1.weight (1, 16), W1.bias (1)) ← TanhBackward ← AddmmBackward (W0.weight (16, 8), W0.bias (16)), i.e. the W0 Linear, the Tanh, and the W1 Linear of the MLP]

make_dot builds a directed graph of the PyTorch operations recorded during forward propagation, showing which operations will be called on backward.
It omits subgraphs which do not require gradients.

Visualize AlexNet

In [0]:
from torchvision.models import AlexNet

model = AlexNet()
In [0]:
x = torch.randn(1,3,227,227).requires_grad_(True)
y = model(x)
make_dot(y, params = dict(list(model.named_parameters()) + [('x',x)]))
Out[0]:
[torchviz graph of AlexNet: the chain of MkldnnConvolutionBackward / ReluBackward / MaxPool2DWithIndicesBackward nodes for features.0 through features.10, an AdaptiveAvgPool2D node, and three AddmmBackward nodes for classifier.1, classifier.4 and classifier.6, rooted at the input x (1, 3, 227, 227) and the weight/bias leaves]
In [0]:
import torch
import torchvision.models as models
from Utils import *

# Create SSD300 with pretrained weights in the base-architecture
n_classes = 20
model = SSD300(n_classes)
Loaded base model with pre-trained weights

In [0]:
x = torch.randn(1,3,300,300)
y = model(x)
dot = make_dot(y, params = dict(list(model.named_parameters())))
In [0]:
dot.render('VGG300_BN.gv', view=True)  
Out[0]:
'VGG300_BN.gv.pdf'
In [0]:
from google.colab import drive
drive.mount('/content/drive')
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive

Trick #5

This is a truly awesome repo full of practical tutorials that implement various state-of-the-art deep learning techniques using PyTorch, including:

  1. NLP & Speech Processing
  2. Computer Vision
  3. Probabilistic/Generative Libraries
  4. Other libraries
  5. Paper implementations

Basically a good place to look into when starting a new project to check for relevant implementation techniques.
Since deep learning is such a fast-developing field, if it weren't for the fact that this repo stopped getting updated two years ago, it would be #1 on this list.

Trick #6

AdaBound optimizer
Finally, AdaBound is available in PyTorch. It is one of the most powerful optimizers, outperforming Adam in some cases with a super fast convergence rate. Definitely something you would want to try out when prototyping quickly.
The method is based on "Adaptive Gradient Methods with Dynamic Bound of Learning Rate", in Proc. of ICLR 2019.

In [0]:
## implementation (requires `pip install adabound`)
import adabound

optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)

As described in the paper, AdaBound is an optimizer that behaves like Adam at the beginning of training and gradually transforms into SGD at the end. In this way it combines the benefits of adaptive methods, namely fast initial progress, with the good final generalization properties of SGD.
The final_lr parameter indicates the learning rate of the SGD that AdaBound transforms into. In common cases, the default final learning rate of 0.1 achieves relatively good and stable results on unseen data.
The method is not very sensitive to its hyperparameters; see Appendix G of the paper for more details.
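
AdaBound drops into the usual PyTorch training loop like any other optimizer; a minimal sketch, assuming model, train_loader, criterion and num_epochs are already defined:

In [0]:
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()                     # clear gradients from the previous step
        loss = criterion(model(inputs), targets)
        loss.backward()                           # back-prop
        optimizer.step()                          # AdaBound update: Adam-like early on, SGD-like later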


Trick #7

Flatten layer in PyTorch

In [0]:
import torch.nn as nn
class Flatten(nn.Module):
    def __init__(self):
        super(Flatten, self).__init__()
    
    def forward(self, x):
        return x.view(x.size(0), -1)
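
A quick sanity check of how this layer might be used inside a Sequential head (the shapes here are just an illustrative assumption):

In [0]:
import torch
net = nn.Sequential(nn.Conv2d(3, 8, 3),
                    nn.ReLU(),
                    nn.AdaptiveAvgPool2d(1),  # -> [batch, 8, 1, 1]
                    Flatten(),                # -> [batch, 8]
                    nn.Linear(8, 2))
print(net(torch.randn(4, 3, 32, 32)).shape)   # expected: torch.Size([4, 2])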

Trick #8

Expand_as in PyTorch for broadcasting

In [0]:
import torch
import torch.nn as nn
In [0]:
a = torch.tensor([1,2,3])
b = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
In [0]:
c = a.expand_as(b)
c
Out[0]:
tensor([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])
In [0]:
d = a+b
d # here a will be broadcasted before compute addition with b
Out[0]:
tensor([[ 2,  4,  6],
        [ 5,  7,  9],
        [ 8, 10, 12]])

Trick #9

FastAI listify

In [0]:
x = [1,2,3]
y = torch.arange(12)
In [0]:
x
Out[0]:
[1, 2, 3]
In [0]:
y
Out[0]:
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
In [0]:
from fastai.train import listify
z = listify(x)
z
Out[0]:
[1, 2, 3]
In [0]:
z = listify(1,x)
z
Out[0]:
[1, 1, 1]
In [0]:
a = listify(1,y)
a
Out[0]:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
In [0]:
b = listify('good',x)
b
Out[0]:
['good', 'good', 'good']

Trick #10

In-place operations

In [0]:
# an example of NOT in-place
a = torch.randn(1)
b = torch.randn(1)
id(a)
Out[0]:
139797812240960
In [0]:
id(b)
Out[0]:
139797812224360
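In [0]:
# To finish the not-in-place example (a small sketch; the new id will of course differ from run to run):
a = a + b
id(a)   # a now points to a newly allocated tensor, so this id differs from the one printed above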
In [0]:
# an example of in-place
c = torch.randn(1)
d = torch.randn(1)
id(c)
Out[0]:
139797812367792
In [0]:
id(d)
Out[0]:
139797812492616
In [0]:
c += d
id(c)   # not changed because in-place
Out[0]:
139797812367792
In [0]:
# another example of in-place
e = torch.randn(1)
f = torch.randn(1)
id(e)
Out[0]:
139797812423848
In [0]:
id(f)
Out[0]:
139797812422336
In [0]:
e.add_(f)
id(e)       # this case, in-place
Out[0]:
139797812423848

In PyTorch, a trailing underscore (_) on a method name means in-place.
The variable is modified and stored at the same memory location, without allocating new storage for the result.

Trick #11

AdaptiveConcatPool2d

In [0]:
import torch.nn as nn
# nn.AdaptiveAvgPool2d??
In [0]:
class AdaptiveConcatPool2d(nn.Module):
    def __init__(self, sz=1):
        super().__init__()
        self.output_size = sz  # target output spatial size
        self.ap = nn.AdaptiveAvgPool2d(sz)
        self.mp = nn.AdaptiveMaxPool2d(sz)
    def forward(self, x):
        return torch.cat([self.ap(x), self.mp(x)],dim=1)
In [0]:
x = torch.tensor([
                  [
                   [1.,2.,3.],
                   [1.,2.,3.],
                   [1.,2.,4.]
                  ],
                  [
                   [1.,2.,3.],
                   [1.,2.,3.],
                   [1.,2.,5.]
                  ],
                  [
                   [1.,2.,3.],
                   [1.,2.,3.],
                   [1.,2.,3.]
                  ]
])
x.shape
Out[0]:
torch.Size([3, 3, 3])
In [0]:
A = nn.AdaptiveAvgPool2d(1) # specify the output size
print(A(x).shape)
A(x)
torch.Size([3, 1, 1])
Out[0]:
tensor([[[2.3333]],

        [[2.2222]],

        [[2.0000]]])
In [0]:
M = nn.AdaptiveMaxPool2d(1)
M(x)
Out[0]:
tensor([[[4.]],

        [[5.]],

        [[3.]]])
In [0]:
A = nn.AdaptiveAvgPool2d((1,3))
print(A(x).shape)
A(x)
torch.Size([3, 1, 3])
Out[0]:
tensor([[[1.0000, 2.0000, 3.3333]],

        [[1.0000, 2.0000, 3.6667]],

        [[1.0000, 2.0000, 3.0000]]])
In [0]:
C = AdaptiveConcatPool2d(1)
C(x)
Out[0]:
tensor([[[2.1111],
         [4.0000]],

        [[2.2222],
         [5.0000]],

        [[2.0000],
         [3.0000]]])
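
AdaptiveConcatPool2d is typically used as the pooling layer in a classification head (as in fastai-style models), doubling the channel dimension by concatenating the average- and max-pooled features; a small sketch with an assumed 4-D feature map:

In [0]:
feats = torch.randn(2, 64, 7, 7)   # pretend this is a CNN feature map (batch, channels, H, W)
pool = AdaptiveConcatPool2d(1)
print(pool(feats).shape)           # expected: torch.Size([2, 128, 1, 1]) -- channels doubled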

Trick #12

logsumexp

In [0]:
a = torch.zeros(1,3)
a
Out[0]:
tensor([[0., 0., 0.]])
In [0]:
b = torch.logsumexp(input=a,dim=1,keepdim=False)
b
Out[0]:
tensor([1.0986])
In [0]:
zero = torch.tensor([0],dtype=torch.float)
torch.log(torch.exp(zero)+torch.exp(zero)+torch.exp(zero))
Out[0]:
tensor([1.0986])
In [0]:
c = torch.ones(1,3)
c
Out[0]:
tensor([[1., 1., 1.]])
In [0]:
d = torch.logsumexp(c,dim=1)
d
Out[0]:
tensor([2.0986])
In [0]:
one = torch.tensor([1], dtype=torch.float)
torch.log(torch.exp(one)+torch.exp(one)+torch.exp(one))
Out[0]:
tensor([2.0986])
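
The reason to prefer torch.logsumexp over the manual log(sum(exp(...))) is numerical stability: with large inputs the naive version overflows while logsumexp does not. A quick sketch:

In [0]:
big = torch.tensor([[1000., 1000., 1000.]])
print(torch.log(torch.exp(big).sum(dim=1)))  # naive version: exp overflows -> tensor([inf])
print(torch.logsumexp(big, dim=1))           # stable: expected tensor([1001.0986]) = 1000 + log(3)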

Trick #13

Named_children

In [0]:
import torch
import torch.nn as nn
In [0]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d( 1, 10, 3)
        self.conv2 = nn.Conv2d(10, 20, 3)
        self.conv2_dropout = nn.Dropout2d(p=0.5)
        self.fc1   = nn.Linear(320, 50)
        self.fc2   = nn.Linear(50, 10)
    
    def forward(self, x):
        pass
In [0]:
x = Net()
In [0]:
for name, module in x.named_children():
    print(f"layer {name} is: {module}")
layer conv1 is: Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1))
layer conv2 is: Conv2d(10, 20, kernel_size=(3, 3), stride=(1, 1))
layer conv2_dropout is: Dropout2d(p=0.5, inplace=False)
layer fc1 is: Linear(in_features=320, out_features=50, bias=True)
layer fc2 is: Linear(in_features=50, out_features=10, bias=True)

Trick #14

torch.addcmul()

In [0]:
x = torch.ones(1,3)
y = torch.ones(3,1)
z = torch.ones(1,1)*2
x, y, z
Out[0]:
(tensor([[1., 1., 1.]]), tensor([[1.],
         [1.],
         [1.]]), tensor([[2.]]))
In [0]:
# torch.addcmul(input, value, tensor1, tensor2) computes input + value * tensor1 * tensor2
a = torch.addcmul(z, 0.5, x, y) # z + 0.5*x*y
a
Out[0]:
tensor([[2.5000, 2.5000, 2.5000],
        [2.5000, 2.5000, 2.5000],
        [2.5000, 2.5000, 2.5000]])
In [0]:
x,y
Out[0]:
(tensor([[1., 1., 1.]]), tensor([[1.],
         [1.],
         [1.]]))
In [0]:
x*y
Out[0]:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
In [0]:
z + 0.5*x*y
Out[0]:
tensor([[2.5000, 2.5000, 2.5000],
        [2.5000, 2.5000, 2.5000],
        [2.5000, 2.5000, 2.5000]])

Trick #15

torch.Tensor.permute is used to re-arrange the dimensions of a given tensor

In [0]:
x = torch.randn(3,4)
x
Out[0]:
tensor([[ 0.2616,  0.5046, -0.1475,  0.4465],
        [-0.1938,  0.4212, -0.4747,  0.7031],
        [ 1.9922,  2.0265, -0.9237, -1.6005]])
In [0]:
y = x.permute(1,0)
y
Out[0]:
tensor([[ 0.2616, -0.1938,  1.9922],
        [ 0.5046,  0.4212,  2.0265],
        [-0.1475, -0.4747, -0.9237],
        [ 0.4465,  0.7031, -1.6005]])
In [0]:
a = torch.randn(3,4,5,6,7,8)
a.shape
Out[0]:
torch.Size([3, 4, 5, 6, 7, 8])
In [0]:
b = a.permute(2,1,0,4,3,5)
b.size()
Out[0]:
torch.Size([5, 4, 3, 7, 6, 8])

Trick #16

Creating a concise four-layer CNN

In [0]:
def conv_block(in_channels, out_channels):
    return nn.Sequential(nn.Conv2d(in_channels, out_channels, 2),
                         nn.BatchNorm2d(out_channels),
                         nn.ReLU(),
                         nn.MaxPool2d(kernel_size=2))
In [0]:
class ConvNet(nn.Module):
    def __init__(self, x_dim=3, hid_dim=64, z_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(
            conv_block(x_dim,   hid_dim),
            conv_block(hid_dim, hid_dim),
            conv_block(hid_dim, hid_dim),
            conv_block(hid_dim, z_dim)
        )
    def forward(self, x):
        x = self.encoder(x)
        x = nn.MaxPool2d(5)(x)
        x = x.view(x.size(0), -1)  # flatten while only retaining the batch_size dimension
        return x
In [0]:
net = ConvNet()
net
Out[0]:
ConvNet(
  (encoder): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 64, kernel_size=(2, 2), stride=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (3): Sequential(
      (0): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
  )
)
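
A quick shape check of the encoder; the input size below is just an assumption, chosen so that the final 5x5 max-pool in forward still has a 5x5 feature map to work on:

In [0]:
out = net(torch.randn(2, 3, 100, 100))
print(out.shape)   # expected: torch.Size([2, 64]) -- one 64-d embedding per image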

Trick #17

The mechanism behind nn.Dropout()

In [0]:
y = torch.ones(3,3)
y
Out[0]:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
In [0]:
D = nn.Dropout(0)
D(y)
Out[0]:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
In [0]:
D = nn.Dropout(0.5)
D(y)
Out[0]:
tensor([[0., 2., 0.],
        [2., 2., 0.],
        [0., 2., 0.]])
In [0]:
D = nn.Dropout(1)
D(y)
Out[0]:
tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])
In [0]:
D = nn.Dropout(0.3)
D(y)
Out[0]:
tensor([[0.0000, 0.0000, 0.0000],
        [1.4286, 0.0000, 1.4286],
        [1.4286, 1.4286, 1.4286]])
In [0]:
D = nn.Dropout(0.8)
D(y)
Out[0]:
tensor([[0., 0., 5.],
        [5., 0., 5.],
        [5., 0., 0.]])

Final note: the surviving values are scaled to original/(1-p), so the expected value of the output matches the input.
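
Also note that this dropping and rescaling only happens in training mode; in eval mode Dropout is the identity. A tiny sketch:

In [0]:
D = nn.Dropout(0.5)
D.eval()   # switch to evaluation mode
D(y)       # expected: y passed through unchanged -- nothing dropped, nothing rescaled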

Trick #18

Creating a mini-batch dimension

In [0]:
import torch
x = torch.randn(3,128,128)
x.shape
Out[0]:
torch.Size([3, 128, 128])
In [0]:
t = x.unsqueeze(0)
t.shape
Out[0]:
torch.Size([1, 3, 128, 128])
In [0]:
u = x[None,:]
u.shape
Out[0]:
torch.Size([1, 3, 128, 128])
In [0]:
v = x[None]
v.shape
Out[0]:
torch.Size([1, 3, 128, 128])

Trick #19

Look into torch.nn.ReLU()

In [0]:
x = torch.randn(3,3)
x
Out[0]:
tensor([[-1.1243,  0.5653,  0.3340],
        [-1.7015, -0.8263,  0.2759],
        [ 0.5675,  0.8615, -0.4378]])
In [0]:
y = nn.ReLU()
print(y(x)) # All negative values go to zero; inplace defaults to False
x
tensor([[0.0000, 0.5653, 0.3340],
        [0.0000, 0.0000, 0.2759],
        [0.5675, 0.8615, 0.0000]])
Out[0]:
tensor([[-1.1243,  0.5653,  0.3340],
        [-1.7015, -0.8263,  0.2759],
        [ 0.5675,  0.8615, -0.4378]])
In [0]:
y = nn.ReLU(inplace=True)
print(y(x))
x
tensor([[0.0000, 0.5653, 0.3340],
        [0.0000, 0.0000, 0.2759],
        [0.5675, 0.8615, 0.0000]])
Out[0]:
tensor([[0.0000, 0.5653, 0.3340],
        [0.0000, 0.0000, 0.2759],
        [0.5675, 0.8615, 0.0000]])

Trick #20

Changing a torch.Tensor's type

In [0]:
x = torch.randn(3,3)
x.dtype
Out[0]:
torch.float32
In [0]:
x = x.type(torch.long)
x.dtype
Out[0]:
torch.int64
In [0]:
x = torch.randn(3,3).type(torch.float)
x.dtype
Out[0]:
torch.float32
In [0]:
x = torch.ones(3,3, dtype=torch.long)
x.dtype
Out[0]:
torch.int64
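
Equivalent conversions can also be written with .to() or the dtype shorthand methods; a small sketch:

In [0]:
x = torch.randn(3, 3)
print(x.to(torch.long).dtype)  # torch.int64
print(x.long().dtype)          # torch.int64, the same conversion via the shorthand
print(x.float().dtype)         # torch.float32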

Trick #21

L1Loss vs MSELoss

In [0]:
x = torch.randn(1)
x
Out[0]:
tensor([-0.8117])
In [0]:
y = torch.ones(1)
y
Out[0]:
tensor([1.])
In [0]:
z = nn.L1Loss()
z(x,y)
Out[0]:
tensor(1.8117)
In [0]:
abs(x-y)
Out[0]:
tensor([1.8117])
In [0]:
a = nn.MSELoss()
a(x,y)
Out[0]:
tensor(3.2822)
In [0]:
pow((x-y),2)
Out[0]:
tensor([3.2822])
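
With more than one element, both losses reduce by averaging over the elements by default (reduction='mean'); a small sketch:

In [0]:
x = torch.tensor([0., 2.])
y = torch.tensor([1., 1.])
print(nn.L1Loss()(x, y), (x - y).abs().mean())    # both should be tensor(1.)
print(nn.MSELoss()(x, y), ((x - y) ** 2).mean())  # both should be tensor(1.)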

Trick #22

Sigmoid in PyTorch

In [0]:
x = torch.randn(1)
x
Out[0]:
tensor([-1.1949])
In [0]:
y = nn.Sigmoid()
y(x)
Out[0]:
tensor([0.2324])
In [0]:
import math
(1/(1+math.exp(-x)))
Out[0]:
0.23237839781597944
In [0]:
z = torch.ones(3,4)
z
Out[0]:
tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
In [0]:
y(z)
Out[0]:
tensor([[0.7311, 0.7311, 0.7311, 0.7311],
        [0.7311, 0.7311, 0.7311, 0.7311],
        [0.7311, 0.7311, 0.7311, 0.7311]])
In [0]:
z = torch.randn(3,4)
z
Out[0]:
tensor([[ 0.0480,  0.2540,  1.5722,  0.3227],
        [-1.0730,  0.8581, -1.3591, -0.2922],
        [ 0.6118,  0.5229,  0.0910,  1.1228]])
In [0]:
y(z)
Out[0]:
tensor([[0.5120, 0.5632, 0.8281, 0.5800],
        [0.2548, 0.7023, 0.2044, 0.4275],
        [0.6484, 0.6278, 0.5227, 0.7545]])

Trick #23

Softmax in PyTorch

In [0]:
x = torch.randn(2,2)
x
Out[0]:
tensor([[-1.0719, -0.8291],
        [-0.3748,  0.0375]])
In [0]:
y = nn.Softmax()
a = y(x);a
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  
Out[0]:
tensor([[0.4396, 0.5604],
        [0.3983, 0.6017]])
In [0]:
a[0][0]+a[0][1]
Out[0]:
tensor(1.)
In [0]:
a[1][0]+a[1][1]
Out[0]:
tensor(1.)
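
To avoid the deprecation warning above, pass the dimension explicitly, which also makes it clear along which axis the values sum to 1; a minimal sketch:

In [0]:
y = nn.Softmax(dim=1)  # normalize across each row
y(x)                   # same values as above, without the warning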

Trick #24

nn.ModuleList

In [0]:
x = nn.ModuleList([nn.Dropout(0.5),
                   nn.ReLU()])
In [0]:
x
Out[0]:
ModuleList(
  (0): Dropout(p=0.5, inplace=False)
  (1): ReLU()
)
In [0]:
y = torch.randn(3,3)
y
Out[0]:
tensor([[ 0.6233, -0.3660,  0.8597],
        [-1.4992, -0.3499,  1.3273],
        [ 0.5832, -1.0665, -1.6803]])
In [0]:
t = x[0](y);t  # performed dropout, and value modified as original/(1-0.5)
Out[0]:
tensor([[ 0.0000, -0.7321,  1.7194],
        [-2.9985, -0.6999,  2.6546],
        [ 0.0000, -2.1329, -0.0000]])
In [0]:
r = x[1](t);r   # performed ReLU
Out[0]:
tensor([[0.0000, 0.0000, 1.7194],
        [0.0000, 0.0000, 2.6546],
        [0.0000, 0.0000, 0.0000]])
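
The reason to use nn.ModuleList instead of a plain Python list is that its members are registered as sub-modules, so their parameters show up in .parameters() (and move with .to()/.cuda()); a small sketch assuming a module that holds several Linear layers:

In [0]:
class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])

class WithPlainList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(4, 4) for _ in range(3)]  # NOT registered as sub-modules

len(list(WithModuleList().parameters())), len(list(WithPlainList().parameters()))  # expected: (6, 0)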

Trick #25

nn.Linear

In [0]:
x = torch.randn(2);x
Out[0]:
tensor([0.0459, 0.0121])
In [0]:
a = nn.Linear(2,1);a
Out[0]:
Linear(in_features=2, out_features=1, bias=True)
In [0]:
a.weight, a.bias
Out[0]:
(Parameter containing:
 tensor([[0.1052, 0.3337]], requires_grad=True), Parameter containing:
 tensor([-0.3842], requires_grad=True))
In [0]:
a(x)
Out[0]:
tensor([-0.3754], grad_fn=<AddBackward0>)
In [0]:
x@a.weight.t()+a.bias
Out[0]:
tensor([-0.3754], grad_fn=<AddBackward0>)

Trick #26

torch.mean()

In [0]:
x = torch.FloatTensor([[1,2,3,4],[5,6,7,8]])
x.shape
Out[0]:
torch.Size([2, 4])
In [0]:
y = x.mean()
y
Out[0]:
tensor(4.5000)
In [0]:
y = x.mean(dim=1, keepdim=True);y
Out[0]:
tensor([[2.5000],
        [6.5000]])
In [0]:
y = x.mean(dim=1, keepdim=False);y
Out[0]:
tensor([2.5000, 6.5000])
In [0]:
y.shape
Out[0]:
torch.Size([2])
In [0]:
x = torch.randn(3,4,5)
x.shape
Out[0]:
torch.Size([3, 4, 5])
In [0]:
y = x.mean(dim=1, keepdim=False);y.shape
Out[0]:
torch.Size([3, 5])
In [0]:
y = x.mean(dim=1, keepdim=True);y.shape
Out[0]:
torch.Size([3, 1, 5])
In [0]:
y
Out[0]:
tensor([[[ 0.1030, -0.2823, -0.3783, -0.4649, -0.4580]],

        [[-0.0760, -0.3777, -0.3589,  0.6745,  0.0425]],

        [[ 0.5953, -0.7010, -0.4688,  0.1781, -0.3393]]])

Trick #27

Use dropblock in PyTorch

In [0]:
!pip install dropblock
Collecting dropblock
  Downloading https://files.pythonhosted.org/packages/92/ba/a2c6388f228045fa543f263923804e799b2e9d86b0517c5a53564ae0de3e/dropblock-0.3.0-py3-none-any.whl
Requirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from dropblock) (1.3.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from dropblock) (1.17.4)
Installing collected packages: dropblock
Successfully installed dropblock-0.3.0
In [0]:
x = torch.ones(5,5,5,5);x.shape
Out[0]:
torch.Size([5, 5, 5, 5])
In [0]:
import dropblock
y = dropblock.DropBlock2D(drop_prob=0.5, block_size=2)
y(x)    # dropout 2x2 size blocks with chance of 50%
Out[0]:
tensor([[[[1.4706, 0.0000, 0.0000, 1.4706, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000]],

         [[1.4706, 0.0000, 0.0000, 1.4706, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000]],

         [[1.4706, 0.0000, 0.0000, 1.4706, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000]],

         [[1.4706, 0.0000, 0.0000, 1.4706, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000]],

         [[1.4706, 0.0000, 0.0000, 1.4706, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000]]],


        [[[1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]]],


        [[[1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 0.0000, 0.0000, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]]],


        [[[1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]],

         [[1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 0.0000, 0.0000],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706]]],


        [[[0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 0.0000, 1.4706]],

         [[0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 0.0000, 1.4706]],

         [[0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 0.0000, 1.4706]],

         [[0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 0.0000, 1.4706]],

         [[0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [0.0000, 0.0000, 1.4706, 1.4706, 1.4706],
          [1.4706, 1.4706, 1.4706, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 1.4706, 1.4706],
          [1.4706, 0.0000, 0.0000, 0.0000, 1.4706]]]])
In [0]:
y = dropblock.DropBlock2D(drop_prob=0.5, block_size=3)
y(x)    # zero out 3x3 blocks with a drop probability of 0.5
Out[0]:
tensor([[[[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]]],


        [[[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]]],


        [[[0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887]],

         [[0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887]],

         [[0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887]],

         [[0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887]],

         [[0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [0.0000, 0.0000, 1.2887, 0.0000, 0.0000],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887],
          [1.2887, 0.0000, 0.0000, 0.0000, 1.2887]]],


        [[[1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000]],

         [[1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000]],

         [[1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000]],

         [[1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000]],

         [[1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000],
          [1.2887, 1.2887, 1.2887, 0.0000, 0.0000]]],


        [[[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]],

         [[1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887],
          [1.2887, 1.2887, 1.2887, 1.2887, 1.2887]]]])
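
The non-zero entries are not 1.0 because DropBlock, like inverted dropout, rescales the surviving activations to preserve their expected magnitude. Assuming x above is an all-ones tensor of shape (5, 5, 5, 5) (which the identical surviving values suggest) and the mask is shared across channels, the scale is the number of mask positions divided by the number kept; a rough check of the value printed above:

In [0]:
# 5 samples x 5x5 mask positions = 125, of which 97 are non-zero in the output above
total_positions, kept_positions = 125, 97
print(total_positions / kept_positions)  # ~1.2887, the surviving value above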

Trick #28

Orthogonal Initialization in PyTorch

In [0]:
import torch
import torch.nn as nn
# create three separate zero matrices ([torch.zeros(3, 3)]*3 would alias the same tensor three times)
x, y, z = torch.zeros(3, 3), torch.zeros(3, 3), torch.zeros(3, 3)
x, y, z
Out[0]:
(tensor([[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]), tensor([[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]), tensor([[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]))
In [0]:
a = nn.init.orthogonal_(x, gain=1) # orthogonal means A@A.t() = I
a
Out[0]:
tensor([[ 0.6239,  0.4647, -0.6283],
        [-0.7781,  0.2945, -0.5549],
        [-0.0729,  0.8350,  0.5453]])
In [0]:
a@a.t()
Out[0]:
tensor([[1.0000e+00, 1.1334e-07, 1.4851e-07],
        [1.1334e-07, 1.0000e+00, 7.9556e-08],
        [1.4851e-07, 7.9556e-08, 1.0000e+00]])
In [0]:
torch.eye(3)
Out[0]:
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])
In [0]:
b = nn.init.orthogonal_(y, gain=5) # gain=5 scales the matrix, so b@b.t() = 25*I
b
Out[0]:
tensor([[-0.8261, -0.1627, -4.9286],
        [ 4.9304,  0.0674, -0.8286],
        [ 0.0934, -4.9969,  0.1493]])
In [0]:
b@b.t()
Out[0]:
tensor([[ 2.5000e+01,  2.8265e-07, -1.1301e-06],
        [ 2.8265e-07,  2.5000e+01, -2.2266e-07],
        [-1.1301e-06, -2.2266e-07,  2.5000e+01]])
In [0]:
25*torch.eye(3)
Out[0]:
tensor([[25.,  0.,  0.],
        [ 0., 25.,  0.],
        [ 0.,  0., 25.]])

Final note: remember that the initialization process is random; we get a different matrix every time we re-run the cell.
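
If you need the same matrix on every run, you can seed PyTorch's RNG before calling the initializer; a minimal sketch:

In [0]:
torch.manual_seed(0)
c = nn.init.orthogonal_(torch.empty(3, 3), gain=1)
# re-running both lines together always reproduces the same orthogonal matrix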


Trick #29

Param_groups in PyTorch optimizers

In [0]:
import torch
import torch.nn as nn
from torch import optim
In [0]:
l = nn.Linear(3,3)
r = optim.SGD(l.parameters(),lr=0.01)
In [0]:
r
Out[0]:
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.01
    momentum: 0
    nesterov: False
    weight_decay: 0
)
In [0]:
r.param_groups
Out[0]:
[{'dampening': 0,
  'lr': 0.01,
  'momentum': 0,
  'nesterov': False,
  'params': [Parameter containing:
   tensor([[-0.0972,  0.1965,  0.2558],
           [-0.1338, -0.2729,  0.3077],
           [ 0.0461,  0.3657, -0.1356]], requires_grad=True),
   Parameter containing:
   tensor([0.2679, 0.1912, 0.1528], requires_grad=True)],
  'weight_decay': 0}]
In [0]:
r.param_groups[0]['params']
Out[0]:
[Parameter containing:
 tensor([[-0.0972,  0.1965,  0.2558],
         [-0.1338, -0.2729,  0.3077],
         [ 0.0461,  0.3657, -0.1356]], requires_grad=True),
 Parameter containing:
 tensor([0.2679, 0.1912, 0.1528], requires_grad=True)]
In [0]:
# The first is the weight, the second is the bias
for count, i in enumerate(l.parameters()):
    print(count)
    print(i)
0
Parameter containing:
tensor([[-0.0972,  0.1965,  0.2558],
        [-0.1338, -0.2729,  0.3077],
        [ 0.0461,  0.3657, -0.1356]], requires_grad=True)
1
Parameter containing:
tensor([0.2679, 0.1912, 0.1528], requires_grad=True)

All of this information can be accessed through the optimizer's param_groups.
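
A common use of param_groups is to give different parameter subsets their own hyper-parameters, e.g. a smaller learning rate for one layer. A minimal sketch with hypothetical names (net, opt), following the per-parameter-options pattern from the PyTorch docs:

In [0]:
net = nn.Sequential(nn.Linear(3, 3), nn.Linear(3, 3))
opt = optim.SGD([
    {'params': net[0].parameters()},               # group 0: uses the default lr below
    {'params': net[1].parameters(), 'lr': 0.001},  # group 1: its own, smaller lr
], lr=0.01, momentum=0.9)
print([g['lr'] for g in opt.param_groups])  # [0.01, 0.001]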

Trick #30

Math behind "standard_deviation"

In [0]:
import torch
x = torch.tensor([1.,2.,3.,4.,5.,6.])
In [0]:
m = x.mean()
In [0]:
(x-m)
Out[0]:
tensor([-2.5000, -1.5000, -0.5000,  0.5000,  1.5000,  2.5000])
In [0]:
(x-m).mean()
Out[0]:
tensor(0.)
In [0]:
(x-m).pow(2).mean()
Out[0]:
tensor(2.9167)
In [0]:
(x-m).pow(2).mean().sqrt()
Out[0]:
tensor(1.7078)
In [0]:
x.std(unbiased=False)
Out[0]:
tensor(1.7078)
In [0]:
# If unbiased is False, then the standard-deviation will be calculated via the biased estimator. 
# Otherwise, Bessel’s correction will be used.
x.std(unbiased=True)
Out[0]:
tensor(1.8708)
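
With unbiased=True the divisor is n - 1 (Bessel's correction) instead of n, so we can reproduce 1.8708 by hand:

In [0]:
n = x.numel()                            # 6 elements
((x - m).pow(2).sum() / (n - 1)).sqrt()  # sqrt(17.5 / 5) = tensor(1.8708)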

Trick #31 (important)

Layer-sequential unit-variance (LSUV) initialization implementation.
This technique was proposed in the 2015 paper All you need is a good init.
A good initialization should keep each layer's output activations with a standard deviation near 1.0 and a mean near 0.0, no matter how deep the network is. Note that the LSUVInit class below prints a summary at the end telling us how well it has done.

In [0]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


class LSUVInit(object):

    def __init__(self,
                 model: nn.Module,
                 data_loader: DataLoader,
                 needed_std: float = 1.0,
                 std_tol: float = 0.1,
                 max_attempts: int = 10,
                 do_orthonorm: bool = True,
                 device: torch.device = 'cpu') -> None:
        self._model = model
        self.data_loader = data_loader
        self.needed_std = needed_std
        self.std_tol = std_tol
        self.max_attempts = max_attempts
        self.do_orthonorm = do_orthonorm
        self.device = device

        self.eps = 1e-8
        self.hook_position = 0
        self.total_fc_conv_layers = 0
        self.done_counter = -1
        self.hook = None
        self.act_dict: np.ndarray = None
        self.counter_to_apply_correction = 0
        self.correction_needed = False
        self.current_coef = 1.0

    def svd_orthonormal(self, w: np.ndarray) -> np.ndarray:
        shape = w.shape
        if len(shape) < 2:
            raise RuntimeError("Only shapes of length 2 or more are supported.")
        flat_shape = (shape[0], np.prod(shape[1:]))
        a = np.random.normal(0.0, 1.0, flat_shape)  # random Gaussian matrix with the flattened shape
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == flat_shape else v
        print(shape, flat_shape)
        q = q.reshape(shape)
        return q.astype(np.float32)

    def count_conv_fc_layers(self, m: nn.Module) -> None:
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
            self.total_fc_conv_layers += 1

    def orthogonal_weights_init(self, m: nn.Module) -> None:
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
            if hasattr(m, 'weight_v'):
                w_ortho = self.svd_orthonormal(m.weight_v.data.cpu().numpy())
                m.weight_v.data = torch.from_numpy(w_ortho)
                try:
                    nn.init.constant_(m.bias, 0)
                except Exception:
                    pass
            else:
                w_ortho = self.svd_orthonormal(m.weight.data.cpu().numpy())
                m.weight.data = torch.from_numpy(w_ortho)
                try:
                    nn.init.constant_(m.bias, 0)
                except Exception:
                    pass

    def store_activations(self,
                          module: nn.Module,
                          data: torch.Tensor,
                          output: torch.Tensor) -> None:
        self.act_dict = output.detach().cpu().numpy()

    def add_current_hook(self, m: nn.Module) -> None:
        if self.hook is not None:
            return
        if (isinstance(m, nn.Conv2d)) or (isinstance(m, nn.Linear)):
            if self.hook_position > self.done_counter:
                self.hook = m.register_forward_hook(self.store_activations)
            else:
                self.hook_position += 1

    def apply_weights_correction(self, m: nn.Module) -> None:
        if self.hook is None:
            return
        if not self.correction_needed:
            return
        if (isinstance(m, nn.Conv2d)) or (isinstance(m, nn.Linear)):
            if self.counter_to_apply_correction < self.hook_position:
                self.counter_to_apply_correction += 1
            else:
                if hasattr(m, 'weight_g'):
                    m.weight_g.data *= float(self.current_coef)
                    self.correction_needed = False
                else:
                    m.weight.data *= self.current_coef
                    self.correction_needed = False

    def initialize(self) -> nn.Module:
        model = self._model
        model.eval()

        model.apply(self.count_conv_fc_layers)
        if self.do_orthonorm:
            model.apply(self.orthogonal_weights_init)

        model = model.to(self.device)
        for layer_idx in range(self.total_fc_conv_layers):
            print(layer_idx)
            model.apply(self.add_current_hook)
            data = next(iter(self.data_loader))
            data, _ = [d.to(self.device) for d in data]
            model(data)
            current_std = self.act_dict.std()
            print('std at layer ', layer_idx, ' = ', current_std)

            attempts = 0
            while (np.abs(current_std - self.needed_std) > self.std_tol):
                self.current_coef = self.needed_std / (current_std + self.eps)
                self.correction_needed = True
                model.apply(self.apply_weights_correction)

                model = model.to(self.device)
                model(data)
                current_std = self.act_dict.std()
                print('std at layer ', layer_idx, ' = ', current_std, 'mean = ', self.act_dict.mean())
                attempts += 1
                if attempts > self.max_attempts:
                    break

            if self.hook is not None:
                self.hook.remove()

            self.done_counter += 1
            self.counter_to_apply_correction = 0
            self.hook_position = 0
            self.hook = None
            print('finish at layer', layer_idx)

        print('LSUV init done!')
        return model


def lsuv_init(model: nn.Module,
              data_loader: DataLoader,
              needed_std: float,
              std_tol: float,
              max_attempts: int,
              do_orthonorm: bool,
              device: torch.device) -> nn.Module:

    return LSUVInit(
        model, data_loader, needed_std, std_tol,
        max_attempts, do_orthonorm, device).initialize()
In [0]:
import torchvision
import torchvision.transforms as transforms

print(f"CUDA available: {torch.cuda.is_available()}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
CUDA available: True
In [0]:
# specify transforms
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])
In [0]:
# Download dataset and create dataloader
trainset = torchvision.datasets.CIFAR10(root='./', train=True, download=True, transform=transform_train)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)
0it [00:00, ?it/s]
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz
170500096it [00:06, 27163641.65it/s]                               
Extracting ./cifar-10-python.tar.gz to ./
In [0]:
# our model architecture
x = nn.Sequential(nn.Conv2d(3,8,3), nn.Conv2d(8,16,3))
In [0]:
# create our model, initializing it with LSUV init
model = lsuv_init(x, train_loader, needed_std=1., std_tol=0.1, max_attempts=10, do_orthonorm=True, device=device)
(8, 3, 3, 3) (8, 27)
(16, 8, 3, 3) (16, 72)
0
std at layer  0  =  1.3544831
std at layer  0  =  0.99999976 mean =  0.0035810915
finish at layer 0
1
std at layer  1  =  0.8738613
std at layer  1  =  1.0 mean =  -0.055405967
finish at layer 1
LSUV init done!
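
As a quick sanity check (a hypothetical extra cell reusing model, train_loader and device from above), we can confirm that the activations of the initialized model really do come out with a standard deviation close to 1:

In [0]:
data, _ = next(iter(train_loader))
with torch.no_grad():
    out = model(data.to(device))
print(out.std())  # should be close to 1.0 after LSUV init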

Trick #32

1x1_conv

In [0]:
import torch
import torch.nn as nn

inp = torch.randn(1,1,128,128) # batch_size of 1, 1x128x128 image

# create encoder and decoder of 1x1 conv2d
enc = nn.Conv2d(1,10,kernel_size=1)
dec = nn.Conv2d(10,1,kernel_size=1)
In [0]:
pred = enc(inp) # increase the channel dimensionality (1 -> 10)
pred.shape
Out[0]:
torch.Size([1, 10, 128, 128])
In [0]:
pred_2 = dec(pred) # decrease the channel dimensionality (10 -> 1)
pred_2.shape
Out[0]:
torch.Size([1, 1, 128, 128])
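
A 1x1 convolution acts like a per-pixel linear layer over the channel dimension: it changes the number of channels without touching the spatial size, and it is cheap, with in_channels * out_channels + out_channels parameters. A quick check on the encoder above:

In [0]:
sum(p.numel() for p in enc.parameters())  # 1*10 weights + 10 biases = 20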

Trick #33

nn.Conv2d in PyTorch

In [0]:
import torch
import torch.nn as nn
from fastai.vision import show_image
In [0]:
inp = torch.randn(1,1,128,128)
show_image(inp[0]) # show an image of random pixel intensities
Out[0]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8c43940048>
In [0]:
conv = nn.Conv2d(1,3, kernel_size=3)
pred = conv(inp)
pred.shape
Out[0]:
torch.Size([1, 3, 126, 126])
In [0]:
show_image(pred[0])  # we made the image 3-channel, going from 1 channel to 3 channels
Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).
Out[0]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8c41f5b710>
In [0]:
conv_next = nn.Conv2d(3,1,kernel_size=3)
pred_next = conv_next(pred)
pred_next.shape
Out[0]:
torch.Size([1, 1, 124, 124])
In [0]:
show_image(pred_next[0])  # convert back to 1 channel by specifying the output channels
Out[0]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f8c41f2a6a0>
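
The spatial size shrinks from 128 to 126 to 124 because a 3x3 kernel without padding gives out = in - kernel_size + 1; adding padding=1 keeps the size unchanged. A minimal sketch (conv_same is a hypothetical extra layer):

In [0]:
conv_same = nn.Conv2d(1, 3, kernel_size=3, padding=1)
conv_same(inp).shape  # torch.Size([1, 3, 128, 128])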


Trick #34

PyTorch hooks #2: take another look at PyTorch hooks

In [0]:
import torch
import torch.nn as nn
import torch.nn.init as init
from fastai.vision import children
In [0]:
x = torch.randn(1,1,128,128) # one batch size, 128x128 image
In [0]:
model = nn.Sequential(nn.Conv2d(1,3,kernel_size=3),nn.ReLU())
model
Out[0]:
Sequential(
  (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
)
In [0]:
# initializing weights
def weight_init_orthogonal(m):
    classname = m.__class__.__name__
    print(classname)
    if classname.find("Conv") != -1:
        init.orthogonal_(m.weight.data, gain=1)

model.apply(weight_init_orthogonal)
Conv2d
ReLU
Sequential
Out[0]:
Sequential(
  (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
  (1): ReLU()
)
In [0]:
# We want to see the output after our input is passed through the Conv2d layer,
# and then we will be able to pass it on to any other layer

# SaveFeature stores a layer's output by registering a forward hook on it
class SaveFeature():
    feature = None  # note: hook_fn stores into self.features, which only exists after the first forward pass

    # when we initialize, we register hook_fn on to the forward pass
    def __init__(self, m):
        self.hook = m.register_forward_hook(self.hook_fn)

    def hook_fn(self, module, input, output):
        self.features = output

    def close(self):
        self.hook.remove()
In [0]:
# we have 2 children here: Conv2d and ReLU
# later we will be registering two different hooks on each
children(model)
Out[0]:
[Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1)), ReLU()]
In [0]:
saved_features_conv, saved_features_relu = SaveFeature(children(model)[0]), SaveFeature(children(model)[1])
In [0]:
saved_features_conv.features # hook registered but nothing stored yet
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-15-be44644e2347> in <module>()
----> 1 saved_features_conv.features # hook registered but nothing stored yet

AttributeError: 'SaveFeature' object has no attribute 'features'
In [0]:
# Let's run one forward pass
pred = model(x)
In [0]:
saved_features_conv.features # output of conv layer in this forward pass is stored
Out[0]:
tensor([[[[-2.5800, -0.2647,  1.6239,  ..., -0.2907,  0.1908,  0.7584],
          [ 0.0554, -1.1443, -1.1954,  ..., -1.0791, -1.3920,  0.0816],
          [-0.7532,  2.2206, -0.0770,  ...,  0.9156,  1.2014,  1.5768],
          ...,
          [-0.2323, -0.8035,  0.1890,  ..., -0.4978, -0.9132, -1.1315],
          [ 1.0107,  0.2377, -0.2464,  ..., -1.3003,  0.1436,  0.6211],
          [ 0.4233,  0.4291,  0.6064,  ...,  0.1020, -1.4040,  1.5670]],

         [[-0.8652, -1.9808,  0.6113,  ...,  0.5477, -1.1160, -0.3529],
          [ 1.3784, -2.6280, -0.6510,  ...,  0.0914, -0.2478, -1.0833],
          [ 0.5305,  0.2478,  1.9395,  ...,  0.3250, -0.1957, -1.1105],
          ...,
          [ 1.1443, -0.5373,  1.1536,  ..., -0.8420, -0.3519, -0.7982],
          [-0.0647,  0.2426, -1.6397,  ...,  0.5164,  0.9109,  0.7878],
          [-1.5264, -0.9349, -0.4995,  ..., -1.1442,  0.7425, -1.8948]],

         [[-0.5738,  0.8688,  1.2297,  ...,  0.0929,  1.1935,  0.9596],
          [-0.1158,  1.9589,  0.2431,  ...,  0.3384,  1.8386, -0.2965],
          [-0.0054,  0.6109, -0.2503,  ..., -0.7113,  0.6119,  1.0528],
          ...,
          [ 0.7711, -0.8947,  0.2121,  ...,  1.0238,  2.1332,  0.0358],
          [-1.2449,  0.3236, -0.4481,  ...,  1.0771,  0.4712,  0.3424],
          [-0.6949,  0.4338,  1.0977,  ..., -0.9355, -0.2789, -0.4622]]]],
       grad_fn=<MkldnnConvolutionBackward>)
In [0]:
saved_features_conv.features.shape # check the shape of the conv layer's output stored during this forward pass
Out[0]:
torch.Size([1, 3, 126, 126])
In [0]:
saved_features_relu.features # features after forward pass
Out[0]:
tensor([[[[0.0000, 0.0000, 1.6239,  ..., 0.0000, 0.1908, 0.7584],
          [0.0554, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0816],
          [0.0000, 2.2206, 0.0000,  ..., 0.9156, 1.2014, 1.5768],
          ...,
          [0.0000, 0.0000, 0.1890,  ..., 0.0000, 0.0000, 0.0000],
          [1.0107, 0.2377, 0.0000,  ..., 0.0000, 0.1436, 0.6211],
          [0.4233, 0.4291, 0.6064,  ..., 0.1020, 0.0000, 1.5670]],

         [[0.0000, 0.0000, 0.6113,  ..., 0.5477, 0.0000, 0.0000],
          [1.3784, 0.0000, 0.0000,  ..., 0.0914, 0.0000, 0.0000],
          [0.5305, 0.2478, 1.9395,  ..., 0.3250, 0.0000, 0.0000],
          ...,
          [1.1443, 0.0000, 1.1536,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.2426, 0.0000,  ..., 0.5164, 0.9109, 0.7878],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.7425, 0.0000]],

         [[0.0000, 0.8688, 1.2297,  ..., 0.0929, 1.1935, 0.9596],
          [0.0000, 1.9589, 0.2431,  ..., 0.3384, 1.8386, 0.0000],
          [0.0000, 0.6109, 0.0000,  ..., 0.0000, 0.6119, 1.0528],
          ...,
          [0.7711, 0.0000, 0.2121,  ..., 1.0238, 2.1332, 0.0358],
          [0.0000, 0.3236, 0.0000,  ..., 1.0771, 0.4712, 0.3424],
          [0.0000, 0.4338, 1.0977,  ..., 0.0000, 0.0000, 0.0000]]]],
       grad_fn=<ReluBackward0>)
In [0]:
# we pass the output after first Conv Layer to another neural network
model2 = nn.Sequential(nn.Dropout(0.5))
In [0]:
t = model2(saved_features_conv.features)
In [0]:
t
Out[0]:
tensor([[[[-5.1599, -0.0000,  3.2477,  ..., -0.5813,  0.0000,  0.0000],
          [ 0.0000, -0.0000, -0.0000,  ..., -2.1582, -0.0000,  0.1633],
          [-0.0000,  0.0000, -0.1541,  ...,  1.8312,  0.0000,  0.0000],
          ...,
          [-0.0000, -1.6069,  0.3780,  ..., -0.0000, -0.0000, -0.0000],
          [ 2.0213,  0.0000, -0.0000,  ..., -2.6007,  0.0000,  0.0000],
          [ 0.8467,  0.8583,  1.2128,  ...,  0.0000, -0.0000,  3.1339]],

         [[-1.7303, -0.0000,  0.0000,  ...,  0.0000, -2.2321, -0.0000],
          [ 0.0000, -5.2560, -1.3020,  ...,  0.1828, -0.0000, -0.0000],
          [ 1.0610,  0.0000,  3.8789,  ...,  0.6499, -0.3914, -2.2210],
          ...,
          [ 0.0000, -1.0745,  0.0000,  ..., -1.6841, -0.7037, -0.0000],
          [-0.0000,  0.0000, -3.2794,  ...,  1.0328,  0.0000,  0.0000],
          [-3.0527, -1.8698, -0.0000,  ..., -0.0000,  0.0000, -3.7897]],

         [[-1.1476,  0.0000,  0.0000,  ...,  0.1857,  2.3870,  1.9191],
          [-0.0000,  3.9178,  0.0000,  ...,  0.0000,  0.0000, -0.5930],
          [-0.0000,  1.2217, -0.0000,  ..., -1.4225,  0.0000,  0.0000],
          ...,
          [ 1.5422, -0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0716],
          [-2.4899,  0.6473, -0.8961,  ...,  0.0000,  0.0000,  0.0000],
          [-1.3897,  0.0000,  2.1954,  ..., -1.8710, -0.0000, -0.9243]]]],
       grad_fn=<MulBackward0>)
In [0]:
t.shape
Out[0]:
torch.Size([1, 3, 126, 126])
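
Once we are done inspecting activations, we should remove the hooks via the close() method defined above; otherwise they keep firing (and storing tensors) on every future forward pass:

In [0]:
saved_features_conv.close()
saved_features_relu.close()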

Trick #35

Step-by-step computation of training a neural network in PyTorch

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
In [0]:
inp = torch.ones(1, requires_grad=True) # input, a number whose value is one
inp
Out[0]:
tensor([1.], requires_grad=True)
In [0]:
outp = torch.zeros(1, requires_grad=True) # output, we want a number whose value is 0
outp
Out[0]:
tensor([0.], requires_grad=True)
In [0]:
x = nn.Linear(1,1) # we pass input to linear layer
In [0]:
list(x.parameters())  # weight and bias of our linear layer, which we modify to get the correct prediction
Out[0]:
[Parameter containing:
 tensor([[0.0511]], requires_grad=True), Parameter containing:
 tensor([0.1359], requires_grad=True)]
In [0]:
for t in x.parameters():
    print(t.grad) # now there is nothing stored
None
None
In [0]:
# forward pass
pred = x(inp)
pred # we want pred to be zero (calculate loss with outp)
Out[0]:
tensor([0.1870], grad_fn=<AddBackward0>)
In [0]:
loss_function = nn.L1Loss() 
In [0]:
loss = loss_function(pred, outp)
In [0]:
loss
Out[0]:
tensor(0.1870, grad_fn=<MeanBackward0>)
In [0]:
a = optim.SGD(x.parameters(), lr=0.01)
In [0]:
a.zero_grad()
In [0]:
for t in x.parameters():
    print(t.grad)
None
None
In [0]:
loss.backward()
In [0]:
loss # no change to the loss itself
Out[0]:
tensor(0.1870, grad_fn=<MeanBackward0>)
In [0]:
for t in x.parameters():
    print(t.grad)   # now gradients have been computed
tensor([[1.]])
tensor([1.])
In [0]:
a.step() # update step
In [0]:
list(x.parameters()) # weights have already been updated
Out[0]:
[Parameter containing:
 tensor([[0.0411]], requires_grad=True), Parameter containing:
 tensor([0.1259], requires_grad=True)]
In [0]:
for t in x.parameters():
    print(t.grad) # no change for gradient
tensor([[1.]])
tensor([1.])

Now run the same steps again

In [0]:
pred = x(inp)
In [0]:
pred
Out[0]:
tensor([0.1670], grad_fn=<AddBackward0>)
In [0]:
for t in x.parameters():
    print(t.grad)
tensor([[1.]])
tensor([1.])
In [0]:
loss = loss_function(pred, outp)
loss
Out[0]:
tensor(0.1670, grad_fn=<MeanBackward0>)
In [0]:
a.zero_grad()
loss.backward()
In [0]:
a.step() 
In [0]:
list(x.parameters())
Out[0]:
[Parameter containing:
 tensor([[0.0311]], requires_grad=True), Parameter containing:
 tensor([0.1159], requires_grad=True)]
In [0]:
for t in x.parameters():
    print(t.grad)
tensor([[1.]])
tensor([1.])
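
Putting the pieces together, a full training loop just repeats these five steps. A minimal sketch reusing the objects defined above:

In [0]:
for step in range(10):
    pred = x(inp)                     # 1. forward pass
    loss = loss_function(pred, outp)  # 2. compute the loss
    a.zero_grad()                     # 3. clear old gradients
    loss.backward()                   # 4. back-propagate
    a.step()                          # 5. update the weight and bias
list(x.parameters())                  # the parameters keep moving toward predicting 0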

Trick #36

Trick #37

Trick #38

Trick #39

Trick #40

Trick #41

Trick #42

Trick #43

Trick #44

Trick #45
