diff --git a/docs/02-Quickstart.md b/docs/02-Quickstart.md
index c529ed9..4edd486 100644
--- a/docs/02-Quickstart.md
+++ b/docs/02-Quickstart.md
@@ -229,78 +229,78 @@ print("Done!")
 
     Epoch 1
     -------------------------------
-    loss: 2.300704  [    0/60000]
-    loss: 2.294491  [ 6400/60000]
-    loss: 2.270792  [12800/60000]
-    loss: 2.270757  [19200/60000]
-    loss: 2.246651  [25600/60000]
-    loss: 2.223734  [32000/60000]
-    loss: 2.230299  [38400/60000]
-    loss: 2.197789  [44800/60000]
-    loss: 2.186385  [51200/60000]
-    loss: 2.171854  [57600/60000]
+    loss: 2.300994  [    0/60000]
+    loss: 2.289627  [ 6400/60000]
+    loss: 2.278757  [12800/60000]
+    loss: 2.273481  [19200/60000]
+    loss: 2.260533  [25600/60000]
+    loss: 2.230715  [32000/60000]
+    loss: 2.240870  [38400/60000]
+    loss: 2.210235  [44800/60000]
+    loss: 2.205794  [51200/60000]
+    loss: 2.179301  [57600/60000]
     Test Error: 
-     Accuracy: 40.4%, Avg loss: 2.158354 
+     Accuracy: 42.7%, Avg loss: 2.175595 
     
     Epoch 2
     -------------------------------
-    loss: 2.157282  [    0/60000]
-    loss: 2.157837  [ 6400/60000]
-    loss: 2.098653  [12800/60000]
-    loss: 2.123712  [19200/60000]
-    loss: 2.070209  [25600/60000]
-    loss: 2.017735  [32000/60000]
-    loss: 2.044564  [38400/60000]
-    loss: 1.971302  [44800/60000]
-    loss: 1.963748  [51200/60000]
-    loss: 1.920766  [57600/60000]
+    loss: 2.179688  [    0/60000]
+    loss: 2.170581  [ 6400/60000]
+    loss: 2.125383  [12800/60000]
+    loss: 2.134987  [19200/60000]
+    loss: 2.104071  [25600/60000]
+    loss: 2.039638  [32000/60000]
+    loss: 2.065766  [38400/60000]
+    loss: 1.994649  [44800/60000]
+    loss: 1.991123  [51200/60000]
+    loss: 1.927214  [57600/60000]
     Test Error: 
-     Accuracy: 55.5%, Avg loss: 1.902382 
+     Accuracy: 56.1%, Avg loss: 1.929943 
     
     Epoch 3
     -------------------------------
-    loss: 1.919148  [    0/60000]
-    loss: 1.903148  [ 6400/60000]
-    loss: 1.782882  [12800/60000]
-    loss: 1.834309  [19200/60000]
-    loss: 1.722989  [25600/60000]
-    loss: 1.676954  [32000/60000]
-    loss: 1.698752  [38400/60000]
-    loss: 1.602475  [44800/60000]
-    loss: 1.614792  [51200/60000]
-    loss: 1.532669  [57600/60000]
+    loss: 1.957387  [    0/60000]
+    loss: 1.929036  [ 6400/60000]
+    loss: 1.825893  [12800/60000]
+    loss: 1.850506  [19200/60000]
+    loss: 1.775094  [25600/60000]
+    loss: 1.708617  [32000/60000]
+    loss: 1.727947  [38400/60000]
+    loss: 1.628896  [44800/60000]
+    loss: 1.653404  [51200/60000]
+    loss: 1.548985  [57600/60000]
     Test Error: 
-     Accuracy: 61.7%, Avg loss: 1.533873 
+     Accuracy: 60.7%, Avg loss: 1.570322 
     
     Epoch 4
     -------------------------------
-    loss: 1.585873  [    0/60000]
-    loss: 1.560321  [ 6400/60000]
-    loss: 1.407954  [12800/60000]
-    loss: 1.488211  [19200/60000]
-    loss: 1.364034  [25600/60000]
-    loss: 1.362447  [32000/60000]
-    loss: 1.370802  [38400/60000]
-    loss: 1.302972  [44800/60000]
-    loss: 1.327800  [51200/60000]
-    loss: 1.235748  [57600/60000]
+    loss: 1.634544  [    0/60000]
+    loss: 1.598077  [ 6400/60000]
+    loss: 1.457816  [12800/60000]
+    loss: 1.511364  [19200/60000]
+    loss: 1.425202  [25600/60000]
+    loss: 1.398494  [32000/60000]
+    loss: 1.412483  [38400/60000]
+    loss: 1.328141  [44800/60000]
+    loss: 1.371268  [51200/60000]
+    loss: 1.270080  [57600/60000]
     Test Error: 
-     Accuracy: 63.4%, Avg loss: 1.260575 
+     Accuracy: 63.2%, Avg loss: 1.298073 
     
     Epoch 5
     -------------------------------
-    loss: 1.331637  [    0/60000]
-    loss: 1.313866  [ 6400/60000]
-    loss: 1.153163  [12800/60000]
-    loss: 1.257744  [19200/60000]
-    loss: 1.137783  [25600/60000]
-    loss: 1.162715  [32000/60000]
-    loss: 1.172138  [38400/60000]
-    loss: 1.120971  [44800/60000]
-    loss: 1.149632  [51200/60000]
-    loss: 1.069323  [57600/60000]
+    loss: 1.375485  [    0/60000]
+    loss: 1.353134  [ 6400/60000]
+    loss: 1.197045  [12800/60000]
+    loss: 1.282228  [19200/60000]
+    loss: 1.185837  [25600/60000]
+    loss: 1.195442  [32000/60000]
+    loss: 1.213788  [38400/60000]
+    loss: 1.140980  [44800/60000]
+    loss: 1.188507  [51200/60000]
+    loss: 1.102179  [57600/60000]
     Test Error: 
-     Accuracy: 64.6%, Avg loss: 1.093657 
+     Accuracy: 64.6%, Avg loss: 1.124997 
     
     Done!
 
diff --git a/docs/03-Tensors.md b/docs/03-Tensors.md
index d87b048..19d68fd 100644
--- a/docs/03-Tensors.md
+++ b/docs/03-Tensors.md
@@ -73,8 +73,8 @@ print(f"Random Tensor: \n {x_rand} \n")
             [1, 1]]) 
     
     Random Tensor: 
-     tensor([[0.0504, 0.9505],
-            [0.6485, 0.6105]]) 
+     tensor([[0.7786, 0.0142],
+            [0.3120, 0.9157]]) 
     
 
 
@@ -97,8 +97,8 @@ print(f"Zeros Tensor: \n {zeros_tensor}")
 ```
 
     Random Tensor: 
-     tensor([[0.6582, 0.2838, 0.1244],
-            [0.1692, 0.0394, 0.2638]]) 
+     tensor([[0.7263, 0.5640, 0.3222],
+            [0.9226, 0.3125, 0.3739]]) 
     
     Ones Tensor: 
      tensor([[1., 1., 1.],
diff --git a/docs/04-Data.md b/docs/04-Data.md
index 8cd12e9..e133ffd 100644
--- a/docs/04-Data.md
+++ b/docs/04-Data.md
@@ -1,8 +1,3 @@
-```python
-%matplotlib inline
-```
-
-
 [Learn the Basics](intro.html) ||
 [Quickstart](quickstart_tutorial.html) ||
 [Tensors](tensorqs_tutorial.html) ||
@@ -49,6 +44,8 @@ We load the [FashionMNIST Dataset](https://pytorch.org/vision/stable/datasets.ht
 
 
 ```python
+%matplotlib inline
+
 import torch
 from torch.utils.data import Dataset
 from torchvision import datasets
@@ -106,7 +103,7 @@ plt.show()
 
 
     
-![png](../docs/04-Data_files/../docs/04-Data_6_0.png)
+![png](../docs/04-Data_files/../docs/04-Data_5_0.png)
     
 
 
@@ -268,11 +265,11 @@ print(f"Label: {label}")
 
 
     
-![png](../docs/04-Data_files/../docs/04-Data_21_1.png)
+![png](../docs/04-Data_files/../docs/04-Data_20_1.png)
     
 
 
-    Label: 1
+    Label: 3
 
 
 --------------
diff --git a/docs/06-BuildModel.md b/docs/06-BuildModel.md
index 92a7fca..6ed386e 100644
--- a/docs/06-BuildModel.md
+++ b/docs/06-BuildModel.md
@@ -115,7 +115,7 @@ y_pred = pred_probab.argmax(1)
 print(f"Predicted class: {y_pred}")
 ```
 
-    Predicted class: tensor([9])
+    Predicted class: tensor([1])
 
 
 --------------
@@ -191,30 +191,26 @@ hidden1 = nn.ReLU()(hidden1)
 print(f"After ReLU: {hidden1}")
 ```
 
-    Before ReLU: tensor([[-5.5712e-01,  4.1135e-01, -7.4510e-03, -5.4891e-02,  7.3538e-02,
-              4.6617e-01,  5.3287e-01,  7.2283e-02, -3.7471e-01, -3.9285e-01,
-             -6.7889e-01,  2.1088e-01,  1.8742e-01,  4.0150e-01, -5.6422e-02,
-             -4.8977e-02, -1.6230e-01,  3.0556e-01, -7.1455e-01, -6.6180e-02],
-            [-4.2601e-01,  6.2487e-01, -5.9415e-02,  2.3934e-02,  3.9810e-01,
-              3.2441e-01,  7.0026e-01, -1.2423e-01, -5.2260e-01, -1.7234e-01,
-             -5.5835e-01,  2.2128e-01,  2.7830e-01,  2.4191e-01, -7.7681e-02,
-             -2.4954e-01,  1.5836e-01,  1.9990e-01, -1.1715e-01, -3.2138e-01],
-            [-4.9225e-01,  4.1050e-01, -1.5492e-01,  8.9106e-03,  3.5985e-01,
-              3.1355e-01,  6.2615e-01, -1.9053e-04, -5.7080e-01, -1.7064e-01,
-             -6.5802e-01,  3.3700e-01,  4.5726e-01,  3.1022e-01, -4.0316e-01,
-             -3.8029e-01, -1.2243e-01,  3.6732e-01, -5.6789e-01, -9.4490e-02]],
-           grad_fn=<AddmmBackward0>)
+    Before ReLU: tensor([[-0.6535,  0.0475,  0.2762,  0.2739,  0.3857,  0.1837, -0.1904, -0.3036,
+             -0.0609, -0.2871,  0.0446,  0.2365, -0.2100,  0.3802,  0.1994, -0.4515,
+              0.1591,  0.1378,  0.1966, -0.0231],
+            [-0.7906,  0.0717,  0.3879,  0.0195,  0.2133,  0.4331,  0.1080, -0.3002,
+             -0.0044, -0.3400,  0.2174,  0.4808, -0.1150,  0.2409,  0.3484, -0.0483,
+              0.3890,  0.1460,  0.1570,  0.1086],
+            [-0.8346,  0.3771,  0.3634, -0.3699,  0.5272, -0.2396, -0.4630, -0.0269,
+             -0.0439, -0.4653,  0.1175,  0.4506, -0.1127,  0.1764,  0.1627,  0.0395,
+              0.4420,  0.1518,  0.0156,  0.0423]], grad_fn=<AddmmBackward0>)
     
     
-    After ReLU: tensor([[0.0000, 0.4113, 0.0000, 0.0000, 0.0735, 0.4662, 0.5329, 0.0723, 0.0000,
-             0.0000, 0.0000, 0.2109, 0.1874, 0.4015, 0.0000, 0.0000, 0.0000, 0.3056,
-             0.0000, 0.0000],
-            [0.0000, 0.6249, 0.0000, 0.0239, 0.3981, 0.3244, 0.7003, 0.0000, 0.0000,
-             0.0000, 0.0000, 0.2213, 0.2783, 0.2419, 0.0000, 0.0000, 0.1584, 0.1999,
-             0.0000, 0.0000],
-            [0.0000, 0.4105, 0.0000, 0.0089, 0.3599, 0.3136, 0.6262, 0.0000, 0.0000,
-             0.0000, 0.0000, 0.3370, 0.4573, 0.3102, 0.0000, 0.0000, 0.0000, 0.3673,
-             0.0000, 0.0000]], grad_fn=<ReluBackward0>)
+    After ReLU: tensor([[0.0000, 0.0475, 0.2762, 0.2739, 0.3857, 0.1837, 0.0000, 0.0000, 0.0000,
+             0.0000, 0.0446, 0.2365, 0.0000, 0.3802, 0.1994, 0.0000, 0.1591, 0.1378,
+             0.1966, 0.0000],
+            [0.0000, 0.0717, 0.3879, 0.0195, 0.2133, 0.4331, 0.1080, 0.0000, 0.0000,
+             0.0000, 0.2174, 0.4808, 0.0000, 0.2409, 0.3484, 0.0000, 0.3890, 0.1460,
+             0.1570, 0.1086],
+            [0.0000, 0.3771, 0.3634, 0.0000, 0.5272, 0.0000, 0.0000, 0.0000, 0.0000,
+             0.0000, 0.1175, 0.4506, 0.0000, 0.1764, 0.1627, 0.0395, 0.4420, 0.1518,
+             0.0156, 0.0423]], grad_fn=<ReluBackward0>)
 
 
 ### nn.Sequential
@@ -281,23 +277,23 @@ for name, param in model.named_parameters():
     )
     
     
-    Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0211,  0.0168,  0.0334,  ..., -0.0151, -0.0033,  0.0032],
-            [-0.0022,  0.0293, -0.0090,  ..., -0.0044, -0.0147, -0.0251]],
+    Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0007,  0.0351,  0.0290,  ...,  0.0157, -0.0041, -0.0052],
+            [ 0.0163, -0.0053,  0.0237,  ..., -0.0294,  0.0200,  0.0072]],
            grad_fn=<SliceBackward0>) 
     
-    Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([0.0128, 0.0086], grad_fn=<SliceBackward0>) 
+    Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([-0.0143, -0.0101], grad_fn=<SliceBackward0>) 
     
-    Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0165, -0.0068, -0.0016,  ..., -0.0098,  0.0119,  0.0326],
-            [ 0.0330, -0.0306, -0.0129,  ..., -0.0371, -0.0291, -0.0273]],
+    Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[-0.0091,  0.0016,  0.0303,  ...,  0.0147,  0.0108,  0.0114],
+            [-0.0018,  0.0363, -0.0248,  ..., -0.0332,  0.0185,  0.0011]],
            grad_fn=<SliceBackward0>) 
     
-    Layer: linear_relu_stack.2.bias | Size: torch.Size([512]) | Values : tensor([ 0.0024, -0.0164], grad_fn=<SliceBackward0>) 
+    Layer: linear_relu_stack.2.bias | Size: torch.Size([512]) | Values : tensor([0.0409, 0.0064], grad_fn=<SliceBackward0>) 
     
-    Layer: linear_relu_stack.4.weight | Size: torch.Size([10, 512]) | Values : tensor([[ 0.0046,  0.0249,  0.0123,  ...,  0.0352, -0.0170,  0.0232],
-            [ 0.0038,  0.0283,  0.0235,  ..., -0.0416,  0.0304,  0.0217]],
+    Layer: linear_relu_stack.4.weight | Size: torch.Size([10, 512]) | Values : tensor([[ 0.0349, -0.0004,  0.0420,  ..., -0.0023,  0.0277,  0.0173],
+            [ 0.0015, -0.0185,  0.0072,  ..., -0.0159, -0.0068,  0.0271]],
            grad_fn=<SliceBackward0>) 
     
-    Layer: linear_relu_stack.4.bias | Size: torch.Size([10]) | Values : tensor([0.0118, 0.0417], grad_fn=<SliceBackward0>) 
+    Layer: linear_relu_stack.4.bias | Size: torch.Size([10]) | Values : tensor([-0.0129,  0.0260], grad_fn=<SliceBackward0>) 
     
 
 
diff --git a/docs/07-Autograd.md b/docs/07-Autograd.md
index f8e1eee..5580131 100644
--- a/docs/07-Autograd.md
+++ b/docs/07-Autograd.md
@@ -1,8 +1,3 @@
-```python
-%matplotlib inline
-```
-
-
 [Learn the Basics](intro.html) ||
 [Quickstart](quickstart_tutorial.html) ||
 [Tensors](tensorqs_tutorial.html) ||
@@ -31,6 +26,8 @@ PyTorch in the following manner:
 
 
 ```python
+%matplotlib inline
+
 import torch
 
 x = torch.ones(5)  # input tensor
@@ -77,8 +74,8 @@ print(f"Gradient function for z = {z.grad_fn}")
 print(f"Gradient function for loss = {loss.grad_fn}")
 ```
 
-    Gradient function for z = <AddBackward0 object at 0x10fa1ee80>
-    Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x10fa1e430>
+    Gradient function for z = <AddBackward0 object at 0x10427e550>
+    Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x10427e670>
 
 
 ## Computing Gradients
@@ -101,12 +98,12 @@ print(w.grad)
 print(b.grad)
 ```
 
-    tensor([[0.3244, 0.2353, 0.0700],
-            [0.3244, 0.2353, 0.0700],
-            [0.3244, 0.2353, 0.0700],
-            [0.3244, 0.2353, 0.0700],
-            [0.3244, 0.2353, 0.0700]])
-    tensor([0.3244, 0.2353, 0.0700])
+    tensor([[0.1692, 0.2790, 0.2088],
+            [0.1692, 0.2790, 0.2088],
+            [0.1692, 0.2790, 0.2088],
+            [0.1692, 0.2790, 0.2088],
+            [0.1692, 0.2790, 0.2088]])
+    tensor([0.1692, 0.2790, 0.2088])
 
 
 <div class="alert alert-info"><h4>Note</h4><p>- We can only obtain the ``grad`` properties for the leaf
diff --git a/docs/08-Optimization.md b/docs/08-Optimization.md
index aeb9aa0..3367728 100644
--- a/docs/08-Optimization.md
+++ b/docs/08-Optimization.md
@@ -1,8 +1,3 @@
-```python
-%matplotlib inline
-```
-
-
 [Learn the Basics](intro.html) ||
 [Quickstart](quickstart_tutorial.html) ||
 [Tensors](tensorqs_tutorial.html) ||
@@ -28,6 +23,8 @@ and [Build Model](buildmodel_tutorial.html).
 
 
 ```python
+%matplotlib inline
+
 import torch
 from torch import nn
 from torch.utils.data import DataLoader
@@ -209,153 +206,153 @@ print("Done!")
 
     Epoch 1
     -------------------------------
-    loss: 2.310308  [   64/60000]
-    loss: 2.291682  [ 6464/60000]
-    loss: 2.282847  [12864/60000]
-    loss: 2.278148  [19264/60000]
-    loss: 2.259573  [25664/60000]
-    loss: 2.246842  [32064/60000]
-    loss: 2.237948  [38464/60000]
-    loss: 2.221490  [44864/60000]
-    loss: 2.215676  [51264/60000]
-    loss: 2.186174  [57664/60000]
+    loss: 2.299604  [   64/60000]
+    loss: 2.281797  [ 6464/60000]
+    loss: 2.269583  [12864/60000]
+    loss: 2.255457  [19264/60000]
+    loss: 2.240205  [25664/60000]
+    loss: 2.213762  [32064/60000]
+    loss: 2.215705  [38464/60000]
+    loss: 2.184422  [44864/60000]
+    loss: 2.175044  [51264/60000]
+    loss: 2.137501  [57664/60000]
     Test Error: 
-     Accuracy: 50.1%, Avg loss: 2.185173 
+     Accuracy: 53.1%, Avg loss: 2.138075 
     
     Epoch 2
     -------------------------------
-    loss: 2.192464  [   64/60000]
-    loss: 2.176265  [ 6464/60000]
-    loss: 2.138019  [12864/60000]
-    loss: 2.155484  [19264/60000]
-    loss: 2.096774  [25664/60000]
-    loss: 2.064352  [32064/60000]
-    loss: 2.073422  [38464/60000]
-    loss: 2.019561  [44864/60000]
-    loss: 2.018754  [51264/60000]
-    loss: 1.944076  [57664/60000]
+    loss: 2.153558  [   64/60000]
+    loss: 2.139259  [ 6464/60000]
+    loss: 2.081727  [12864/60000]
+    loss: 2.085114  [19264/60000]
+    loss: 2.046907  [25664/60000]
+    loss: 1.977491  [32064/60000]
+    loss: 2.007782  [38464/60000]
+    loss: 1.928677  [44864/60000]
+    loss: 1.934681  [51264/60000]
+    loss: 1.844566  [57664/60000]
     Test Error: 
-     Accuracy: 56.9%, Avg loss: 1.951974 
+     Accuracy: 59.0%, Avg loss: 1.855136 
     
     Epoch 3
     -------------------------------
-    loss: 1.979550  [   64/60000]
-    loss: 1.944613  [ 6464/60000]
-    loss: 1.850896  [12864/60000]
-    loss: 1.885921  [19264/60000]
-    loss: 1.766024  [25664/60000]
-    loss: 1.721881  [32064/60000]
-    loss: 1.732149  [38464/60000]
-    loss: 1.646069  [44864/60000]
-    loss: 1.663508  [51264/60000]
-    loss: 1.542335  [57664/60000]
+    loss: 1.898872  [   64/60000]
+    loss: 1.859855  [ 6464/60000]
+    loss: 1.745800  [12864/60000]
+    loss: 1.771856  [19264/60000]
+    loss: 1.671929  [25664/60000]
+    loss: 1.624660  [32064/60000]
+    loss: 1.646571  [38464/60000]
+    loss: 1.553838  [44864/60000]
+    loss: 1.585847  [51264/60000]
+    loss: 1.463247  [57664/60000]
     Test Error: 
-     Accuracy: 60.8%, Avg loss: 1.575167 
+     Accuracy: 62.0%, Avg loss: 1.491152 
     
     Epoch 4
     -------------------------------
-    loss: 1.641383  [   64/60000]
-    loss: 1.597785  [ 6464/60000]
-    loss: 1.460881  [12864/60000]
-    loss: 1.522893  [19264/60000]
-    loss: 1.394849  [25664/60000]
-    loss: 1.381750  [32064/60000]
-    loss: 1.389999  [38464/60000]
-    loss: 1.324359  [44864/60000]
-    loss: 1.359623  [51264/60000]
-    loss: 1.242349  [57664/60000]
+    loss: 1.570554  [   64/60000]
+    loss: 1.524995  [ 6464/60000]
+    loss: 1.381242  [12864/60000]
+    loss: 1.440385  [19264/60000]
+    loss: 1.325888  [25664/60000]
+    loss: 1.331313  [32064/60000]
+    loss: 1.343411  [38464/60000]
+    loss: 1.273921  [44864/60000]
+    loss: 1.314914  [51264/60000]
+    loss: 1.204072  [57664/60000]
     Test Error: 
-     Accuracy: 63.2%, Avg loss: 1.281596 
+     Accuracy: 63.9%, Avg loss: 1.234092 
     
     Epoch 5
     -------------------------------
-    loss: 1.364956  [   64/60000]
-    loss: 1.337699  [ 6464/60000]
-    loss: 1.179997  [12864/60000]
-    loss: 1.276043  [19264/60000]
-    loss: 1.145318  [25664/60000]
-    loss: 1.163051  [32064/60000]
-    loss: 1.179221  [38464/60000]
-    loss: 1.127842  [44864/60000]
-    loss: 1.170320  [51264/60000]
-    loss: 1.072596  [57664/60000]
+    loss: 1.318503  [   64/60000]
+    loss: 1.292388  [ 6464/60000]
+    loss: 1.131896  [12864/60000]
+    loss: 1.229624  [19264/60000]
+    loss: 1.102847  [25664/60000]
+    loss: 1.138407  [32064/60000]
+    loss: 1.157674  [38464/60000]
+    loss: 1.099932  [44864/60000]
+    loss: 1.145054  [51264/60000]
+    loss: 1.048841  [57664/60000]
     Test Error: 
-     Accuracy: 64.8%, Avg loss: 1.102368 
+     Accuracy: 65.2%, Avg loss: 1.074347 
     
     Epoch 6
     -------------------------------
-    loss: 1.181124  [   64/60000]
-    loss: 1.175671  [ 6464/60000]
-    loss: 0.999543  [12864/60000]
-    loss: 1.125861  [19264/60000]
-    loss: 0.994338  [25664/60000]
-    loss: 1.020635  [32064/60000]
-    loss: 1.052101  [38464/60000]
-    loss: 1.005876  [44864/60000]
-    loss: 1.050259  [51264/60000]
-    loss: 0.969423  [57664/60000]
+    loss: 1.147973  [   64/60000]
+    loss: 1.144627  [ 6464/60000]
+    loss: 0.967731  [12864/60000]
+    loss: 1.098405  [19264/60000]
+    loss: 0.965783  [25664/60000]
+    loss: 1.007831  [32064/60000]
+    loss: 1.040992  [38464/60000]
+    loss: 0.989532  [44864/60000]
+    loss: 1.033878  [51264/60000]
+    loss: 0.949742  [57664/60000]
     Test Error: 
-     Accuracy: 65.8%, Avg loss: 0.989962 
+     Accuracy: 66.5%, Avg loss: 0.970729 
     
     Epoch 7
     -------------------------------
-    loss: 1.055653  [   64/60000]
-    loss: 1.073796  [ 6464/60000]
-    loss: 0.878792  [12864/60000]
-    loss: 1.027988  [19264/60000]
-    loss: 0.902191  [25664/60000]
-    loss: 0.923560  [32064/60000]
-    loss: 0.970771  [38464/60000]
-    loss: 0.927402  [44864/60000]
-    loss: 0.969056  [51264/60000]
-    loss: 0.901827  [57664/60000]
+    loss: 1.027588  [   64/60000]
+    loss: 1.047764  [ 6464/60000]
+    loss: 0.855220  [12864/60000]
+    loss: 1.011105  [19264/60000]
+    loss: 0.879051  [25664/60000]
+    loss: 0.915307  [32064/60000]
+    loss: 0.963445  [38464/60000]
+    loss: 0.917342  [44864/60000]
+    loss: 0.956093  [51264/60000]
+    loss: 0.882487  [57664/60000]
     Test Error: 
-     Accuracy: 66.8%, Avg loss: 0.914991 
+     Accuracy: 67.8%, Avg loss: 0.899503 
     
     Epoch 8
     -------------------------------
-    loss: 0.964512  [   64/60000]
-    loss: 1.004631  [ 6464/60000]
-    loss: 0.793878  [12864/60000]
-    loss: 0.959500  [19264/60000]
-    loss: 0.842306  [25664/60000]
-    loss: 0.854395  [32064/60000]
-    loss: 0.914801  [38464/60000]
-    loss: 0.875149  [44864/60000]
-    loss: 0.910963  [51264/60000]
-    loss: 0.853945  [57664/60000]
+    loss: 0.938215  [   64/60000]
+    loss: 0.979911  [ 6464/60000]
+    loss: 0.774328  [12864/60000]
+    loss: 0.949241  [19264/60000]
+    loss: 0.821273  [25664/60000]
+    loss: 0.847455  [32064/60000]
+    loss: 0.908044  [38464/60000]
+    loss: 0.868443  [44864/60000]
+    loss: 0.899046  [51264/60000]
+    loss: 0.833970  [57664/60000]
     Test Error: 
-     Accuracy: 67.8%, Avg loss: 0.861828 
+     Accuracy: 69.0%, Avg loss: 0.847812 
     
     Epoch 9
     -------------------------------
-    loss: 0.895530  [   64/60000]
-    loss: 0.953656  [ 6464/60000]
-    loss: 0.731293  [12864/60000]
-    loss: 0.908750  [19264/60000]
-    loss: 0.800252  [25664/60000]
-    loss: 0.803487  [32064/60000]
-    loss: 0.873069  [38464/60000]
-    loss: 0.838708  [44864/60000]
-    loss: 0.867891  [51264/60000]
-    loss: 0.817475  [57664/60000]
+    loss: 0.869330  [   64/60000]
+    loss: 0.928490  [ 6464/60000]
+    loss: 0.713798  [12864/60000]
+    loss: 0.903194  [19264/60000]
+    loss: 0.780315  [25664/60000]
+    loss: 0.796318  [32064/60000]
+    loss: 0.865808  [38464/60000]
+    loss: 0.834232  [44864/60000]
+    loss: 0.856279  [51264/60000]
+    loss: 0.797004  [57664/60000]
     Test Error: 
-     Accuracy: 68.9%, Avg loss: 0.821918 
+     Accuracy: 70.2%, Avg loss: 0.808560 
     
     Epoch 10
     -------------------------------
-    loss: 0.841097  [   64/60000]
-    loss: 0.913210  [ 6464/60000]
-    loss: 0.683007  [12864/60000]
-    loss: 0.869649  [19264/60000]
-    loss: 0.768555  [25664/60000]
-    loss: 0.764901  [32064/60000]
-    loss: 0.839639  [38464/60000]
-    loss: 0.811697  [44864/60000]
-    loss: 0.834432  [51264/60000]
-    loss: 0.788075  [57664/60000]
+    loss: 0.814243  [   64/60000]
+    loss: 0.887317  [ 6464/60000]
+    loss: 0.666916  [12864/60000]
+    loss: 0.867729  [19264/60000]
+    loss: 0.749378  [25664/60000]
+    loss: 0.757221  [32064/60000]
+    loss: 0.831676  [38464/60000]
+    loss: 0.808831  [44864/60000]
+    loss: 0.822820  [51264/60000]
+    loss: 0.767592  [57664/60000]
     Test Error: 
-     Accuracy: 70.1%, Avg loss: 0.790321 
+     Accuracy: 71.5%, Avg loss: 0.777352 
     
     Done!
 
diff --git a/docs/09-SaveLoad.md b/docs/09-SaveLoad.md
index d07b56e..7502494 100644
--- a/docs/09-SaveLoad.md
+++ b/docs/09-SaveLoad.md
@@ -1,8 +1,3 @@
-```python
-%matplotlib inline
-```
-
-
 [Learn the Basics](intro.html) ||
 [Quickstart](quickstart_tutorial.html) ||
 [Tensors](tensorqs_tutorial.html) ||
@@ -20,6 +15,8 @@ In this section we will look at how to persist model state with saving, loading
 
 
 ```python
+%matplotlib inline
+
 import torch
 import torchvision.models as models
 ```
diff --git a/docs/docs/04-Data_20_1.png b/docs/docs/04-Data_20_1.png
new file mode 100644
index 0000000..5b12ec7
Binary files /dev/null and b/docs/docs/04-Data_20_1.png differ
diff --git a/docs/docs/04-Data_5_0.png b/docs/docs/04-Data_5_0.png
new file mode 100644
index 0000000..dcfa506
Binary files /dev/null and b/docs/docs/04-Data_5_0.png differ
diff --git a/tutorials/04-Data.ipynb b/tutorials/04-Data.ipynb
index 5ec7a02..117b646 100644
--- a/tutorials/04-Data.ipynb
+++ b/tutorials/04-Data.ipynb
@@ -1,35 +1,60 @@
 {
   "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n[Learn the Basics](intro.html) ||\n[Quickstart](quickstart_tutorial.html) ||\n[Tensors](tensorqs_tutorial.html) ||\n**Datasets & DataLoaders** ||\n[Transforms](transforms_tutorial.html) ||\n[Build Model](buildmodel_tutorial.html) ||\n[Autograd](autogradqs_tutorial.html) ||\n[Optimization](optimization_tutorial.html) ||\n[Save & Load Model](saveloadrun_tutorial.html)\n\n# Datasets & DataLoaders\n"
+        "\n",
+        "[Learn the Basics](intro.html) ||\n",
+        "[Quickstart](quickstart_tutorial.html) ||\n",
+        "[Tensors](tensorqs_tutorial.html) ||\n",
+        "**Datasets & DataLoaders** ||\n",
+        "[Transforms](transforms_tutorial.html) ||\n",
+        "[Build Model](buildmodel_tutorial.html) ||\n",
+        "[Autograd](autogradqs_tutorial.html) ||\n",
+        "[Optimization](optimization_tutorial.html) ||\n",
+        "[Save & Load Model](saveloadrun_tutorial.html)\n",
+        "\n",
+        "# Datasets & DataLoaders\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Code for processing data samples can get messy and hard to maintain; we ideally want our dataset code\nto be decoupled from our model training code for better readability and modularity.\nPyTorch provides two data primitives: ``torch.utils.data.DataLoader`` and ``torch.utils.data.Dataset``\nthat allow you to use pre-loaded datasets as well as your own data.\n``Dataset`` stores the samples and their corresponding labels, and ``DataLoader`` wraps an iterable around\nthe ``Dataset`` to enable easy access to the samples.\n\nPyTorch domain libraries provide a number of pre-loaded datasets (such as FashionMNIST) that\nsubclass ``torch.utils.data.Dataset`` and implement functions specific to the particular data.\nThey can be used to prototype and benchmark your model. You can find them\nhere: [Image Datasets](https://pytorch.org/vision/stable/datasets.html),\n[Text Datasets](https://pytorch.org/text/stable/datasets.html), and\n[Audio Datasets](https://pytorch.org/audio/stable/datasets.html)\n\n\n"
+        "Code for processing data samples can get messy and hard to maintain; we ideally want our dataset code\n",
+        "to be decoupled from our model training code for better readability and modularity.\n",
+        "PyTorch provides two data primitives: ``torch.utils.data.DataLoader`` and ``torch.utils.data.Dataset``\n",
+        "that allow you to use pre-loaded datasets as well as your own data.\n",
+        "``Dataset`` stores the samples and their corresponding labels, and ``DataLoader`` wraps an iterable around\n",
+        "the ``Dataset`` to enable easy access to the samples.\n",
+        "\n",
+        "PyTorch domain libraries provide a number of pre-loaded datasets (such as FashionMNIST) that\n",
+        "subclass ``torch.utils.data.Dataset`` and implement functions specific to the particular data.\n",
+        "They can be used to prototype and benchmark your model. You can find them\n",
+        "here: [Image Datasets](https://pytorch.org/vision/stable/datasets.html),\n",
+        "[Text Datasets](https://pytorch.org/text/stable/datasets.html), and\n",
+        "[Audio Datasets](https://pytorch.org/audio/stable/datasets.html).\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Loading a Dataset\n\nHere is an example of how to load the [Fashion-MNIST](https://research.zalando.com/project/fashion_mnist/fashion_mnist/) dataset from TorchVision.\nFashion-MNIST is a dataset of Zalando\u2019s article images consisting of 60,000 training examples and 10,000 test examples.\nEach example comprises a 28\u00d728 grayscale image and an associated label from one of 10 classes.\n\nWe load the [FashionMNIST Dataset](https://pytorch.org/vision/stable/datasets.html#fashion-mnist) with the following parameters:\n - ``root`` is the path where the train/test data is stored,\n - ``train`` specifies training or test dataset,\n - ``download=True`` downloads the data from the internet if it's not available at ``root``.\n - ``transform`` and ``target_transform`` specify the feature and label transformations\n\n"
+        "## Loading a Dataset\n",
+        "\n",
+        "Here is an example of how to load the [Fashion-MNIST](https://research.zalando.com/project/fashion_mnist/fashion_mnist/) dataset from TorchVision.\n",
+        "Fashion-MNIST is a dataset of Zalando’s article images consisting of 60,000 training examples and 10,000 test examples.\n",
+        "Each example comprises a 28×28 grayscale image and an associated label from one of 10 classes.\n",
+        "\n",
+        "We load the [FashionMNIST Dataset](https://pytorch.org/vision/stable/datasets.html#fashion-mnist) with the following parameters:\n",
+        " - ``root`` is the path where the train/test data is stored,\n",
+        " - ``train`` specifies training or test dataset,\n",
+        " - ``download=True`` downloads the data from the internet if it's not available at ``root``,\n",
+        " - ``transform`` and ``target_transform`` specify the feature and label transformations.\n",
+        "\n"
       ]
     },
     {
@@ -40,14 +65,39 @@
       },
       "outputs": [],
       "source": [
-        "import torch\nfrom torch.utils.data import Dataset\nfrom torchvision import datasets\nfrom torchvision.transforms import ToTensor\nimport matplotlib.pyplot as plt\n\n\ntraining_data = datasets.FashionMNIST(\n    root=\"data\",\n    train=True,\n    download=True,\n    transform=ToTensor()\n)\n\ntest_data = datasets.FashionMNIST(\n    root=\"data\",\n    train=False,\n    download=True,\n    transform=ToTensor()\n)"
+        "%matplotlib inline\n",
+        "\n",
+        "import torch\n",
+        "from torch.utils.data import Dataset\n",
+        "from torchvision import datasets\n",
+        "from torchvision.transforms import ToTensor\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "\n",
+        "training_data = datasets.FashionMNIST(\n",
+        "    root=\"data\",\n",
+        "    train=True,\n",
+        "    download=True,\n",
+        "    transform=ToTensor()\n",
+        ")\n",
+        "\n",
+        "test_data = datasets.FashionMNIST(\n",
+        "    root=\"data\",\n",
+        "    train=False,\n",
+        "    download=True,\n",
+        "    transform=ToTensor()\n",
+        ")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Iterating and Visualizing the Dataset\n\nWe can index ``Datasets`` manually like a list: ``training_data[index]``.\nWe use ``matplotlib`` to visualize some samples in our training data.\n\n"
+        "## Iterating and Visualizing the Dataset\n",
+        "\n",
+        "We can index ``Datasets`` manually like a list: ``training_data[index]``.\n",
+        "We use ``matplotlib`` to visualize some samples in our training data.\n",
+        "\n"
       ]
     },
     {
@@ -58,28 +108,61 @@
       },
       "outputs": [],
       "source": [
-        "labels_map = {\n    0: \"T-Shirt\",\n    1: \"Trouser\",\n    2: \"Pullover\",\n    3: \"Dress\",\n    4: \"Coat\",\n    5: \"Sandal\",\n    6: \"Shirt\",\n    7: \"Sneaker\",\n    8: \"Bag\",\n    9: \"Ankle Boot\",\n}\nfigure = plt.figure(figsize=(8, 8))\ncols, rows = 3, 3\nfor i in range(1, cols * rows + 1):\n    sample_idx = torch.randint(len(training_data), size=(1,)).item()\n    img, label = training_data[sample_idx]\n    figure.add_subplot(rows, cols, i)\n    plt.title(labels_map[label])\n    plt.axis(\"off\")\n    plt.imshow(img.squeeze(), cmap=\"gray\")\nplt.show()"
+        "labels_map = {\n",
+        "    0: \"T-Shirt\",\n",
+        "    1: \"Trouser\",\n",
+        "    2: \"Pullover\",\n",
+        "    3: \"Dress\",\n",
+        "    4: \"Coat\",\n",
+        "    5: \"Sandal\",\n",
+        "    6: \"Shirt\",\n",
+        "    7: \"Sneaker\",\n",
+        "    8: \"Bag\",\n",
+        "    9: \"Ankle Boot\",\n",
+        "}\n",
+        "figure = plt.figure(figsize=(8, 8))\n",
+        "cols, rows = 3, 3\n",
+        "for i in range(1, cols * rows + 1):\n",
+        "    sample_idx = torch.randint(len(training_data), size=(1,)).item()\n",
+        "    img, label = training_data[sample_idx]\n",
+        "    figure.add_subplot(rows, cols, i)\n",
+        "    plt.title(labels_map[label])\n",
+        "    plt.axis(\"off\")\n",
+        "    plt.imshow(img.squeeze(), cmap=\"gray\")\n",
+        "plt.show()"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "..\n .. figure:: /_static/img/basics/fashion_mnist.png\n   :alt: fashion_mnist\n\n"
+        "..\n",
+        " .. figure:: /_static/img/basics/fashion_mnist.png\n",
+        "   :alt: fashion_mnist\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "--------------\n\n\n"
+        "--------------\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Creating a Custom Dataset for your files\n\nA custom Dataset class must implement three functions: `__init__`, `__len__`, and `__getitem__`.\nTake a look at this implementation; the FashionMNIST images are stored\nin a directory ``img_dir``, and their labels are stored separately in a CSV file ``annotations_file``.\n\nIn the next sections, we'll break down what's happening in each of these functions.\n\n"
+        "## Creating a Custom Dataset for your files\n",
+        "\n",
+        "A custom Dataset class must implement three functions: `__init__`, `__len__`, and `__getitem__`.\n",
+        "Take a look at this implementation; the FashionMNIST images are stored\n",
+        "in a directory ``img_dir``, and their labels are stored separately in a CSV file ``annotations_file``.\n",
+        "\n",
+        "In the next sections, we'll break down what's happening in each of these functions.\n",
+        "\n"
       ]
     },
     {
@@ -90,14 +173,48 @@
       },
       "outputs": [],
       "source": [
-        "import os\nimport pandas as pd\nfrom torchvision.io import read_image\n\nclass CustomImageDataset(Dataset):\n    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):\n        self.img_labels = pd.read_csv(annotations_file)\n        self.img_dir = img_dir\n        self.transform = transform\n        self.target_transform = target_transform\n\n    def __len__(self):\n        return len(self.img_labels)\n\n    def __getitem__(self, idx):\n        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])\n        image = read_image(img_path)\n        label = self.img_labels.iloc[idx, 1]\n        if self.transform:\n            image = self.transform(image)\n        if self.target_transform:\n            label = self.target_transform(label)\n        return image, label"
+        "import os\n",
+        "import pandas as pd\n",
+        "from torchvision.io import read_image\n",
+        "\n",
+        "class CustomImageDataset(Dataset):\n",
+        "    def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):\n",
+        "        self.img_labels = pd.read_csv(annotations_file)\n",
+        "        self.img_dir = img_dir\n",
+        "        self.transform = transform\n",
+        "        self.target_transform = target_transform\n",
+        "\n",
+        "    def __len__(self):\n",
+        "        return len(self.img_labels)\n",
+        "\n",
+        "    def __getitem__(self, idx):\n",
+        "        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])\n",
+        "        image = read_image(img_path)\n",
+        "        label = self.img_labels.iloc[idx, 1]\n",
+        "        if self.transform:\n",
+        "            image = self.transform(image)\n",
+        "        if self.target_transform:\n",
+        "            label = self.target_transform(label)\n",
+        "        return image, label"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "### __init__\n\nThe __init__ function is run once when instantiating the Dataset object. We initialize\nthe directory containing the images, the annotations file, and both transforms (covered\nin more detail in the next section).\n\nThe labels.csv file looks like: ::\n\n    tshirt1.jpg, 0\n    tshirt2.jpg, 0\n    ......\n    ankleboot999.jpg, 9\n\n"
+        "### `__init__`\n",
+        "\n",
+        "The `__init__` function is run once when instantiating the Dataset object. We initialize\n",
+        "the directory containing the images, the annotations file, and both transforms (covered\n",
+        "in more detail in the next section).\n",
+        "\n",
+        "The labels.csv file looks like:\n",
+        "\n",
+        "    tshirt1.jpg, 0\n",
+        "    tshirt2.jpg, 0\n",
+        "    ......\n",
+        "    ankleboot999.jpg, 9\n",
+        "\n"
       ]
     },
     {
@@ -108,14 +225,23 @@
       },
       "outputs": [],
       "source": [
-        "def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):\n    self.img_labels = pd.read_csv(annotations_file)\n    self.img_dir = img_dir\n    self.transform = transform\n    self.target_transform = target_transform"
+        "def __init__(self, annotations_file, img_dir, transform=None, target_transform=None):\n",
+        "    self.img_labels = pd.read_csv(annotations_file)\n",
+        "    self.img_dir = img_dir\n",
+        "    self.transform = transform\n",
+        "    self.target_transform = target_transform"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "### __len__\n\nThe __len__ function returns the number of samples in our dataset.\n\nExample:\n\n"
+        "### `__len__`\n",
+        "\n",
+        "The `__len__` function returns the number of samples in our dataset.\n",
+        "\n",
+        "Example:\n",
+        "\n"
       ]
     },
     {
@@ -126,14 +252,21 @@
       },
       "outputs": [],
       "source": [
-        "def __len__(self):\n    return len(self.img_labels)"
+        "def __len__(self):\n",
+        "    return len(self.img_labels)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "### __getitem__\n\nThe __getitem__ function loads and returns a sample from the dataset at the given index ``idx``.\nBased on the index, it identifies the image's location on disk, converts that to a tensor using ``read_image``, retrieves the\ncorresponding label from the csv data in ``self.img_labels``, calls the transform functions on them (if applicable), and returns the\ntensor image and corresponding label in a tuple.\n\n"
+        "### `__getitem__`\n",
+        "\n",
+        "The `__getitem__` function loads and returns a sample from the dataset at the given index ``idx``.\n",
+        "Based on the index, it identifies the image's location on disk, converts that to a tensor using ``read_image``, retrieves the\n",
+        "corresponding label from the csv data in ``self.img_labels``, calls the transform functions on them (if applicable), and returns the\n",
+        "tensor image and corresponding label in a tuple.\n",
+        "\n"
       ]
     },
     {
@@ -144,21 +277,37 @@
       },
       "outputs": [],
       "source": [
-        "def __getitem__(self, idx):\n    img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])\n    image = read_image(img_path)\n    label = self.img_labels.iloc[idx, 1]\n    if self.transform:\n        image = self.transform(image)\n    if self.target_transform:\n        label = self.target_transform(label)\n    return image, label"
+        "def __getitem__(self, idx):\n",
+        "    img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])\n",
+        "    image = read_image(img_path)\n",
+        "    label = self.img_labels.iloc[idx, 1]\n",
+        "    if self.transform:\n",
+        "        image = self.transform(image)\n",
+        "    if self.target_transform:\n",
+        "        label = self.target_transform(label)\n",
+        "    return image, label"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "--------------\n\n\n"
+        "--------------\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Preparing your data for training with DataLoaders\nThe ``Dataset`` retrieves our dataset's features and labels one sample at a time. While training a model, we typically want to\npass samples in \"minibatches\", reshuffle the data at every epoch to reduce model overfitting, and use Python's ``multiprocessing`` to\nspeed up data retrieval.\n\n``DataLoader`` is an iterable that abstracts this complexity for us in an easy API.\n\n"
+        "## Preparing your data for training with DataLoaders\n",
+        "The ``Dataset`` retrieves our dataset's features and labels one sample at a time. While training a model, we typically want to\n",
+        "pass samples in \"minibatches\", reshuffle the data at every epoch to reduce model overfitting, and use Python's ``multiprocessing`` to\n",
+        "speed up data retrieval.\n",
+        "\n",
+        "``DataLoader`` is an iterable that abstracts this complexity for us in an easy API.\n",
+        "\n"
       ]
     },
     {
@@ -169,14 +318,23 @@
       },
       "outputs": [],
       "source": [
-        "from torch.utils.data import DataLoader\n\ntrain_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)\ntest_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)"
+        "from torch.utils.data import DataLoader\n",
+        "\n",
+        "train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)\n",
+        "test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Iterate through the DataLoader\n\nWe have loaded that dataset into the ``DataLoader`` and can iterate through the dataset as needed.\nEach iteration below returns a batch of ``train_features`` and ``train_labels`` (containing ``batch_size=64`` features and labels respectively).\nBecause we specified ``shuffle=True``, after we iterate over all batches the data is shuffled (for finer-grained control over\nthe data loading order, take a look at [Samplers](https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler)).\n\n"
+        "## Iterate through the DataLoader\n",
+        "\n",
+        "We have loaded that dataset into the ``DataLoader`` and can iterate through the dataset as needed.\n",
+        "Each iteration below returns a batch of ``train_features`` and ``train_labels`` (containing ``batch_size=64`` features and labels respectively).\n",
+        "Because we specified ``shuffle=True``, after we iterate over all batches the data is shuffled (for finer-grained control over\n",
+        "the data loading order, take a look at [Samplers](https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler)).\n",
+        "\n"
       ]
     },
     {
@@ -187,21 +345,33 @@
       },
       "outputs": [],
       "source": [
-        "# Display image and label.\ntrain_features, train_labels = next(iter(train_dataloader))\nprint(f\"Feature batch shape: {train_features.size()}\")\nprint(f\"Labels batch shape: {train_labels.size()}\")\nimg = train_features[0].squeeze()\nlabel = train_labels[0]\nplt.imshow(img, cmap=\"gray\")\nplt.show()\nprint(f\"Label: {label}\")"
+        "# Display image and label.\n",
+        "train_features, train_labels = next(iter(train_dataloader))\n",
+        "print(f\"Feature batch shape: {train_features.size()}\")\n",
+        "print(f\"Labels batch shape: {train_labels.size()}\")\n",
+        "img = train_features[0].squeeze()\n",
+        "label = train_labels[0]\n",
+        "plt.imshow(img, cmap=\"gray\")\n",
+        "plt.show()\n",
+        "print(f\"Label: {label}\")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "--------------\n\n\n"
+        "--------------\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Further Reading\n- [torch.utils.data API](https://pytorch.org/docs/stable/data.html)\n\n"
+        "## Further Reading\n",
+        "- [torch.utils.data API](https://pytorch.org/docs/stable/data.html)\n",
+        "\n"
       ]
     }
   ],
@@ -226,4 +396,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/07-Autograd.ipynb b/tutorials/07-Autograd.ipynb
index 9f73676..f2248eb 100644
--- a/tutorials/07-Autograd.ipynb
+++ b/tutorials/07-Autograd.ipynb
@@ -1,21 +1,34 @@
 {
   "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n[Learn the Basics](intro.html) ||\n[Quickstart](quickstart_tutorial.html) ||\n[Tensors](tensorqs_tutorial.html) ||\n[Datasets & DataLoaders](data_tutorial.html) ||\n[Transforms](transforms_tutorial.html) ||\n[Build Model](buildmodel_tutorial.html) ||\n**Autograd** ||\n[Optimization](optimization_tutorial.html) ||\n[Save & Load Model](saveloadrun_tutorial.html)\n\n# Automatic Differentiation with ``torch.autograd``\n\nWhen training neural networks, the most frequently used algorithm is\n**back propagation**. In this algorithm, parameters (model weights) are\nadjusted according to the **gradient** of the loss function with respect\nto the given parameter.\n\nTo compute those gradients, PyTorch has a built-in differentiation engine\ncalled ``torch.autograd``. It supports automatic computation of gradient for any\ncomputational graph.\n\nConsider the simplest one-layer neural network, with input ``x``,\nparameters ``w`` and ``b``, and some loss function. It can be defined in\nPyTorch in the following manner:\n"
+        "\n",
+        "[Learn the Basics](intro.html) ||\n",
+        "[Quickstart](quickstart_tutorial.html) ||\n",
+        "[Tensors](tensorqs_tutorial.html) ||\n",
+        "[Datasets & DataLoaders](data_tutorial.html) ||\n",
+        "[Transforms](transforms_tutorial.html) ||\n",
+        "[Build Model](buildmodel_tutorial.html) ||\n",
+        "**Autograd** ||\n",
+        "[Optimization](optimization_tutorial.html) ||\n",
+        "[Save & Load Model](saveloadrun_tutorial.html)\n",
+        "\n",
+        "# Automatic Differentiation with ``torch.autograd``\n",
+        "\n",
+        "When training neural networks, the most frequently used algorithm is\n",
+        "**back propagation**. In this algorithm, parameters (model weights) are\n",
+        "adjusted according to the **gradient** of the loss function with respect\n",
+        "to the given parameter.\n",
+        "\n",
+        "To compute those gradients, PyTorch has a built-in differentiation engine\n",
+        "called ``torch.autograd``. It supports automatic computation of gradients for any\n",
+        "computational graph.\n",
+        "\n",
+        "Consider the simplest one-layer neural network, with input ``x``,\n",
+        "parameters ``w`` and ``b``, and some loss function. It can be defined in\n",
+        "PyTorch in the following manner:\n"
       ]
     },
     {
@@ -26,28 +39,58 @@
       },
       "outputs": [],
       "source": [
-        "import torch\n\nx = torch.ones(5)  # input tensor\ny = torch.zeros(3)  # expected output\nw = torch.randn(5, 3, requires_grad=True)\nb = torch.randn(3, requires_grad=True)\nz = torch.matmul(x, w)+b\nloss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)"
+        "%matplotlib inline\n",
+        "\n",
+        "import torch\n",
+        "\n",
+        "x = torch.ones(5)  # input tensor\n",
+        "y = torch.zeros(3)  # expected output\n",
+        "w = torch.randn(5, 3, requires_grad=True)\n",
+        "b = torch.randn(3, requires_grad=True)\n",
+        "z = torch.matmul(x, w)+b\n",
+        "loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Tensors, Functions and Computational graph\n\nThis code defines the following **computational graph**:\n\n.. figure:: /_static/img/basics/comp-graph.png\n   :alt:\n\nIn this network, ``w`` and ``b`` are **parameters**, which we need to\noptimize. Thus, we need to be able to compute the gradients of loss\nfunction with respect to those variables. In order to do that, we set\nthe ``requires_grad`` property of those tensors.\n\n"
+        "## Tensors, Functions and Computational graph\n",
+        "\n",
+        "This code defines the following **computational graph**:\n",
+        "\n",
+        "![Computational graph](/_static/img/basics/comp-graph.png)\n",
+        "\n",
+        "\n",
+        "In this network, ``w`` and ``b`` are **parameters**, which we need to\n",
+        "optimize. Thus, we need to be able to compute the gradients of loss\n",
+        "function with respect to those variables. In order to do that, we set\n",
+        "the ``requires_grad`` property of those tensors.\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>You can set the value of ``requires_grad`` when creating a\n          tensor, or later by using ``x.requires_grad_(True)`` method.</p></div>\n\n"
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>You can set the value of ``requires_grad`` when creating a\n",
+        "          tensor, or later by using ``x.requires_grad_(True)`` method.</p></div>\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "A function that we apply to tensors to construct computational graph is\nin fact an object of class ``Function``. This object knows how to\ncompute the function in the *forward* direction, and also how to compute\nits derivative during the *backward propagation* step. A reference to\nthe backward propagation function is stored in ``grad_fn`` property of a\ntensor. You can find more information of ``Function`` [in the\ndocumentation](https://pytorch.org/docs/stable/autograd.html#function)_.\n\n\n"
+        "A function that we apply to tensors to construct a computational graph is\n",
+        "in fact an object of class ``Function``. This object knows how to\n",
+        "compute the function in the *forward* direction, and also how to compute\n",
+        "its derivative during the *backward propagation* step. A reference to\n",
+        "the backward propagation function is stored in the ``grad_fn`` property of a\n",
+        "tensor. You can find more information about ``Function`` [in the\n",
+        "documentation](https://pytorch.org/docs/stable/autograd.html#function).\n",
+        "\n",
+        "\n"
       ]
     },
     {
@@ -58,14 +101,25 @@
       },
       "outputs": [],
       "source": [
-        "print(f\"Gradient function for z = {z.grad_fn}\")\nprint(f\"Gradient function for loss = {loss.grad_fn}\")"
+        "print(f\"Gradient function for z = {z.grad_fn}\")\n",
+        "print(f\"Gradient function for loss = {loss.grad_fn}\")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Computing Gradients\n\nTo optimize weights of parameters in the neural network, we need to\ncompute the derivatives of our loss function with respect to parameters,\nnamely, we need $\\frac{\\partial loss}{\\partial w}$ and\n$\\frac{\\partial loss}{\\partial b}$ under some fixed values of\n``x`` and ``y``. To compute those derivatives, we call\n``loss.backward()``, and then retrieve the values from ``w.grad`` and\n``b.grad``:\n\n\n"
+        "## Computing Gradients\n",
+        "\n",
+        "To optimize the weights of the parameters in the neural network, we need to\n",
+        "compute the derivatives of our loss function with respect to parameters,\n",
+        "namely, we need $\\frac{\\partial loss}{\\partial w}$ and\n",
+        "$\\frac{\\partial loss}{\\partial b}$ under some fixed values of\n",
+        "``x`` and ``y``. To compute those derivatives, we call\n",
+        "``loss.backward()``, and then retrieve the values from ``w.grad`` and\n",
+        "``b.grad``:\n",
+        "\n",
+        "\n"
       ]
     },
     {
@@ -76,21 +130,42 @@
       },
       "outputs": [],
       "source": [
-        "loss.backward()\nprint(w.grad)\nprint(b.grad)"
+        "loss.backward()\n",
+        "print(w.grad)\n",
+        "print(b.grad)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>- We can only obtain the ``grad`` properties for the leaf\n    nodes of the computational graph, which have ``requires_grad`` property\n    set to ``True``. For all other nodes in our graph, gradients will not be\n    available.\n  - We can only perform gradient calculations using\n    ``backward`` once on a given graph, for performance reasons. If we need\n    to do several ``backward`` calls on the same graph, we need to pass\n    ``retain_graph=True`` to the ``backward`` call.</p></div>\n\n\n"
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>- We can only obtain the ``grad`` properties for the leaf\n",
+        "    nodes of the computational graph, which have ``requires_grad`` property\n",
+        "    set to ``True``. For all other nodes in our graph, gradients will not be\n",
+        "    available.\n",
+        "  - We can only perform gradient calculations using\n",
+        "    ``backward`` once on a given graph, for performance reasons. If we need\n",
+        "    to do several ``backward`` calls on the same graph, we need to pass\n",
+        "    ``retain_graph=True`` to the ``backward`` call.</p></div>\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Disabling Gradient Tracking\n\nBy default, all tensors with ``requires_grad=True`` are tracking their\ncomputational history and support gradient computation. However, there\nare some cases when we do not need to do that, for example, when we have\ntrained the model and just want to apply it to some input data, i.e. we\nonly want to do *forward* computations through the network. We can stop\ntracking computations by surrounding our computation code with\n``torch.no_grad()`` block:\n\n\n"
+        "## Disabling Gradient Tracking\n",
+        "\n",
+        "By default, all tensors with ``requires_grad=True`` are tracking their\n",
+        "computational history and support gradient computation. However, there\n",
+        "are some cases when we do not need to do that, for example, when we have\n",
+        "trained the model and just want to apply it to some input data, i.e. we\n",
+        "only want to do *forward* computations through the network. We can stop\n",
+        "tracking computations by surrounding our computation code with\n",
+        "``torch.no_grad()`` block:\n",
+        "\n",
+        "\n"
       ]
     },
     {
@@ -101,14 +176,22 @@
       },
       "outputs": [],
       "source": [
-        "z = torch.matmul(x, w)+b\nprint(z.requires_grad)\n\nwith torch.no_grad():\n    z = torch.matmul(x, w)+b\nprint(z.requires_grad)"
+        "z = torch.matmul(x, w)+b\n",
+        "print(z.requires_grad)\n",
+        "\n",
+        "with torch.no_grad():\n",
+        "    z = torch.matmul(x, w)+b\n",
+        "print(z.requires_grad)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Another way to achieve the same result is to use the ``detach()`` method\non the tensor:\n\n\n"
+        "Another way to achieve the same result is to use the ``detach()`` method\n",
+        "on the tensor:\n",
+        "\n",
+        "\n"
       ]
     },
     {
@@ -119,28 +202,88 @@
       },
       "outputs": [],
       "source": [
-        "z = torch.matmul(x, w)+b\nz_det = z.detach()\nprint(z_det.requires_grad)"
+        "z = torch.matmul(x, w)+b\n",
+        "z_det = z.detach()\n",
+        "print(z_det.requires_grad)"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "There are reasons you might want to disable gradient tracking:\n  - To mark some parameters in your neural network as **frozen parameters**.\n  - To **speed up computations** when you are only doing forward pass, because computations on tensors that do\n    not track gradients would be more efficient.\n\n"
+        "There are reasons you might want to disable gradient tracking:\n",
+        "  - To mark some parameters in your neural network as **frozen parameters**.\n",
+        "  - To **speed up computations** when you are only doing forward pass, because computations on tensors that do\n",
+        "    not track gradients would be more efficient.\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## More on Computational Graphs\nConceptually, autograd keeps a record of data (tensors) and all executed\noperations (along with the resulting new tensors) in a directed acyclic\ngraph (DAG) consisting of\n[Function](https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)_\nobjects. In this DAG, leaves are the input tensors, roots are the output\ntensors. By tracing this graph from roots to leaves, you can\nautomatically compute the gradients using the chain rule.\n\nIn a forward pass, autograd does two things simultaneously:\n\n- run the requested operation to compute a resulting tensor\n- maintain the operation\u2019s *gradient function* in the DAG.\n\nThe backward pass kicks off when ``.backward()`` is called on the DAG\nroot. ``autograd`` then:\n\n- computes the gradients from each ``.grad_fn``,\n- accumulates them in the respective tensor\u2019s ``.grad`` attribute\n- using the chain rule, propagates all the way to the leaf tensors.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>**DAGs are dynamic in PyTorch**\n  An important thing to note is that the graph is recreated from scratch; after each\n  ``.backward()`` call, autograd starts populating a new graph. This is\n  exactly what allows you to use control flow statements in your model;\n  you can change the shape, size and operations at every iteration if\n  needed.</p></div>\n\n"
+        "## More on Computational Graphs\n",
+        "Conceptually, autograd keeps a record of data (tensors) and all executed\n",
+        "operations (along with the resulting new tensors) in a directed acyclic\n",
+        "graph (DAG) consisting of\n",
+        "[Function](https://pytorch.org/docs/stable/autograd.html#torch.autograd.Function)\n",
+        "objects. In this DAG, leaves are the input tensors, roots are the output\n",
+        "tensors. By tracing this graph from roots to leaves, you can\n",
+        "automatically compute the gradients using the chain rule.\n",
+        "\n",
+        "In a forward pass, autograd does two things simultaneously:\n",
+        "\n",
+        "- run the requested operation to compute a resulting tensor\n",
+        "- maintain the operation’s *gradient function* in the DAG.\n",
+        "\n",
+        "The backward pass kicks off when ``.backward()`` is called on the DAG\n",
+        "root. ``autograd`` then:\n",
+        "\n",
+        "- computes the gradients from each ``.grad_fn``,\n",
+        "- accumulates them in the respective tensor’s ``.grad`` attribute\n",
+        "- using the chain rule, propagates all the way to the leaf tensors.\n",
+        "\n",
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>**DAGs are dynamic in PyTorch**\n",
+        "  An important thing to note is that the graph is recreated from scratch; after each\n",
+        "  ``.backward()`` call, autograd starts populating a new graph. This is\n",
+        "  exactly what allows you to use control flow statements in your model;\n",
+        "  you can change the shape, size and operations at every iteration if\n",
+        "  needed.</p></div>\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Optional Reading: Tensor Gradients and Jacobian Products\n\nIn many cases, we have a scalar loss function, and we need to compute\nthe gradient with respect to some parameters. However, there are cases\nwhen the output function is an arbitrary tensor. In this case, PyTorch\nallows you to compute so-called **Jacobian product**, and not the actual\ngradient.\n\nFor a vector function $\\vec{y}=f(\\vec{x})$, where\n$\\vec{x}=\\langle x_1,\\dots,x_n\\rangle$ and\n$\\vec{y}=\\langle y_1,\\dots,y_m\\rangle$, a gradient of\n$\\vec{y}$ with respect to $\\vec{x}$ is given by **Jacobian\nmatrix**:\n\n\\begin{align}J=\\left(\\begin{array}{ccc}\n      \\frac{\\partial y_{1}}{\\partial x_{1}} & \\cdots & \\frac{\\partial y_{1}}{\\partial x_{n}}\\\\\n      \\vdots & \\ddots & \\vdots\\\\\n      \\frac{\\partial y_{m}}{\\partial x_{1}} & \\cdots & \\frac{\\partial y_{m}}{\\partial x_{n}}\n      \\end{array}\\right)\\end{align}\n\nInstead of computing the Jacobian matrix itself, PyTorch allows you to\ncompute **Jacobian Product** $v^T\\cdot J$ for a given input vector\n$v=(v_1 \\dots v_m)$. This is achieved by calling ``backward`` with\n$v$ as an argument. The size of $v$ should be the same as\nthe size of the original tensor, with respect to which we want to\ncompute the product:\n\n\n"
+        "## Optional Reading: Tensor Gradients and Jacobian Products\n",
+        "\n",
+        "In many cases, we have a scalar loss function, and we need to compute\n",
+        "the gradient with respect to some parameters. However, there are cases\n",
+        "when the output function is an arbitrary tensor. In this case, PyTorch\n",
+        "allows you to compute the so-called **Jacobian product**, and not the actual\n",
+        "gradient.\n",
+        "\n",
+        "For a vector function $\\vec{y}=f(\\vec{x})$, where\n",
+        "$\\vec{x}=\\langle x_1,\\dots,x_n\\rangle$ and\n",
+        "$\\vec{y}=\\langle y_1,\\dots,y_m\\rangle$, a gradient of\n",
+        "$\\vec{y}$ with respect to $\\vec{x}$ is given by **Jacobian\n",
+        "matrix**:\n",
+        "\n",
+        "\\begin{align}J=\\left(\\begin{array}{ccc}\n",
+        "      \\frac{\\partial y_{1}}{\\partial x_{1}} & \\cdots & \\frac{\\partial y_{1}}{\\partial x_{n}}\\\\\n",
+        "      \\vdots & \\ddots & \\vdots\\\\\n",
+        "      \\frac{\\partial y_{m}}{\\partial x_{1}} & \\cdots & \\frac{\\partial y_{m}}{\\partial x_{n}}\n",
+        "      \\end{array}\\right)\\end{align}\n",
+        "\n",
+        "Instead of computing the Jacobian matrix itself, PyTorch allows you to\n",
+        "compute **Jacobian Product** $v^T\\cdot J$ for a given input vector\n",
+        "$v=(v_1 \\dots v_m)$. This is achieved by calling ``backward`` with\n",
+        "$v$ as an argument. The size of $v$ should be the same as\n",
+        "the size of the original tensor, with respect to which we want to\n",
+        "compute the product:\n",
+        "\n",
+        "\n"
       ]
     },
     {
@@ -151,35 +294,61 @@
       },
       "outputs": [],
       "source": [
-        "inp = torch.eye(4, 5, requires_grad=True)\nout = (inp+1).pow(2).t()\nout.backward(torch.ones_like(out), retain_graph=True)\nprint(f\"First call\\n{inp.grad}\")\nout.backward(torch.ones_like(out), retain_graph=True)\nprint(f\"\\nSecond call\\n{inp.grad}\")\ninp.grad.zero_()\nout.backward(torch.ones_like(out), retain_graph=True)\nprint(f\"\\nCall after zeroing gradients\\n{inp.grad}\")"
+        "inp = torch.eye(4, 5, requires_grad=True)\n",
+        "out = (inp+1).pow(2).t()\n",
+        "out.backward(torch.ones_like(out), retain_graph=True)\n",
+        "print(f\"First call\\n{inp.grad}\")\n",
+        "out.backward(torch.ones_like(out), retain_graph=True)\n",
+        "print(f\"\\nSecond call\\n{inp.grad}\")\n",
+        "inp.grad.zero_()\n",
+        "out.backward(torch.ones_like(out), retain_graph=True)\n",
+        "print(f\"\\nCall after zeroing gradients\\n{inp.grad}\")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Notice that when we call ``backward`` for the second time with the same\nargument, the value of the gradient is different. This happens because\nwhen doing ``backward`` propagation, PyTorch **accumulates the\ngradients**, i.e. the value of computed gradients is added to the\n``grad`` property of all leaf nodes of computational graph. If you want\nto compute the proper gradients, you need to zero out the ``grad``\nproperty before. In real-life training an *optimizer* helps us to do\nthis.\n\n"
+        "Notice that when we call ``backward`` for the second time with the same\n",
+        "argument, the value of the gradient is different. This happens because\n",
+        "when doing ``backward`` propagation, PyTorch **accumulates the\n",
+        "gradients**, i.e. the value of computed gradients is added to the\n",
+        "``grad`` property of all leaf nodes of the computational graph. If you want\n",
+        "to compute the proper gradients, you need to zero out the ``grad``\n",
+        "property beforehand. In real-life training an *optimizer* helps us to do\n",
+        "this.\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>Previously we were calling ``backward()`` function without\n          parameters. This is essentially equivalent to calling\n          ``backward(torch.tensor(1.0))``, which is a useful way to compute the\n          gradients in case of a scalar-valued function, such as loss during\n          neural network training.</p></div>\n\n\n"
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Previously we were calling the <code>backward()</code> function without\n",
+        "          parameters. This is essentially equivalent to calling\n",
+        "          <code>backward(torch.tensor(1.0))</code>, which is a useful way to compute the\n",
+        "          gradients in case of a scalar-valued function, such as loss during\n",
+        "          neural network training.</p></div>\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "--------------\n\n\n"
+        "--------------\n",
+        "\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "### Further Reading\n- [Autograd Mechanics](https://pytorch.org/docs/stable/notes/autograd.html)\n\n"
+        "### Further Reading\n",
+        "- [Autograd Mechanics](https://pytorch.org/docs/stable/notes/autograd.html)\n",
+        "\n"
       ]
     }
   ],
@@ -204,4 +373,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/08-Optimization.ipynb b/tutorials/08-Optimization.ipynb
index b68272f..4a76e9e 100644
--- a/tutorials/08-Optimization.ipynb
+++ b/tutorials/08-Optimization.ipynb
@@ -1,21 +1,31 @@
 {
   "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n[Learn the Basics](intro.html) ||\n[Quickstart](quickstart_tutorial.html) ||\n[Tensors](tensorqs_tutorial.html) ||\n[Datasets & DataLoaders](data_tutorial.html) ||\n[Transforms](transforms_tutorial.html) ||\n[Build Model](buildmodel_tutorial.html) ||\n[Autograd](autogradqs_tutorial.html) ||\n**Optimization** ||\n[Save & Load Model](saveloadrun_tutorial.html)\n\n# Optimizing Model Parameters\n\nNow that we have a model and data it's time to train, validate and test our model by optimizing its parameters on\nour data. Training a model is an iterative process; in each iteration the model makes a guess about the output, calculates\nthe error in its guess (*loss*), collects the derivatives of the error with respect to its parameters (as we saw in\nthe [previous section](autograd_tutorial.html)), and **optimizes** these parameters using gradient descent. For a more\ndetailed walkthrough of this process, check out this video on [backpropagation from 3Blue1Brown](https://www.youtube.com/watch?v=tIeHLnjs5U8)_.\n\n## Prerequisite Code\nWe load the code from the previous sections on [Datasets & DataLoaders](data_tutorial.html)\nand [Build Model](buildmodel_tutorial.html).\n"
+        "\n",
+        "[Learn the Basics](intro.html) ||\n",
+        "[Quickstart](quickstart_tutorial.html) ||\n",
+        "[Tensors](tensorqs_tutorial.html) ||\n",
+        "[Datasets & DataLoaders](data_tutorial.html) ||\n",
+        "[Transforms](transforms_tutorial.html) ||\n",
+        "[Build Model](buildmodel_tutorial.html) ||\n",
+        "[Autograd](autogradqs_tutorial.html) ||\n",
+        "**Optimization** ||\n",
+        "[Save & Load Model](saveloadrun_tutorial.html)\n",
+        "\n",
+        "# Optimizing Model Parameters\n",
+        "\n",
+        "Now that we have a model and data it's time to train, validate and test our model by optimizing its parameters on\n",
+        "our data. Training a model is an iterative process; in each iteration the model makes a guess about the output, calculates\n",
+        "the error in its guess (*loss*), collects the derivatives of the error with respect to its parameters (as we saw in\n",
+        "the [previous section](autogradqs_tutorial.html)), and **optimizes** these parameters using gradient descent. For a more\n",
+        "detailed walkthrough of this process, check out this video on [backpropagation from 3Blue1Brown](https://www.youtube.com/watch?v=tIeHLnjs5U8).\n",
+        "\n",
+        "## Prerequisite Code\n",
+        "We load the code from the previous sections on [Datasets & DataLoaders](data_tutorial.html)\n",
+        "and [Build Model](buildmodel_tutorial.html).\n"
       ]
     },
     {
@@ -26,14 +36,67 @@
       },
       "outputs": [],
       "source": [
-        "import torch\nfrom torch import nn\nfrom torch.utils.data import DataLoader\nfrom torchvision import datasets\nfrom torchvision.transforms import ToTensor\n\ntraining_data = datasets.FashionMNIST(\n    root=\"data\",\n    train=True,\n    download=True,\n    transform=ToTensor()\n)\n\ntest_data = datasets.FashionMNIST(\n    root=\"data\",\n    train=False,\n    download=True,\n    transform=ToTensor()\n)\n\ntrain_dataloader = DataLoader(training_data, batch_size=64)\ntest_dataloader = DataLoader(test_data, batch_size=64)\n\nclass NeuralNetwork(nn.Module):\n    def __init__(self):\n        super(NeuralNetwork, self).__init__()\n        self.flatten = nn.Flatten()\n        self.linear_relu_stack = nn.Sequential(\n            nn.Linear(28*28, 512),\n            nn.ReLU(),\n            nn.Linear(512, 512),\n            nn.ReLU(),\n            nn.Linear(512, 10),\n        )\n\n    def forward(self, x):\n        x = self.flatten(x)\n        logits = self.linear_relu_stack(x)\n        return logits\n\nmodel = NeuralNetwork()"
+        "%matplotlib inline\n",
+        "\n",
+        "import torch\n",
+        "from torch import nn\n",
+        "from torch.utils.data import DataLoader\n",
+        "from torchvision import datasets\n",
+        "from torchvision.transforms import ToTensor\n",
+        "\n",
+        "training_data = datasets.FashionMNIST(\n",
+        "    root=\"data\",\n",
+        "    train=True,\n",
+        "    download=True,\n",
+        "    transform=ToTensor()\n",
+        ")\n",
+        "\n",
+        "test_data = datasets.FashionMNIST(\n",
+        "    root=\"data\",\n",
+        "    train=False,\n",
+        "    download=True,\n",
+        "    transform=ToTensor()\n",
+        ")\n",
+        "\n",
+        "train_dataloader = DataLoader(training_data, batch_size=64)\n",
+        "test_dataloader = DataLoader(test_data, batch_size=64)\n",
+        "\n",
+        "class NeuralNetwork(nn.Module):\n",
+        "    def __init__(self):\n",
+        "        super(NeuralNetwork, self).__init__()\n",
+        "        self.flatten = nn.Flatten()\n",
+        "        self.linear_relu_stack = nn.Sequential(\n",
+        "            nn.Linear(28*28, 512),\n",
+        "            nn.ReLU(),\n",
+        "            nn.Linear(512, 512),\n",
+        "            nn.ReLU(),\n",
+        "            nn.Linear(512, 10),\n",
+        "        )\n",
+        "\n",
+        "    def forward(self, x):\n",
+        "        x = self.flatten(x)\n",
+        "        logits = self.linear_relu_stack(x)\n",
+        "        return logits\n",
+        "\n",
+        "model = NeuralNetwork()"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Hyperparameters\n\nHyperparameters are adjustable parameters that let you control the model optimization process.\nDifferent hyperparameter values can impact model training and convergence rates\n([read more](https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html)_ about hyperparameter tuning)\n\nWe define the following hyperparameters for training:\n - **Number of Epochs** - the number times to iterate over the dataset\n - **Batch Size** - the number of data samples propagated through the network before the parameters are updated\n - **Learning Rate** - how much to update models parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training.\n\n\n"
+        "## Hyperparameters\n",
+        "\n",
+        "Hyperparameters are adjustable parameters that let you control the model optimization process.\n",
+        "Different hyperparameter values can impact model training and convergence rates\n",
+        "([read more](https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html) about hyperparameter tuning).\n",
+        "\n",
+        "We define the following hyperparameters for training:\n",
+        " - **Number of Epochs** - the number of times to iterate over the dataset\n",
+        " - **Batch Size** - the number of data samples propagated through the network before the parameters are updated\n",
+        " - **Learning Rate** - how much to update the model's parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training.\n",
+        "\n",
+        "\n"
       ]
     },
     {
@@ -44,14 +107,40 @@
       },
       "outputs": [],
       "source": [
-        "learning_rate = 1e-3\nbatch_size = 64\nepochs = 5"
+        "learning_rate = 1e-3\n",
+        "batch_size = 64\n",
+        "epochs = 5"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Optimization Loop\n\nOnce we set our hyperparameters, we can then train and optimize our model with an optimization loop. Each\niteration of the optimization loop is called an **epoch**.\n\nEach epoch consists of two main parts:\n - **The Train Loop** - iterate over the training dataset and try to converge to optimal parameters.\n - **The Validation/Test Loop** - iterate over the test dataset to check if model performance is improving.\n\nLet's briefly familiarize ourselves with some of the concepts used in the training loop. Jump ahead to\nsee the `full-impl-label` of the optimization loop.\n\n### Loss Function\n\nWhen presented with some training data, our untrained network is likely not to give the correct\nanswer. **Loss function** measures the degree of dissimilarity of obtained result to the target value,\nand it is the loss function that we want to minimize during training. To calculate the loss we make a\nprediction using the inputs of our given data sample and compare it against the true data label value.\n\nCommon loss functions include [nn.MSELoss](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html#torch.nn.MSELoss) (Mean Square Error) for regression tasks, and\n[nn.NLLLoss](https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html#torch.nn.NLLLoss) (Negative Log Likelihood) for classification.\n[nn.CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss) combines ``nn.LogSoftmax`` and ``nn.NLLLoss``.\n\nWe pass our model's output logits to ``nn.CrossEntropyLoss``, which will normalize the logits and compute the prediction error.\n\n"
+        "## Optimization Loop\n",
+        "\n",
+        "Once we set our hyperparameters, we can then train and optimize our model with an optimization loop. Each\n",
+        "iteration of the optimization loop is called an **epoch**.\n",
+        "\n",
+        "Each epoch consists of two main parts:\n",
+        " - **The Train Loop** - iterate over the training dataset and try to converge to optimal parameters.\n",
+        " - **The Validation/Test Loop** - iterate over the test dataset to check if model performance is improving.\n",
+        "\n",
+        "Let's briefly familiarize ourselves with some of the concepts used in the training loop. Jump ahead to\n",
+        "see the full implementation of the optimization loop in the **Full Implementation** section below.\n",
+        "\n",
+        "### Loss Function\n",
+        "\n",
+        "When presented with some training data, our untrained network is likely not to give the correct\n",
+        "answer. The **loss function** measures the degree of dissimilarity between the obtained result and the target value,\n",
+        "and it is the loss function that we want to minimize during training. To calculate the loss we make a\n",
+        "prediction using the inputs of our given data sample and compare it against the true data label value.\n",
+        "\n",
+        "Common loss functions include [nn.MSELoss](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html#torch.nn.MSELoss) (Mean Square Error) for regression tasks, and\n",
+        "[nn.NLLLoss](https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html#torch.nn.NLLLoss) (Negative Log Likelihood) for classification.\n",
+        "[nn.CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss) combines ``nn.LogSoftmax`` and ``nn.NLLLoss``.\n",
+        "\n",
+        "We pass our model's output logits to ``nn.CrossEntropyLoss``, which will normalize the logits and compute the prediction error.\n",
+        "\n"
       ]
     },
     {
@@ -62,14 +151,22 @@
       },
       "outputs": [],
       "source": [
-        "# Initialize the loss function\nloss_fn = nn.CrossEntropyLoss()"
+        "# Initialize the loss function\n",
+        "loss_fn = nn.CrossEntropyLoss()"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "### Optimizer\n\nOptimization is the process of adjusting model parameters to reduce model error in each training step. **Optimization algorithms** define how this process is performed (in this example we use Stochastic Gradient Descent).\nAll optimization logic is encapsulated in  the ``optimizer`` object. Here, we use the SGD optimizer; additionally, there are many [different optimizers](https://pytorch.org/docs/stable/optim.html)\navailable in PyTorch such as ADAM and RMSProp, that work better for different kinds of models and data.\n\nWe initialize the optimizer by registering the model's parameters that need to be trained, and passing in the learning rate hyperparameter.\n\n"
+        "### Optimizer\n",
+        "\n",
+        "Optimization is the process of adjusting model parameters to reduce model error in each training step. **Optimization algorithms** define how this process is performed (in this example we use Stochastic Gradient Descent).\n",
+        "All optimization logic is encapsulated in the ``optimizer`` object. Here, we use the SGD optimizer; additionally, there are many [different optimizers](https://pytorch.org/docs/stable/optim.html)\n",
+        "available in PyTorch, such as Adam and RMSprop, that work better for different kinds of models and data.\n",
+        "\n",
+        "We initialize the optimizer by registering the model's parameters that need to be trained, and passing in the learning rate hyperparameter.\n",
+        "\n"
       ]
     },
     {
@@ -87,14 +184,22 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "Inside the training loop, optimization happens in three steps:\n * Call ``optimizer.zero_grad()`` to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.\n * Backpropagate the prediction loss with a call to ``loss.backward()``. PyTorch deposits the gradients of the loss w.r.t. each parameter.\n * Once we have our gradients, we call ``optimizer.step()`` to adjust the parameters by the gradients collected in the backward pass.\n\n"
+        "Inside the training loop, optimization happens in three steps:\n",
+        " * Call ``optimizer.zero_grad()`` to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.\n",
+        " * Backpropagate the prediction loss with a call to ``loss.backward()``. PyTorch deposits the gradients of the loss w.r.t. each parameter.\n",
+        " * Once we have our gradients, we call ``optimizer.step()`` to adjust the parameters by the gradients collected in the backward pass.\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n## Full Implementation\nWe define ``train_loop`` that loops over our optimization code, and ``test_loop`` that\nevaluates the model's performance against our test data.\n\n"
+        "\n",
+        "## Full Implementation\n",
+        "We define ``train_loop`` that loops over our optimization code, and ``test_loop`` that\n",
+        "evaluates the model's performance against our test data.\n",
+        "\n"
       ]
     },
     {
@@ -105,14 +210,46 @@
       },
       "outputs": [],
       "source": [
-        "def train_loop(dataloader, model, loss_fn, optimizer):\n    size = len(dataloader.dataset)\n    for batch, (X, y) in enumerate(dataloader):\n        # Compute prediction and loss\n        pred = model(X)\n        loss = loss_fn(pred, y)\n\n        # Backpropagation\n        optimizer.zero_grad()\n        loss.backward()\n        optimizer.step()\n\n        if batch % 100 == 0:\n            loss, current = loss.item(), (batch + 1) * len(X)\n            print(f\"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]\")\n\n\ndef test_loop(dataloader, model, loss_fn):\n    size = len(dataloader.dataset)\n    num_batches = len(dataloader)\n    test_loss, correct = 0, 0\n\n    with torch.no_grad():\n        for X, y in dataloader:\n            pred = model(X)\n            test_loss += loss_fn(pred, y).item()\n            correct += (pred.argmax(1) == y).type(torch.float).sum().item()\n\n    test_loss /= num_batches\n    correct /= size\n    print(f\"Test Error: \\n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \\n\")"
+        "def train_loop(dataloader, model, loss_fn, optimizer):\n",
+        "    size = len(dataloader.dataset)\n",
+        "    for batch, (X, y) in enumerate(dataloader):\n",
+        "        # Compute prediction and loss\n",
+        "        pred = model(X)\n",
+        "        loss = loss_fn(pred, y)\n",
+        "\n",
+        "        # Backpropagation\n",
+        "        optimizer.zero_grad()\n",
+        "        loss.backward()\n",
+        "        optimizer.step()\n",
+        "\n",
+        "        if batch % 100 == 0:\n",
+        "            loss, current = loss.item(), (batch + 1) * len(X)\n",
+        "            print(f\"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]\")\n",
+        "\n",
+        "\n",
+        "def test_loop(dataloader, model, loss_fn):\n",
+        "    size = len(dataloader.dataset)\n",
+        "    num_batches = len(dataloader)\n",
+        "    test_loss, correct = 0, 0\n",
+        "\n",
+        "    with torch.no_grad():\n",
+        "        for X, y in dataloader:\n",
+        "            pred = model(X)\n",
+        "            test_loss += loss_fn(pred, y).item()\n",
+        "            correct += (pred.argmax(1) == y).type(torch.float).sum().item()\n",
+        "\n",
+        "    test_loss /= num_batches\n",
+        "    correct /= size\n",
+        "    print(f\"Test Error: \\n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \\n\")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "We initialize the loss function and optimizer, and pass it to ``train_loop`` and ``test_loop``.\nFeel free to increase the number of epochs to track the model's improving performance.\n\n"
+        "We initialize the loss function and optimizer, and pass them to ``train_loop`` and ``test_loop``.\n",
+        "Feel free to increase the number of epochs to track the model's improving performance.\n",
+        "\n"
       ]
     },
     {
@@ -123,14 +260,27 @@
       },
       "outputs": [],
       "source": [
-        "loss_fn = nn.CrossEntropyLoss()\noptimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n\nepochs = 10\nfor t in range(epochs):\n    print(f\"Epoch {t+1}\\n-------------------------------\")\n    train_loop(train_dataloader, model, loss_fn, optimizer)\n    test_loop(test_dataloader, model, loss_fn)\nprint(\"Done!\")"
+        "loss_fn = nn.CrossEntropyLoss()\n",
+        "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n",
+        "\n",
+        "epochs = 10\n",
+        "for t in range(epochs):\n",
+        "    print(f\"Epoch {t+1}\\n-------------------------------\")\n",
+        "    train_loop(train_dataloader, model, loss_fn, optimizer)\n",
+        "    test_loop(test_dataloader, model, loss_fn)\n",
+        "print(\"Done!\")"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Further Reading\n- [Loss Functions](https://pytorch.org/docs/stable/nn.html#loss-functions)\n- [torch.optim](https://pytorch.org/docs/stable/optim.html)\n- [Warmstart Training a Model](https://pytorch.org/tutorials/recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html)\n\n\n"
+        "## Further Reading\n",
+        "- [Loss Functions](https://pytorch.org/docs/stable/nn.html#loss-functions)\n",
+        "- [torch.optim](https://pytorch.org/docs/stable/optim.html)\n",
+        "- [Warmstart Training a Model](https://pytorch.org/tutorials/recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html)\n",
+        "\n",
+        "\n"
       ]
     }
   ],
@@ -155,4 +305,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
\ No newline at end of file
+}
diff --git a/tutorials/09-SaveLoad.ipynb b/tutorials/09-SaveLoad.ipynb
index d13e7e7..c6d3d72 100644
--- a/tutorials/09-SaveLoad.ipynb
+++ b/tutorials/09-SaveLoad.ipynb
@@ -1,21 +1,23 @@
 {
   "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n[Learn the Basics](intro.html) ||\n[Quickstart](quickstart_tutorial.html) ||\n[Tensors](tensorqs_tutorial.html) ||\n[Datasets & DataLoaders](data_tutorial.html) ||\n[Transforms](transforms_tutorial.html) ||\n[Build Model](buildmodel_tutorial.html) ||\n[Autograd](autogradqs_tutorial.html) ||\n[Optimization](optimization_tutorial.html) ||\n**Save & Load Model**\n\n# Save and Load the Model\n\nIn this section we will look at how to persist model state with saving, loading and running model predictions.\n"
+        "\n",
+        "[Learn the Basics](intro.html) ||\n",
+        "[Quickstart](quickstart_tutorial.html) ||\n",
+        "[Tensors](tensorqs_tutorial.html) ||\n",
+        "[Datasets & DataLoaders](data_tutorial.html) ||\n",
+        "[Transforms](transforms_tutorial.html) ||\n",
+        "[Build Model](buildmodel_tutorial.html) ||\n",
+        "[Autograd](autogradqs_tutorial.html) ||\n",
+        "[Optimization](optimization_tutorial.html) ||\n",
+        "**Save & Load Model**\n",
+        "\n",
+        "# Save and Load the Model\n",
+        "\n",
+        "In this section we will look at how to persist model state with saving, loading and running model predictions.\n"
       ]
     },
     {
@@ -26,14 +28,21 @@
       },
       "outputs": [],
       "source": [
-        "import torch\nimport torchvision.models as models"
+        "%matplotlib inline\n",
+        "\n",
+        "import torch\n",
+        "import torchvision.models as models"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Saving and Loading Model Weights\nPyTorch models store the learned parameters in an internal\nstate dictionary, called ``state_dict``. These can be persisted via the ``torch.save``\nmethod:\n\n"
+        "## Saving and Loading Model Weights\n",
+        "PyTorch models store the learned parameters in an internal\n",
+        "state dictionary, called ``state_dict``. These can be persisted via the ``torch.save``\n",
+        "method:\n",
+        "\n"
       ]
     },
     {
@@ -44,14 +53,17 @@
       },
       "outputs": [],
       "source": [
-        "model = models.vgg16(pretrained=True)\ntorch.save(model.state_dict(), 'model_weights.pth')"
+        "model = models.vgg16(pretrained=True)\n",
+        "torch.save(model.state_dict(), 'model_weights.pth')"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "To load model weights, you need to create an instance of the same model first, and then load the parameters\nusing ``load_state_dict()`` method.\n\n"
+        "To load model weights, you need to create an instance of the same model first, and then load the parameters\n",
+        "using the ``load_state_dict()`` method.\n",
+        "\n"
       ]
     },
     {
@@ -62,21 +74,28 @@
       },
       "outputs": [],
       "source": [
-        "model = models.vgg16() # we do not specify pretrained=True, i.e. do not load default weights\nmodel.load_state_dict(torch.load('model_weights.pth'))\nmodel.eval()"
+        "model = models.vgg16() # we do not specify pretrained=True, i.e. do not load default weights\n",
+        "model.load_state_dict(torch.load('model_weights.pth'))\n",
+        "model.eval()"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>be sure to call ``model.eval()`` method before inferencing to set the dropout and batch normalization layers to evaluation mode. Failing to do this will yield inconsistent inference results.</p></div>\n\n"
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>Be sure to call the <code>model.eval()</code> method before inferencing to set the dropout and batch normalization layers to evaluation mode. Failing to do this will yield inconsistent inference results.</p></div>\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Saving and Loading Models with Shapes\nWhen loading model weights, we needed to instantiate the model class first, because the class\ndefines the structure of a network. We might want to save the structure of this class together with\nthe model, in which case we can pass ``model`` (and not ``model.state_dict()``) to the saving function:\n\n"
+        "## Saving and Loading Models with Shapes\n",
+        "When loading model weights, we needed to instantiate the model class first, because the class\n",
+        "defines the structure of a network. We might want to save the structure of this class together with\n",
+        "the model, in which case we can pass ``model`` (and not ``model.state_dict()``) to the saving function:\n",
+        "\n"
       ]
     },
     {
@@ -94,7 +113,8 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "We can then load the model like this:\n\n"
+        "We can then load the model like this:\n",
+        "\n"
       ]
     },
     {
@@ -112,14 +132,17 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "<div class=\"alert alert-info\"><h4>Note</h4><p>This approach uses Python [pickle](https://docs.python.org/3/library/pickle.html) module when serializing the model, thus it relies on the actual class definition to be available when loading the model.</p></div>\n\n"
+        "<div class=\"alert alert-info\"><h4>Note</h4><p>This approach uses the Python <a href=\"https://docs.python.org/3/library/pickle.html\">pickle</a> module when serializing the model, thus it relies on the actual class definition to be available when loading the model.</p></div>\n",
+        "\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Related Tutorials\n[Saving and Loading a General Checkpoint in PyTorch](https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html)\n\n"
+        "## Related Tutorials\n",
+        "[Saving and Loading a General Checkpoint in PyTorch](https://pytorch.org/tutorials/recipes/recipes/saving_and_loading_a_general_checkpoint.html)\n",
+        "\n"
       ]
     }
   ],
@@ -144,4 +167,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
\ No newline at end of file
+}