Exercise 1
Becoming a backprop ninja - Exercise 1
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
# read in all the words
words = open('names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])
32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?
def build_dataset(words):
  X, Y = [], []
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtr, Ytr = build_dataset(words[:n1]) # 80%
Xdev, Ydev = build_dataset(words[n1:n2]) # 10%
Xte, Yte = build_dataset(words[n2:]) # 10%
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
# ok boilerplate done, now we get to the action:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((vocab_size, n_embd), generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.1 # using b1 just for fun, it's useless because of BN
# Layer 2
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden))*0.1 + 1.0
bnbias = torch.randn((1, n_hidden))*0.1
# Note: I am initializing many of these parameters in non-standard ways
# because sometimes initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass.
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True
4137
batch_size = 32
n = batch_size # a shorter variable also, for convenience
# construct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
# forward pass, "chunkated" into smaller steps that are possible to backward one at a time
emb = C[Xb] # embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
# Linear layer 1
hprebn = embcat @ W1 + b1 # hidden layer pre-activation
# BatchNorm layer
bnmeani = 1/n*hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
bnvar_inv = (bnvar + 1e-5)**-0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias
# Non-linearity
h = torch.tanh(hpreact) # hidden layer
# Linear layer 2
logits = h @ W2 + b2 # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True) #DONE
counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact... #DONE
probs = counts * counts_sum_inv #DONE
logprobs = probs.log() #DONE
loss = -logprobs[range(n), Yb].mean() #DONE
# PyTorch backward pass
for p in parameters:
  p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
          bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
          embcat, emb]:
  t.retain_grad()
loss.backward()
loss
tensor(3.3221, grad_fn=<NegBackward0>)
EXERCISE 1
print(logprobs.shape)
logprobs[range(n), Yb]
torch.Size([32, 27])
tensor([-4.0562, -3.0820, -3.6629, -3.2621, -4.1229, -3.4201, -3.2428, -3.9554, -3.1259, -4.2500, -3.1582, -1.6256, -2.8483, -2.9654, -2.9990, -3.1882, -3.9132, -3.0643, -3.5065, -3.5153, -2.8832, -3.0837, -4.2941, -4.0007, -3.4440, -2.9220, -3.1386, -3.8946, -2.6488, -3.5292, -3.3408, -3.1560], grad_fn=<IndexBackward0>)
print(Yb.shape)
Yb
torch.Size([32])
tensor([ 8, 14, 15, 22, 0, 19, 9, 14, 5, 1, 20, 3, 8, 14, 12, 0, 11, 0, 26, 9, 25, 0, 1, 1, 7, 18, 9, 3, 5, 9, 0, 18])
# simple breakdown
# here we have 32 examples; for explanation, assume we only had 3 in total, i.e. a, b, c
# loss = -(a + b + c) / 3           ==> so we are basically doing the mean calculation here
# loss = -(1/3*a + 1/3*b + 1/3*c)   ==> same equation, rewritten
# now take the derivative wrt a:
# dloss/da = -1/3, where 3 is the number of elements we consider, so in general
# dloss/da = -1/n (and likewise for b and c: every picked element gets the same -1/n gradient)
# So based on our calculation above
dlogprobs = torch.zeros_like(logprobs) # same as torch.zeros((32, 27)); we need the same shape as logprobs, this just avoids hardcoding it
dlogprobs[range(n), Yb] = -1.0/n # only the logprobs picked out by the loss get a gradient; every other entry stays 0
#Now, lets check
cmp('logprobs', dlogprobs, logprobs)
logprobs | exact: True | approximate: True | maxdiff: 0.0
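A quick optional sanity check (mine, not from the lecture), reproducing the a, b, c toy example with autograd; each element should come out with a gradient of -1/n:
toy_x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True) # hypothetical stand-in for (a, b, c)
toy_loss = -toy_x.mean() # loss = -(a + b + c) / 3
toy_loss.backward()
print(toy_x.grad) # tensor([-0.3333, -0.3333, -0.3333]), i.e. -1/n for every element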
dprobs = (1.0/probs) * dlogprobs # logprobs = probs.log(), and d/dx(log(x)) = 1/x
# then we multiply by dlogprobs (the gradient we calculated just before this) because of the chain rule
cmp('probs', dprobs, probs)
probs | exact: True | approximate: True | maxdiff: 0.0
# probs = counts * counts_sum_inv. Before doing the multiplication, take a look at the shapes with `.shape`:
# `counts` is 32x27 while `counts_sum_inv` is 32x1
# So in the forward pass there is 'broadcasting' happening first: each value of counts_sum_inv is replicated across its row
# Toy example with a 3x3 `a` and a 3x1 `b`:
# c = a * b
# a[3x3] * b[3x1] ---->
# c[1,1]=a[1,1]*b1   c[1,2]=a[1,2]*b1   c[1,3]=a[1,3]*b1
# c[2,1]=a[2,1]*b2   c[2,2]=a[2,2]*b2   c[2,3]=a[2,3]*b2
# c[3,1]=a[3,1]*b3   c[3,2]=a[3,2]*b3   c[3,3]=a[3,3]*b3
# ====> c[3x3]
# The point is that two things happen internally: the broadcasting, and then the backpropagation through it
# (first case) The derivative of c wrt b is a
# So here the local derivative is just `counts`, which we multiply by `dprobs` because of the chain rule
# Finally, to give `dcounts_sum_inv` the same shape as `counts_sum_inv`, we sum along dim 1 with keepdims=True:
# each b value was used in every column of its row, so those contributions add up
dcounts_sum_inv = (counts * dprobs).sum(1, keepdims=True) # So this is our final manually calculated gradient
cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
counts_sum_inv | exact: True | approximate: True | maxdiff: 0.0
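An optional toy illustration (my own sketch) of why the broadcast dimension has to be summed out: a 3x1 tensor that gets replicated across columns in the forward pass collects its gradient as the sum over those columns.
toy_a = torch.randn(3, 3)
toy_b = torch.randn(3, 1, requires_grad=True)
toy_c = (toy_a * toy_b).sum()
toy_c.backward()
print(torch.allclose(toy_b.grad, toy_a.sum(1, keepdim=True))) # True: the per-column contributions add up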
# Here we calculate the first contribution to `dcounts`, i.e. (second case) the derivative of c wrt a is b
dcounts = counts_sum_inv * dprobs
# but we can't compare it yet, as `counts` also feeds into counts_sum; that second contribution is added below
# counts_sum_inv = counts_sum**-1
# Okay so for this, the derivative of x^-1 is -(x^-2)
dcounts_sum = (-counts_sum**-2) * dcounts_sum_inv # remember: this is the step just before the `26:26 to 27:56 first contribution of counts` section
cmp('counts_sum', dcounts_sum, counts_sum)
counts_sum | exact: True | approximate: True | maxdiff: 0.0
# counts_sum = counts.sum(1, keepdims=True)
# The shape of counts_sum is 32x1 and the shape of counts is 32x27, so dcounts_sum needs to be broadcast back across the 27 columns:
# every element of a row contributed equally to that row's sum, so each one receives the same gradient
dcounts += torch.ones_like(counts) * dcounts_sum # note the `+=`: this is the second contribution to dcounts, added to the one we calculated above
cmp('counts', dcounts, counts)
counts | exact: True | approximate: True | maxdiff: 0.0
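Another optional aside: whenever a tensor is used in more than one place (as counts is here), the gradient contributions from each use simply add, which is why the `+=` pattern keeps showing up. A tiny autograd check:
twice_x = torch.tensor(2.0, requires_grad=True)
twice_y = twice_x * 3 + twice_x**2 # twice_x appears twice
twice_y.backward()
print(twice_x.grad) # 3 + 2*twice_x = 7.0, the sum of the two contributions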
# counts = norm_logits.exp()
# The derivative of e^x is (famously) e^x, so the local derivative of `norm_logits.exp()` is just `norm_logits.exp()` itself,
# which we can write directly as `counts` since it already holds that value
dnorm_logits = counts * dcounts
cmp('norm_logits', dnorm_logits, norm_logits)
norm_logits | exact: True | approximate: True | maxdiff: 0.0
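A one-line optional autograd check of the fact used above, d/dx e^x = e^x:
exp_x = torch.randn(4, requires_grad=True)
exp_x.exp().sum().backward()
print(torch.allclose(exp_x.grad, exp_x.detach().exp())) # True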
# norm_logits = logits - logit_maxes
# Look at the shapes of these variables: logits is 32x27 but logit_maxes is 32x1, so logit_maxes gets broadcast across each row before the subtraction
dlogits = dnorm_logits.clone() # the local derivative wrt logits is 1, so the gradient just passes through
dlogit_maxes = (-dnorm_logits).sum(1, keepdim=True) # the minus sign comes from the subtraction; the sum over dim 1 collapses the broadcast, same as before
cmp('logit_maxes', dlogit_maxes, logit_maxes)
logit_maxes | exact: True | approximate: True | maxdiff: 0.0
# logit_maxes = logits.max(1, keepdim=True).values
# This step is similar to the first one (`dlogprobs`), where we routed the gradient into selected positions with torch.zeros_like();
# F.one_hot is an alternative way of doing the same thing: only the position of each row's max receives the gradient
dlogits += F.one_hot(logits.max(1).indices, num_classes=logits.shape[1]) * dlogit_maxes # remember the `+=` here, as we already have one contribution to dlogits above
cmp('logits', dlogits, logits)
logits | exact: True | approximate: True | maxdiff: 0.0
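An optional check that ties this step together: shifting every logit in a row by the same constant does not change the softmax probabilities, so the loss is insensitive to logit_maxes and its gradient should be numerically ~0.
print(dlogit_maxes.abs().max()) # expected to be ~0 (up to floating point noise)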
# # Linear layer 2
# logits = h @ W2 + b2 # output layer
# note that `b2` gets broadcast over the batch dimension
# we need these shapes to understand the matrix multiplications below, i.e. why we multiply what with what
dlogits.shape, h.shape, W2.shape, b2.shape
(torch.Size([32, 27]), torch.Size([32, 64]), torch.Size([64, 27]), torch.Size([27]))
# watch the last few minutes of that section (from around 51:00) to see how he breaks this down based on the matrix shapes
dh = dlogits @ W2.T
dW2 = h.T @ dlogits
db2 = dlogits.sum(0)
cmp('h', dh, h)
cmp('W2', dW2, W2)
cmp('b2', db2, b2)
h | exact: True | approximate: True | maxdiff: 0.0
W2 | exact: True | approximate: True | maxdiff: 0.0
b2 | exact: True | approximate: True | maxdiff: 0.0
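A small self-contained check (shapes are made up for illustration) of the matmul backprop rules used above: for Y = X @ W + b, dX = dY @ W.T, dW = X.T @ dY and db = dY.sum(0).
lin_x = torch.randn(4, 5, requires_grad=True)
lin_w = torch.randn(5, 3, requires_grad=True)
lin_b = torch.randn(3, requires_grad=True)
lin_y = lin_x @ lin_w + lin_b
lin_dy = torch.randn_like(lin_y) # a pretend upstream gradient
lin_y.backward(lin_dy)
print(torch.allclose(lin_x.grad, lin_dy @ lin_w.T)) # True
print(torch.allclose(lin_w.grad, lin_x.T @ lin_dy)) # True
print(torch.allclose(lin_b.grad, lin_dy.sum(0))) # True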
53:37 to 55:12 cmp('hpreact', dhpreact, hpreact)
# h = torch.tanh(hpreact) # hidden layer
dhpreact = (1.0 - h**2)*dh # the derivative of tanh is 1 - a^2, where a = tanh(z) is the output of tanh (here `h`), not its input z
cmp('hpreact', dhpreact, hpreact)
hpreact | exact: True | approximate: True | maxdiff: 0.0
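Optional numerical check of the identity used above, d/dz tanh(z) = 1 - tanh(z)**2:
tanh_z = torch.linspace(-2, 2, 5, requires_grad=True)
tanh_a = torch.tanh(tanh_z)
tanh_a.sum().backward()
print(torch.allclose(tanh_z.grad, 1 - tanh_a.detach()**2)) # True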
55:13 to 59:38 cmp('bngain', dbngain, bngain)
bnraw.shape, bngain.shape, bnbias.shape, dhpreact.shape
(torch.Size([32, 64]), torch.Size([1, 64]), torch.Size([1, 64]), torch.Size([32, 64]))
# hpreact = bngain * bnraw + bnbias
dbngain = (bnraw * dhpreact).sum(0, keepdim=True) # bnraw and dhpreact are 32x64, but dbngain needs to be 1x64, so we sum over dim 0 (keeping the dimension)
dbnraw = (bngain * dhpreact)
dbnbias = (dhpreact).sum(0, keepdim=True) # dhpreact is 32x64 but dbnbias needs to be 1x64, so we sum over dim 0 (keeping the dimension)
cmp('bngain', dbngain, bngain)
cmp('bnbias', dbnbias, bnbias)
cmp('bnraw', dbnraw, bnraw)
bngain | exact: True | approximate: True | maxdiff: 0.0
bnbias | exact: True | approximate: True | maxdiff: 0.0
bnraw | exact: True | approximate: True | maxdiff: 0.0
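Same broadcasting lesson as before, but now along the batch dimension: an optional toy check (my own) that a (1, k) parameter broadcast over the batch collects its gradient by summing over dim 0.
bcast_x = torch.randn(5, 3)
bcast_gamma = torch.randn(1, 3, requires_grad=True)
bcast_out = bcast_gamma * bcast_x
bcast_g = torch.randn_like(bcast_out) # a pretend upstream gradient
bcast_out.backward(bcast_g)
print(torch.allclose(bcast_gamma.grad, (bcast_x * bcast_g).sum(0, keepdim=True))) # True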
59:40 to 1:04:1 cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
# From here on we are working through the batch norm layer
# the forward pass was deliberately spread out into separate steps (following the equations in the bottom-right box of the batch norm paper - see the previous lecture) in order to make manual backprop easier
bnraw.shape, bndiff.shape, bnvar_inv.shape
(torch.Size([32, 64]), torch.Size([32, 64]), torch.Size([1, 64]))
# bnraw = bndiff * bnvar_inv
dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
dbndiff = bnvar_inv * dbnraw #We will come back to this in 1:12:43 - (1)
cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
bnvar_inv | exact: True | approximate: True | maxdiff: 0.0
1:04:15 to 1:05:16 cmp('bnvar', dbnvar, bnvar)
# bnvar_inv = (bnvar + 1e-5)**-0.5
# straightforward power rule: d/dx x^n = n*x^(n-1), here with n = -0.5
dbnvar = (-0.5 * ((bnvar + 1e-5) ** (-1.5))) * dbnvar_inv
cmp('bnvar', dbnvar, bnvar)
bnvar | exact: True | approximate: True | maxdiff: 0.0
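Optional check of the power rule step above, d/dx (x + eps)**-0.5 = -0.5*(x + eps)**-1.5:
pow_x = torch.rand(4, requires_grad=True)
pow_y = (pow_x + 1e-5)**-0.5
pow_y.sum().backward()
print(torch.allclose(pow_x.grad, -0.5*(pow_x.detach() + 1e-5)**-1.5)) # True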
1:05:17 to 1:09:01 - Why Bessel's correction is used here: the batch norm paper is inconsistent, using the biased variance (divide by n) at training time but the unbiased estimate (divide by n-1) at test time. We prefer the unbiased estimate during both training and testing, and that is what we went ahead with.
1:09:02 to 1:12:42 cmp('bndiff2', dbndiff2, bndiff2)
# bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True)
dbndiff2 = 1/(n-1) * torch.ones_like(bndiff2) * dbnvar
cmp('bndiff2', dbndiff2, bndiff2)
bndiff2 | exact: True | approximate: True | maxdiff: 0.0
1:12:43 to 1:13:58 cmp('bndiff', dbndiff, bndiff)
# bndiff2 = bndiff**2
dbndiff += 2*bndiff * dbndiff2 # power rule: d/dx x^2 = 2x; this is the (2)nd contribution to dbndiff (the first was at 59:40), so we add to it here
cmp('bndiff', dbndiff, bndiff)
bndiff | exact: True | approximate: True | maxdiff: 0.0
1:13:59 to 1:18:35 cmp('bnmeani', dbnmeani, bnmeani) and cmp('hprebn', dhprebn, hprebn)
## hprebn appears twice in the forward pass: directly in bndiff = hprebn - bnmeani, and inside bnmeani itself, so its gradient has two contributions that get added together
# bnmeani = 1/n*hprebn.sum(0, keepdim=True)
# bndiff = hprebn - bnmeani
dhprebn = dbndiff.clone() # first contribution: the gradient flows straight through the subtraction
dbnmeani = (-dbndiff).sum(0) # the minus sign comes from the subtraction; the sum over dim 0 collapses the broadcast of bnmeani
dhprebn += (1.0/n)*(torch.ones_like(hprebn) * dbnmeani) # second contribution, through the mean: each row contributed 1/n to bnmeani
cmp('bnmeani', dbnmeani, bnmeani)
cmp('hprebn', dhprebn, hprebn)
bnmeani | exact: True | approximate: True | maxdiff: 0.0
hprebn | exact: True | approximate: True | maxdiff: 0.0
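An optional toy version (names are mine) of the "two paths into hprebn" situation: x feeds into the output both directly and through its own column mean, and the two gradient contributions add.
mean_x = torch.randn(4, 2, requires_grad=True)
mean_out = mean_x - mean_x.mean(0, keepdim=True) # same pattern as bndiff = hprebn - bnmeani
mean_g = torch.randn_like(mean_out) # a pretend upstream gradient
mean_out.backward(mean_g)
mean_manual = mean_g - (1.0/mean_x.shape[0]) * mean_g.sum(0, keepdim=True)
print(torch.allclose(mean_x.grad, mean_manual)) # True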
1:18:36 to 1:20:34 cmp('embcat', dembcat, embcat), cmp('W1', dW1, W1) and cmp('b1', db1, b1)
hprebn.shape, embcat.shape, W1.shape, b1.shape
(torch.Size([32, 64]), torch.Size([32, 30]), torch.Size([30, 64]), torch.Size([64]))
# Forward pass: hprebn = embcat @ W1 + b1
dembcat = dhprebn @ W1.T
dW1 = embcat.T @ dhprebn
db1 = dhprebn.sum(0)
cmp('embcat', dembcat, embcat)
cmp('W1', dW1, W1)
cmp('b1', db1, b1)
embcat | exact: True | approximate: True | maxdiff: 0.0
W1 | exact: True | approximate: True | maxdiff: 0.0
b1 | exact: True | approximate: True | maxdiff: 0.0
1:20:35 to 1:21:58 cmp('emb', demb, emb)
## embcat is just emb reshaped with .view(), so the backward pass simply reshapes the gradient back into emb's shape
# embcat = emb.view(emb.shape[0], -1)
demb = dembcat.view(emb.shape)
cmp('emb', demb, emb)
emb | exact: True | approximate: True | maxdiff: 0.0
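An optional aside: .view() only rearranges numbers, so its backward pass just rearranges the upstream gradient back, as this toy check shows.
view_x = torch.randn(2, 3, 4, requires_grad=True)
view_y = view_x.view(2, 12)
view_g = torch.randn_like(view_y) # a pretend upstream gradient
view_y.backward(view_g)
print(torch.allclose(view_x.grad, view_g.view(2, 3, 4))) # True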
1:21:59 to cmp('C', dC, C)
## emb = C[Xb] just plucks out rows of C, so in the backward pass each gradient in demb is routed back (and accumulated) into the row of C it came from
# emb = C[Xb]
dC = torch.zeros_like(C)
for k in range(Xb.shape[0]):
  for j in range(Xb.shape[1]):
    ix = Xb[k,j]
    dC[ix] += demb[k,j]
cmp('C', dC, C)
C | exact: True | approximate: True | maxdiff: 0.0
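Optional: the same scatter-add as the double loop above, but vectorized with index_add_ on the flattened indices (this reuses Xb, demb, C and dC from the cells above).
dC_fast = torch.zeros_like(C)
dC_fast.index_add_(0, Xb.view(-1), demb.view(-1, demb.shape[-1]))
print(torch.allclose(dC_fast, dC)) # True: same result as the explicit loops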
And we are done with the first exercise!!
# Exercise 1: backprop through the whole thing manually,
# backpropagating through exactly all of the variables
# as they are defined in the forward pass above, one by one
# -----------------
# YOUR CODE HERE :)
# -----------------
# cmp('logprobs', dlogprobs, logprobs)
# cmp('probs', dprobs, probs)
# cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
# cmp('counts_sum', dcounts_sum, counts_sum)
# cmp('counts', dcounts, counts)
# cmp('norm_logits', dnorm_logits, norm_logits)
# cmp('logit_maxes', dlogit_maxes, logit_maxes)
# cmp('logits', dlogits, logits)
# cmp('h', dh, h)
# cmp('W2', dW2, W2)
# cmp('b2', db2, b2)
# cmp('hpreact', dhpreact, hpreact)
# cmp('bngain', dbngain, bngain)
# cmp('bnbias', dbnbias, bnbias)
# cmp('bnraw', dbnraw, bnraw)
# cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
# cmp('bnvar', dbnvar, bnvar)
# cmp('bndiff2', dbndiff2, bndiff2)
# cmp('bndiff', dbndiff, bndiff)
# cmp('bnmeani', dbnmeani, bnmeani)
# cmp('hprebn', dhprebn, hprebn)
# cmp('embcat', dembcat, embcat)
# cmp('W1', dW1, W1)
# cmp('b1', db1, b1)
# cmp('emb', demb, emb)
# cmp('C', dC, C)