Exercise 1
Becoming a backprop ninja - Exercise 1
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
# read in all the words
words = open('names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])
32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?
def build_dataset(words):
  X, Y = [], []
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append
  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))
Xtr, Ytr = build_dataset(words[:n1]) # 80%
Xdev, Ydev = build_dataset(words[n1:n2]) # 10%
Xte, Yte = build_dataset(words[n2:]) # 10%
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])
# ok boilerplate done, now we get to the action:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((vocab_size, n_embd), generator=g)
# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.1 # using b1 just for fun, it's useless because of BN
# Layer 2
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1
b2 = torch.randn(vocab_size, generator=g) * 0.1
# BatchNorm parameters
bngain = torch.randn((1, n_hidden))*0.1 + 1.0
bnbias = torch.randn((1, n_hidden))*0.1
# Note: I am initializing many of these parameters in non-standard ways
# because sometimes initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass.
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True
4137
batch_size = 32
n = batch_size # a shorter variable also, for convenience
# construct a minibatch
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
# forward pass, "chunkated" into smaller steps that are possible to backward one at a time
emb = C[Xb] # embed the characters into vectors
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
# Linear layer 1
hprebn = embcat @ W1 + b1 # hidden layer pre-activation
# BatchNorm layer
bnmeani = 1/n*hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff**2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
bnvar_inv = (bnvar + 1e-5)**-0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias
# Non-linearity
h = torch.tanh(hpreact) # hidden layer
# Linear layer 2
logits = h @ W2 + b2 # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True) #DONE
counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact... #DONE
probs = counts * counts_sum_inv #DONE
logprobs = probs.log() #DONE
loss = -logprobs[range(n), Yb].mean() #DONE
# PyTorch backward pass
for p in parameters:
  p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
          bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
          embcat, emb]:
  t.retain_grad()
loss.backward()
loss
tensor(3.3221, grad_fn=<NegBackward0>)
EXERCISE 1
print(logprobs.shape)
logprobs[range(n), Yb]
torch.Size([32, 27])
tensor([-4.0562, -3.0820, -3.6629, -3.2621, -4.1229, -3.4201, -3.2428, -3.9554, -3.1259, -4.2500, -3.1582, -1.6256, -2.8483, -2.9654, -2.9990, -3.1882, -3.9132, -3.0643, -3.5065, -3.5153, -2.8832, -3.0837, -4.2941, -4.0007, -3.4440, -2.9220, -3.1386, -3.8946, -2.6488, -3.5292, -3.3408, -3.1560], grad_fn=<IndexBackward0>)
print(Yb.shape)
Yb
torch.Size([32])
tensor([ 8, 14, 15, 22, 0, 19, 9, 14, 5, 1, 20, 3, 8, 14, 12, 0, 11, 0, 26, 9, 25, 0, 1, 1, 7, 18, 9, 3, 5, 9, 0, 18])
# simple breakdown
# here we have 32 examples; for explanation, assume we only had 3 in total, i.e. a, b, c
# loss = -(a + b + c) / 3           ==> so we are basically doing the mean calculation here
# loss = -(1/3*a + 1/3*b + 1/3*c)   ==> same equation, rewritten
# now take the derivative wrt a:
# dloss/da = -1/3, where 3 is the number of elements we consider, so in general
# dloss/da = -1/n (and likewise for b and c: every picked element gets the same -1/n gradient)
# So based on our calculation above
dlogprobs = torch.zeros_like(logprobs) # same as torch.zeros((32, 27)); we need the same shape as logprobs, this just avoids hardcoding it
dlogprobs[range(n), Yb] = -1.0/n # only the logprobs picked out by the loss get a gradient; every other entry stays 0
#Now, lets check
cmp('logprobs', dlogprobs, logprobs)
logprobs | exact: True | approximate: True | maxdiff: 0.0
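A quick optional sanity check (mine, not from the lecture), reproducing the a, b, c toy example with autograd; each element should come out with a gradient of -1/n:
toy_x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True) # hypothetical stand-in for (a, b, c)
toy_loss = -toy_x.mean() # loss = -(a + b + c) / 3
toy_loss.backward()
print(toy_x.grad) # tensor([-0.3333, -0.3333, -0.3333]), i.e. -1/n for every element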
dprobs = (1.0/probs) * dlogprobs # logprobs = probs.log(), and d/dx(log(x)) = 1/x
# then we multiply by dlogprobs (the gradient we calculated just before this) because of the chain rule
cmp('probs', dprobs, probs)
probs | exact: True | approximate: True | maxdiff: 0.0
# probs = counts * counts_sum_inv. Before doing the multiplication, take a look at the shapes with `.shape`:
# `counts` is 32x27 while `counts_sum_inv` is 32x1
# So in the forward pass there is 'broadcasting' happening first: each value of counts_sum_inv is replicated across its row
# Toy example with a 3x3 `a` and a 3x1 `b`:
# c = a * b
# a[3x3] * b[3x1] ---->
# c[1,1]=a[1,1]*b1   c[1,2]=a[1,2]*b1   c[1,3]=a[1,3]*b1
# c[2,1]=a[2,1]*b2   c[2,2]=a[2,2]*b2   c[2,3]=a[2,3]*b2
# c[3,1]=a[3,1]*b3   c[3,2]=a[3,2]*b3   c[3,3]=a[3,3]*b3
# ====> c[3x3]
# The point is that two things happen internally: the broadcasting, and then the backpropagation through it
# (first case) The derivative of c wrt b is a
# So here the local derivative is just `counts`, which we multiply by `dprobs` because of the chain rule
# Finally, to give `dcounts_sum_inv` the same shape as `counts_sum_inv`, we sum along dim 1 with keepdims=True:
# each b value was used in every column of its row, so those contributions add up
dcounts_sum_inv = (counts * dprobs).sum(1, keepdims=True) # So this is our final manually calculated gradient
cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
counts_sum_inv | exact: True | approximate: True | maxdiff: 0.0
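An optional toy illustration (my own sketch) of why the broadcast dimension has to be summed out: a 3x1 tensor that gets replicated across columns in the forward pass collects its gradient as the sum over those columns.
toy_a = torch.randn(3, 3)
toy_b = torch.randn(3, 1, requires_grad=True)
toy_c = (toy_a * toy_b).sum()
toy_c.backward()
print(torch.allclose(toy_b.grad, toy_a.sum(1, keepdim=True))) # True: the per-column contributions add up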
# Here we calculate the first contribution to `dcounts`, i.e. (second case) the derivative of c wrt a is b
dcounts = counts_sum_inv * dprobs
# but we can't compare it yet, as `counts` also feeds into counts_sum; that second contribution is added below
# counts_sum_inv = counts_sum**-1
# Okay so for this, the derivative of x^-1 is -(x^-2)
dcounts_sum = (-counts_sum**-2) * dcounts_sum_inv # remember: this is the step just before the `26:26 to 27:56 first contribution of counts` section
cmp('counts_sum', dcounts_sum, counts_sum)
counts_sum | exact: True | approximate: True | maxdiff: 0.0
# counts_sum = counts.sum(1, keepdims=True)
# The shape of counts_sum is 32x1 and the shape of counts is 32x27, so dcounts_sum needs to be broadcast back across the 27 columns:
# every element of a row contributed equally to that row's sum, so each one receives the same gradient
dcounts += torch.ones_like(counts) * dcounts_sum # note the `+=`: this is the second contribution to dcounts, added to the one we calculated above
cmp('counts', dcounts, counts)
counts | exact: True | approximate: True | maxdiff: 0.0
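Another optional aside: whenever a tensor is used in more than one place (as counts is here), the gradient contributions from each use simply add, which is why the `+=` pattern keeps showing up. A tiny autograd check:
twice_x = torch.tensor(2.0, requires_grad=True)
twice_y = twice_x * 3 + twice_x**2 # twice_x appears twice
twice_y.backward()
print(twice_x.grad) # 3 + 2*twice_x = 7.0, the sum of the two contributions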
# counts = norm_logits.exp()
# The derivative of e^x is (famously) e^x, so the local derivative of `norm_logits.exp()` is just `norm_logits.exp()` itself,
# which we can write directly as `counts` since it already holds that value
dnorm_logits = counts * dcounts
cmp('norm_logits', dnorm_logits, norm_logits)
norm_logits | exact: True | approximate: True | maxdiff: 0.0
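A one-line optional autograd check of the fact used above, d/dx e^x = e^x:
exp_x = torch.randn(4, requires_grad=True)
exp_x.exp().sum().backward()
print(torch.allclose(exp_x.grad, exp_x.detach().exp())) # True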
# norm_logits = logits - logit_maxes
# Look at the shapes of these variables: logits is 32x27 but logit_maxes is 32x1, so logit_maxes gets broadcast across each row before the subtraction
dlogits = dnorm_logits.clone() # the local derivative wrt logits is 1, so the gradient just passes through
dlogit_maxes = (-dnorm_logits).sum(1, keepdim=True) # the minus sign comes from the subtraction; the sum over dim 1 collapses the broadcast, same as before
cmp('logit_maxes', dlogit_maxes, logit_maxes)
logit_maxes | exact: True | approximate: True | maxdiff: 0.0
# logit_maxes = logits.max(1, keepdim=True).values
# This step is similar to the first one (`dlogprobs`), where we routed the gradient into selected positions with torch.zeros_like();
# F.one_hot is an alternative way of doing the same thing: only the position of each row's max receives the gradient
dlogits += F.one_hot(logits.max(1).indices, num_classes=logits.shape[1]) * dlogit_maxes # remember the `+=` here, as we already have one contribution to dlogits above
cmp('logits', dlogits, logits)
logits | exact: True | approximate: True | maxdiff: 0.0
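An optional check that ties this step together: shifting every logit in a row by the same constant does not change the softmax probabilities, so the loss is insensitive to logit_maxes and its gradient should be numerically ~0.
print(dlogit_maxes.abs().max()) # expected to be ~0 (up to floating point noise)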
# # Linear layer 2
# logits = h @ W2 + b2 # output layer
# note that `b2` gets broadcast over the batch dimension
# we need these shapes to understand the matrix multiplications below, i.e. why we multiply what with what
dlogits.shape, h.shape, W2.shape, b2.shape
(torch.Size([32, 27]), torch.Size([32, 64]), torch.Size([64, 27]), torch.Size([27]))
# watch the last few minutes of that section (from around 51:00) to see how he breaks this down based on the matrix shapes
dh = dlogits @ W2.T
dW2 = h.T @ dlogits
db2 = dlogits.sum(0)
cmp('h', dh, h)
cmp('W2', dW2, W2)
cmp('b2', db2, b2)
h | exact: True | approximate: True | maxdiff: 0.0
W2 | exact: True | approximate: True | maxdiff: 0.0
b2 | exact: True | approximate: True | maxdiff: 0.0
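A small self-contained check (shapes are made up for illustration) of the matmul backprop rules used above: for Y = X @ W + b, dX = dY @ W.T, dW = X.T @ dY and db = dY.sum(0).
lin_x = torch.randn(4, 5, requires_grad=True)
lin_w = torch.randn(5, 3, requires_grad=True)
lin_b = torch.randn(3, requires_grad=True)
lin_y = lin_x @ lin_w + lin_b
lin_dy = torch.randn_like(lin_y) # a pretend upstream gradient
lin_y.backward(lin_dy)
print(torch.allclose(lin_x.grad, lin_dy @ lin_w.T)) # True
print(torch.allclose(lin_w.grad, lin_x.T @ lin_dy)) # True
print(torch.allclose(lin_b.grad, lin_dy.sum(0))) # True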
53:37 to 55:12 cmp('hpreact', dhpreact, hpreact)
# h = torch.tanh(hpreact) # hidden layer
dhpreact = (1.0 - h**2)*dh # the derivative of tanh is 1 - a^2, where a = tanh(z) is the output of tanh (here `h`), not its input z
cmp('hpreact', dhpreact, hpreact)
hpreact | exact: True | approximate: True | maxdiff: 0.0
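Optional numerical check of the identity used above, d/dz tanh(z) = 1 - tanh(z)**2:
tanh_z = torch.linspace(-2, 2, 5, requires_grad=True)
tanh_a = torch.tanh(tanh_z)
tanh_a.sum().backward()
print(torch.allclose(tanh_z.grad, 1 - tanh_a.detach()**2)) # True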
55:13 to 59:38 cmp('bngain', dbngain, bngain)
bnraw.shape, bngain.shape, bnbias.shape, dhpreact.shape
(torch.Size([32, 64]), torch.Size([1, 64]), torch.Size([1, 64]), torch.Size([32, 64]))
# hpreact = bngain * bnraw + bnbias
dbngain = (bnraw * dhpreact).sum(0, keepdim=True) # bnraw and dhpreact are 32x64, but dbngain needs to be 1x64, so we sum over dim 0 (keeping the dimension)
dbnraw = (bngain * dhpreact)
dbnbias = (dhpreact).sum(0, keepdim=True) # dhpreact is 32x64 but dbnbias needs to be 1x64, so we sum over dim 0 (keeping the dimension)
cmp('bngain', dbngain, bngain)
cmp('bnbias', dbnbias, bnbias)
cmp('bnraw', dbnraw, bnraw)
bngain | exact: True | approximate: True | maxdiff: 0.0
bnbias | exact: True | approximate: True | maxdiff: 0.0
bnraw | exact: True | approximate: True | maxdiff: 0.0
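Same broadcasting lesson as before, but now along the batch dimension: an optional toy check (my own) that a (1, k) parameter broadcast over the batch collects its gradient by summing over dim 0.
bcast_x = torch.randn(5, 3)
bcast_gamma = torch.randn(1, 3, requires_grad=True)
bcast_out = bcast_gamma * bcast_x
bcast_g = torch.randn_like(bcast_out) # a pretend upstream gradient
bcast_out.backward(bcast_g)
print(torch.allclose(bcast_gamma.grad, (bcast_x * bcast_g).sum(0, keepdim=True))) # True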
59:40 to 1:04:1 cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
# From here on we are working through the batch norm layer
# the forward pass was deliberately spread out into separate steps (following the equations in the bottom-right box of the batch norm paper - see the previous lecture) in order to make manual backprop easier
bnraw.shape, bndiff.shape, bnvar_inv.shape
(torch.Size([32, 64]), torch.Size([32, 64]), torch.Size([1, 64]))
# bnraw = bndiff * bnvar_inv
dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True)
dbndiff = bnvar_inv * dbnraw #We will come back to this in 1:12:43 - (1)
cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
bnvar_inv | exact: True | approximate: True | maxdiff: 0.0
1:04:15 to 1:05:16 cmp('bnvar', dbnvar, bnvar)
# bnvar_inv = (bnvar + 1e-5)**-0.5
# straightforward power rule: d/dx x^n = n*x^(n-1), here with n = -0.5
dbnvar = (-0.5 * ((bnvar + 1e-5) ** (-1.5))) * dbnvar_inv
cmp('bnvar', dbnvar, bnvar)
bnvar | exact: True | approximate: True | maxdiff: 0.0
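Optional check of the power rule step above, d/dx (x + eps)**-0.5 = -0.5*(x + eps)**-1.5:
pow_x = torch.rand(4, requires_grad=True)
pow_y = (pow_x + 1e-5)**-0.5
pow_y.sum().backward()
print(torch.allclose(pow_x.grad, -0.5*(pow_x.detach() + 1e-5)**-1.5)) # True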
1:05:17 to 1:09:01 - Why Bessel's correction is used here: the batch norm paper is inconsistent, using the biased variance (divide by n) at training time but the unbiased estimate (divide by n-1) at test time. We prefer the unbiased estimate during both training and testing, and that is what we went ahead with.
1:09:02 to 1:12:42 cmp('bndiff2', dbndiff2, bndiff2)
# bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True)
dbndiff2 = 1/(n-1) * torch.ones_like(bndiff2) * dbnvar
cmp('bndiff2', dbndiff2, bndiff2)
bndiff2 | exact: True | approximate: True | maxdiff: 0.0
1:12:43 to 1:13:58 cmp('bndiff', dbndiff, bndiff)
# bndiff2 = bndiff**2
dbndiff += 2*bndiff * dbndiff2 # power rule: d/dx x^2 = 2x; this is the (2)nd contribution to dbndiff (the first was at 59:40), so we add to it here
cmp('bndiff', dbndiff, bndiff)
bndiff | exact: True | approximate: True | maxdiff: 0.0
1:13:59 to 1:18:35 cmp('bnmeani', dbnmeani, bnmeani) and cmp('hprebn', dhprebn, hprebn)
## hprebn appears twice in the forward pass: directly in bndiff = hprebn - bnmeani, and inside bnmeani itself, so its gradient has two contributions that get added together
# bnmeani = 1/n*hprebn.sum(0, keepdim=True)
# bndiff = hprebn - bnmeani
dhprebn = dbndiff.clone() # first contribution: the gradient flows straight through the subtraction
dbnmeani = (-dbndiff).sum(0) # the minus sign comes from the subtraction; the sum over dim 0 collapses the broadcast of bnmeani
dhprebn += (1.0/n)*(torch.ones_like(hprebn) * dbnmeani) # second contribution, through the mean: each row contributed 1/n to bnmeani
cmp('bnmeani', dbnmeani, bnmeani)
cmp('hprebn', dhprebn, hprebn)
bnmeani | exact: True | approximate: True | maxdiff: 0.0
hprebn | exact: True | approximate: True | maxdiff: 0.0
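An optional toy version (names are mine) of the "two paths into hprebn" situation: x feeds into the output both directly and through its own column mean, and the two gradient contributions add.
mean_x = torch.randn(4, 2, requires_grad=True)
mean_out = mean_x - mean_x.mean(0, keepdim=True) # same pattern as bndiff = hprebn - bnmeani
mean_g = torch.randn_like(mean_out) # a pretend upstream gradient
mean_out.backward(mean_g)
mean_manual = mean_g - (1.0/mean_x.shape[0]) * mean_g.sum(0, keepdim=True)
print(torch.allclose(mean_x.grad, mean_manual)) # True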
1:18:36 to 1:20:34 cmp('embcat', dembcat, embcat), cmp('W1', dW1, W1) and cmp('b1', db1, b1)
hprebn.shape, embcat.shape, W1.shape, b1.shape
(torch.Size([32, 64]), torch.Size([32, 30]), torch.Size([30, 64]), torch.Size([64]))
# Forward pass: hprebn = embcat @ W1 + b1
dembcat = dhprebn @ W1.T
dW1 = embcat.T @ dhprebn
db1 = dhprebn.sum(0)
cmp('embcat', dembcat, embcat)
cmp('W1', dW1, W1)
cmp('b1', db1, b1)
embcat | exact: True | approximate: True | maxdiff: 0.0
W1 | exact: True | approximate: True | maxdiff: 0.0
b1 | exact: True | approximate: True | maxdiff: 0.0
1:20:35 to 1:21:58 cmp('emb', demb, emb)
## embcat is just emb reshaped with .view(), so the backward pass simply reshapes the gradient back into emb's shape
# embcat = emb.view(emb.shape[0], -1)
demb = dembcat.view(emb.shape)
cmp('emb', demb, emb)
emb | exact: True | approximate: True | maxdiff: 0.0
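An optional aside: .view() only rearranges numbers, so its backward pass just rearranges the upstream gradient back, as this toy check shows.
view_x = torch.randn(2, 3, 4, requires_grad=True)
view_y = view_x.view(2, 12)
view_g = torch.randn_like(view_y) # a pretend upstream gradient
view_y.backward(view_g)
print(torch.allclose(view_x.grad, view_g.view(2, 3, 4))) # True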
1:21:59 to cmp('C', dC, C)
## emb = C[Xb] just plucks out rows of C, so in the backward pass each gradient in demb is routed back (and accumulated) into the row of C it came from
# emb = C[Xb]
dC = torch.zeros_like(C)
for k in range(Xb.shape[0]):
  for j in range(Xb.shape[1]):
    ix = Xb[k,j]
    dC[ix] += demb[k,j]
cmp('C', dC, C)
C | exact: True | approximate: True | maxdiff: 0.0
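Optional: the same scatter-add as the double loop above, but vectorized with index_add_ on the flattened indices (this reuses Xb, demb, C and dC from the cells above).
dC_fast = torch.zeros_like(C)
dC_fast.index_add_(0, Xb.view(-1), demb.view(-1, demb.shape[-1]))
print(torch.allclose(dC_fast, dC)) # True: same result as the explicit loops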
And we are done with the first exercise!!
# Exercise 1: backprop through the whole thing manually,
# backpropagating through exactly all of the variables
# as they are defined in the forward pass above, one by one
# -----------------
# YOUR CODE HERE :)
# -----------------
# cmp('logprobs', dlogprobs, logprobs)
# cmp('probs', dprobs, probs)
# cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
# cmp('counts_sum', dcounts_sum, counts_sum)
# cmp('counts', dcounts, counts)
# cmp('norm_logits', dnorm_logits, norm_logits)
# cmp('logit_maxes', dlogit_maxes, logit_maxes)
# cmp('logits', dlogits, logits)
# cmp('h', dh, h)
# cmp('W2', dW2, W2)
# cmp('b2', db2, b2)
# cmp('hpreact', dhpreact, hpreact)
# cmp('bngain', dbngain, bngain)
# cmp('bnbias', dbnbias, bnbias)
# cmp('bnraw', dbnraw, bnraw)
# cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
# cmp('bnvar', dbnvar, bnvar)
# cmp('bndiff2', dbndiff2, bndiff2)
# cmp('bndiff', dbndiff, bndiff)
# cmp('bnmeani', dbnmeani, bnmeani)
# cmp('hprebn', dhprebn, hprebn)
# cmp('embcat', dembcat, embcat)
# cmp('W1', dW1, W1)
# cmp('b1', db1, b1)
# cmp('emb', demb, emb)
# cmp('C', dC, C)