from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
model01 = drive.CreateFile({'id':'1HgAJKxLosoXgOrOX3bHsnsbz0W7UOh_P'})
model01.GetContentFile('nb_01.py')
model02 = drive.CreateFile({'id':'1Dst8o_GVEBrUKV-1ZdVzhPhWzI5m5YtC'})
model02.GetContentFile('nb_02.py')
model03 = drive.CreateFile({'id':'1QRI2ewvOdP6ipjHtZ4PISdF50tauszmG'})
model03.GetContentFile('nb_03.py')
model04 = drive.CreateFile({'id':'1S6-SYifqGCofX38GTapthiWwnUp7Gzbt'})
model04.GetContentFile('nb_04.py')
model05 = drive.CreateFile({'id':'18qkY2wCRYEpYyyJy_vPze4aB_zkW9Xb5'})
model05.GetContentFile('nb_05.py')
model05b = drive.CreateFile({'id':'1uta4wPUIdiyrciHfkOtGPggwdLLWjKaX'})
model05b.GetContentFile('nb_05b.py')
model06 = drive.CreateFile({'id':'1oEuLAW4f109htDfSQP3OdD30oiYj9Qjv'})
model06.GetContentFile('nb_06.py')
model07 = drive.CreateFile({'id':'1lc2117YrHCe3qVYiGj_ehvOUmeY9nAcx'})
model07.GetContentFile('nb_07.py')
model07a = drive.CreateFile({'id':'1uc9t4rYCGr813LSaDlyrVNSV-WLdd5z8'})
model07a.GetContentFile('nb_07a.py')
model08 = drive.CreateFile({'id':'1iV2uHOX9m_xuVDnwh-lTSBeDMeVHCapD'})
model08.GetContentFile('nb_08.py')
model09 = drive.CreateFile({'id':'1qbhABqKiINKixHn9QJ67sJHY3PNuNgRV'})
model09.GetContentFile('nb_09.py')
model09b = drive.CreateFile({'id':'1Mw6ydUGTIp7DrrHLYFO7UN8SbRS18zK1'})
model09b.GetContentFile('nb_09b.py')
model09c = drive.CreateFile({'id':'1fkS3Ex-zJvcd0rLCTZigZTLJp-quUj9y'})
model09c.GetContentFile('nb_09c.py')
model10 = drive.CreateFile({'id':'1prjih_jql5nWthxb5F_INZuGiWCb_gYO'})
model10.GetContentFile('nb_10.py')
model10b = drive.CreateFile({'id':'1R7wcaHJhXBwYiZY_qXwRvb9dfN37G3ys'})
model10b.GetContentFile('nb_10b.py')
model10c = drive.CreateFile({'id':'1TsekgG3_Tnylwm3kiZR89GDbifhayaCX'})
model10c.GetContentFile('nb_10c.py')
model11 = drive.CreateFile({'id':'1ws0xLMXxZPPbLZatKn_eiwg2wpVbvo62'})
model11.GetContentFile('nb_11.py')
model11a = drive.CreateFile({'id':'1osiIrm4uQ_cKtIC8Mbtk4r7CUQHuXRDe'})
model11a.GetContentFile('nb_11a.py')
# %load_ext autoreload
# %autoreload 2
%matplotlib inline
#export
from nb_01 import *
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp_min(0.) - 0.5
        return self.out
    def backward(self): self.inp.g = (self.inp>0).float() * self.out.g
class Lin():
    def __init__(self, w, b): self.w, self.b = w, b
    def __call__(self, inp):
        self.inp = inp
        self.out = inp@self.w + self.b
        return self.out
    def backward(self):
        self.inp.g = self.out.g @ self.w.t()
        # Creating a giant outer product, just to sum it, is inefficient!
        self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
        self.b.g = self.out.g.sum(0)
class Mse():
    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = (inp.squeeze() - targ).pow(2).mean()
        return self.out
    def backward(self):
        self.inp.g = 2. * (self.inp.squeeze() - self.targ).unsqueeze(-1) / self.targ.shape[0]
class Model():
def __init__(self, w1, b1, w2, b2):
self.layers = [Lin(w1,b1), Relu(), Lin(w2, b2)]
self.loss = Mse()
def __call__(self, x, targ):
for l in self.layers: x = l(x)
return self.loss(x, targ)
def backward(self):
self.loss.backward()
for l in reversed(self.layers): l.backward()
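As a quick sanity check (an illustrative sketch added here, not a cell from the original notebook; the tensor sizes and tolerance are arbitrary), we can compare the hand-written backward pass against PyTorch's autograd on small random tensors:
# Hedged sketch: check the manual gradients against autograd (sizes/tolerance are arbitrary)
import torch
n_in, nh_chk, n_samp = 10, 8, 16
w1 = torch.randn(n_in, nh_chk) * 0.1; b1 = torch.zeros(nh_chk)
w2 = torch.randn(nh_chk, 1) * 0.1;    b2 = torch.zeros(1)
xt = torch.randn(n_samp, n_in);       yt = torch.randn(n_samp)
model_chk = Model(w1, b1, w2, b2)
loss_chk = model_chk(xt, yt)
model_chk.backward()                                   # fills w1.g, b1.g, w2.g, b2.g, xt.g
w1a, b1a, w2a, b2a, xa = [t.clone().requires_grad_(True) for t in (w1, b1, w2, b2, xt)]
out = ((xa @ w1a + b1a).clamp_min(0.) - 0.5) @ w2a + b2a
((out.squeeze() - yt).pow(2).mean()).backward()        # same forward pass, via autograd
torch.allclose(w1.g, w1a.grad, atol=1e-4), torch.allclose(b2.g, b2a.grad, atol=1e-4)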
class Module():
    ...
from nb_02 import *
import torch.nn.functional as F
mpl.rcParams['image.cmap'] = 'gray'
x_train, y_train,x_valid, y_valid = get_data()
n, m = x_train.shape
c = y_train.max()+1
nh = 50
class Model(nn.Module):
def __init__(self, n_in, nh, n_out):
super().__init__()
self.layers = [nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)]
def __call__(self, x):
for l in self.layers: x = l(x)
return x
x_train.shape
y_train
model = Model(m, nh,10)
pred = model(x_train)
def log_softmax(x): return (x.exp()/(x.exp().sum(-1,keepdim=True))).log()
sm_pred = log_softmax(pred)
y_train[:3]
sm_pred[[1,2,3],[5,0,4]]
loss_func = F.cross_entropy
#export
def accuracy(out, yb): return (torch.argmax(out, dim=1)==yb).float().mean()
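For example (made-up values, just to illustrate the argmax comparison):
# Illustrative check with made-up values: 2 of the 3 argmax predictions match the targets
preds_demo = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.4, 0.6]])
targs_demo = torch.tensor([1, 0, 0])
accuracy(preds_demo, targs_demo)   # tensor(0.6667)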
lr = 0.5 # learning rate
epochs = 1 # how many epochs to train for
bs=64
for epoch in range(epochs):
for i in range((n-1)//bs + 1):
# set_trace()
start_i = i*bs
end_i = start_i+bs
xb = x_train[start_i:end_i]
yb = y_train[start_i:end_i]
loss = loss_func(model(xb), yb)
loss.backward()
with torch.no_grad():
for l in model.layers:
if hasattr(l, 'weight'):
l.weight -= l.weight.grad * lr
l.bias -= l.bias.grad * lr
l.weight.grad.zero_()
l.bias .grad.zero_()
loss_func(model(xb), yb), accuracy(model(xb), yb)
Here is the vanilla training loop without callbacks
def train(train_dl, model, epoch, opt, loss_func):
for _ in range(epoch):
model.train()
for xb, yb in train_dl:
out = model(xb)
loss = loss_func(out, yb)
loss.backward()
opt.step()
opt.zero_grad()
Here is a version with callbacks implemented:
def train(learn, epochs, callbacks, metrics):
    cb_handler = CallbackHandler(callbacks)                   # create the CallbackHandler
    cb_handler.on_train_begin(epochs, learn, metrics)         # execute callbacks before training begins
    for epoch in range(epochs):
        learn.model.train()
        cb_handler.on_epoch_begin(epoch)                      # execute callbacks when an epoch starts
        for xb, yb in learn.data.train_dl:
            xb, yb = cb_handler.on_batch_begin(xb, yb)        # execute callbacks when a batch starts
            out = learn.model(xb)
            out = cb_handler.on_loss_begin(out)               # execute callbacks before computing the loss
            loss = learn.loss_func(out, yb)
            loss, skip_backward = cb_handler.on_loss_end(loss)  # execute callbacks after computing the loss
            if not skip_backward: loss.backward()
            if not cb_handler.on_step_begin(): learn.opt.step()     # callbacks can skip the step
            if not cb_handler.on_step_end(): learn.opt.zero_grad()  # callbacks can skip zeroing the grads
            if not cb_handler.on_batch_end(): break                 # callbacks can stop the epoch early
        val_loss, mets = validate(learn.data.valid_dl, learn.model, metrics)
        if not cb_handler.on_epoch_end(val_loss, mets): break
    cb_handler.on_train_end()                                 # execute callbacks after training finishes
#export
from nb_03 import *
x_train,y_train,x_valid,y_valid = get_data()
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
nh,bs = 50,64
c = y_train.max().item()+1
loss_func = F.cross_entropy
#export
class DataBunch():
def __init__(self, train_dl, valid_dl, c=None):
self.train_dl,self.valid_dl,self.c = train_dl,valid_dl,c
@property
def train_ds(self): return self.train_dl.dataset
@property
def valid_ds(self): return self.valid_dl.dataset
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
#export
def get_model(data, lr=0.5, nh=50):
m = data.train_ds.x.shape[1]
model = nn.Sequential(nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,data.c))
return model, optim.SGD(model.parameters(), lr=lr)
class Learner():
def __init__(self, model, opt, loss_func, data):
self.model,self.opt,self.loss_func,self.data = model,opt,loss_func,data
learn = Learner(*get_model(data), loss_func, data)
def fit(epochs, learn):
for epoch in range(epochs):
learn.model.train()
for xb,yb in learn.data.train_dl:
loss = learn.loss_func(learn.model(xb), yb)
loss.backward()
learn.opt.step()
learn.opt.zero_grad()
learn.model.eval()
with torch.no_grad():
tot_loss,tot_acc = 0.,0.
for xb,yb in learn.data.valid_dl:
pred = learn.model(xb)
tot_loss += learn.loss_func(pred, yb)
tot_acc += accuracy (pred,yb)
nv = len(learn.data.valid_dl)
print(epoch, tot_loss/nv, tot_acc/nv)
return tot_loss/nv, tot_acc/nv
loss,acc = fit(1, learn)
def one_batch(xb, yb, cb):
if not cb.begin_batch(xb,yb): return
loss = cb.learn.loss_func(cb.learn.model(xb), yb)
if not cb.after_loss(loss): return
loss.backward()
if cb.after_backward(): cb.learn.opt.step()
if cb.after_step(): cb.learn.opt.zero_grad()
def all_batches(dl, cb):
for xb,yb in dl:
one_batch(xb, yb, cb)
if cb.do_stop(): return
def fit(epochs, learn, cb):
if not cb.begin_fit(learn): return
for epoch in range(epochs):
if not cb.begin_epoch(epoch): continue
all_batches(learn.data.train_dl, cb)
if cb.begin_validate():
with torch.no_grad(): all_batches(learn.data.valid_dl, cb)
if cb.do_stop() or not cb.after_epoch(): break
cb.after_fit()
class Callback():
def begin_fit(self, learn):
self.learn = learn
return True
def after_fit(self): return True
def begin_epoch(self, epoch):
self.epoch=epoch
return True
def begin_validate(self): return True
def after_epoch(self): return True
def begin_batch(self, xb, yb):
self.xb,self.yb = xb,yb
return True
def after_loss(self, loss):
self.loss = loss
return True
def after_backward(self): return True
def after_step(self): return True
# res stores the signal to tell us if we have received a "false" to stop
class CallbackHandler():
def __init__(self,cbs=None):
self.cbs = cbs if cbs else []
def begin_fit(self, learn):
self.learn,self.in_train = learn,True
learn.stop = False
res = True
for cb in self.cbs: res = res and cb.begin_fit(learn)
return res
def after_fit(self):
res = not self.in_train
for cb in self.cbs: res = res and cb.after_fit()
return res
def begin_epoch(self, epoch):
self.learn.model.train()
self.in_train=True
res = True
for cb in self.cbs: res = res and cb.begin_epoch(epoch)
return res
def begin_validate(self):
self.learn.model.eval()
self.in_train=False
res = True
for cb in self.cbs: res = res and cb.begin_validate()
return res
def after_epoch(self):
res = True
for cb in self.cbs: res = res and cb.after_epoch()
return res
def begin_batch(self, xb, yb):
res = True
for cb in self.cbs: res = res and cb.begin_batch(xb, yb)
return res
def after_loss(self, loss):
res = self.in_train
for cb in self.cbs: res = res and cb.after_loss(loss)
return res
def after_backward(self):
res = True
for cb in self.cbs: res = res and cb.after_backward()
return res
def after_step(self):
res = True
for cb in self.cbs: res = res and cb.after_step()
return res
def do_stop(self):
try: return self.learn.stop
finally: self.learn.stop = False
class TestCallback(Callback):
def begin_fit(self,learn):
super().begin_fit(learn)
self.n_iters = 0
return True
def after_step(self):
self.n_iters += 1
print(self.n_iters)
if self.n_iters>=10: self.learn.stop = True
return True
fit(1, learn, cb=CallbackHandler([TestCallback()]))
import re
_camel_re1 = re.compile('(.)([A-Z][a-z]+)')
_camel_re2 = re.compile('([a-z0-9])([A-Z])')
def camel2snake(name):
s1 = re.sub(_camel_re1, r'\1_\2', name)
return re.sub(_camel_re2, r'\1_\2', s1).lower()
class Callback():
_order=0
def set_runner(self, run): self.run = run
def __getattr__(self, k): return getattr(self.run, k)
@property
def name(self):
        name = re.sub(r'Callback$', '', self.__class__.__name__)
return camel2snake(name or 'callback')
This first callback (below) is responsible for switching the model back and forth between training and validation mode, as well as maintaining a count of the iterations, or the percentage of iterations elapsed in the epoch.
class TrainEvalCallback(Callback):
def begin_fit(self):
self.run.n_epochs = 0.
self.run.n_iter = 0
def after_batch(self):
if not self.in_train: return
self.run.n_epochs += 1./self.iters
self.run.n_iter += 1
def begin_epoch(self):
self.run.n_epochs = self.epoch
self.model.train()
self.run.in_train=True
def begin_validate(self):
self.model.eval()
self.run.in_train=False
Now, re-create our TestCallback
class TestCallback(Callback):
def after_step(self):
if self.train_eval.n_iter>=10: return True
cbname = 'TrainEvalCallback'
camel2snake(cbname)
from typing import *
def listify(o):
if o is None: return []
if isinstance(o, list): return o
if isinstance(o,str): return [o]
if isinstance(o, Iterable): return list(o)
return [o]
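A few illustrative calls (made-up inputs) showing how listify normalizes whatever it is given into a list:
# Quick illustration of how listify handles different input types
listify(None), listify('abc'), listify([1, 2]), listify(range(3)), listify(4)
# -> ([], ['abc'], [1, 2], [0, 1, 2], [4])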
class Runner():
def __init__(self, cbs=None, cb_funcs=None):
cbs = listify(cbs)
for cbf in listify(cb_funcs):
cb = cbf()
setattr(self, cb.name, cb)
cbs.append(cb)
self.stop, self.cbs = False, [TrainEvalCallback()]+cbs
@property
def opt(self): return self.learn.opt
@property
def model(self): return self.learn.model
@property
def loss_func(self): return self.learn.loss_func
@property
def data(self): return self.learn.data
def one_batch(self, xb, yb):
self.xb,self.yb = xb,yb
if self('begin_batch'): return
self.pred = self.model(self.xb)
if self('after_pred'): return
self.loss = self.loss_func(self.pred, self.yb)
if self('after_loss') or not self.in_train: return
self.loss.backward()
if self('after_backward'): return
self.opt.step()
if self('after_step'): return
self.opt.zero_grad()
def all_batches(self, dl):
self.iters = len(dl)
for xb,yb in dl:
if self.stop: break
self.one_batch(xb, yb)
self('after_batch')
self.stop=False
def fit(self, epochs, learn):
self.epochs,self.learn = epochs,learn
try:
for cb in self.cbs: cb.set_runner(self)
if self('begin_fit'): return
for epoch in range(epochs):
self.epoch = epoch
if not self('begin_epoch'): self.all_batches(self.data.train_dl)
with torch.no_grad():
if not self('begin_validate'): self.all_batches(self.data.valid_dl)
if self('after_epoch'): break
finally:
self('after_fit')
self.learn = None
def __call__(self, cb_name):
for cb in sorted(self.cbs, key=lambda x: x._order):
f = getattr(cb, cb_name, None)
if f and f(): return True
return False
Third callback: how to compute metrics.
class AvgStats():
def __init__(self, metrics, in_train): self.metrics,self.in_train = listify(metrics),in_train
def reset(self):
self.tot_loss,self.count = 0.,0
self.tot_mets = [0.] * len(self.metrics)
@property
def all_stats(self): return [self.tot_loss.item()] + self.tot_mets
@property
def avg_stats(self): return [o/self.count for o in self.all_stats]
def __repr__(self):
if not self.count: return ""
return f"{'train' if self.in_train else 'valid'}: {self.avg_stats}"
def accumulate(self, run):
bn = run.xb.shape[0]
self.tot_loss += run.loss * bn
self.count += bn
for i,m in enumerate(self.metrics):
self.tot_mets[i] += m(run.pred, run.yb) * bn
class AvgStatsCallback(Callback):
def __init__(self, metrics):
self.train_stats,self.valid_stats = AvgStats(metrics,True),AvgStats(metrics,False)
def begin_epoch(self):
self.train_stats.reset()
self.valid_stats.reset()
def after_loss(self):
stats = self.train_stats if self.in_train else self.valid_stats
with torch.no_grad(): stats.accumulate(self.run)
def after_epoch(self):
print(self.train_stats)
print(self.valid_stats)
lr
from nb_04 import *
x_train, y_train, x_valid, y_valid = get_data()
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
nh, bs = 50, 512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
def create_learner(model_func, loss_func, data):
return Learner(*model_func(data), loss_func, data)
learn = create_learner(get_model, loss_func, data)
run = Runner([AvgStatsCallback([accuracy])])
run.fit(3,learn)
The default lr above in get_model is 0.5; we can use partial to change the lr to 0.3. Really handy.
learn = create_learner(partial(get_model, lr=0.3), loss_func, data)
run = Runner([AvgStatsCallback([accuracy])])
run.fit(3,learn)
Refactoring
def get_model_func(lr=0.5):
return partial(get_model, lr=lr)
We define two new callbacks: the Recorder to keep track of the loss and our scheduled learning rate, and a ParamScheduler that can schedule any hyper-parameter as long as it's registered in the state_dict of the optimizer.
#export
class Recorder(Callback):
def begin_fit(self): self.lrs,self.losses = [],[]
def after_batch(self):
if not self.in_train: return
self.lrs.append(self.opt.param_groups[-1]['lr'])
self.losses.append(self.loss.detach().cpu())
def plot_lr (self): plt.plot(self.lrs)
def plot_loss(self): plt.plot(self.losses)
class ParamScheduler(Callback):
_order=1
def __init__(self, pname, sched_func): self.pname,self.sched_func = pname,sched_func
def set_param(self):
for pg in self.opt.param_groups:
pg[self.pname] = self.sched_func(self.n_epochs/self.epochs)
def begin_batch(self):
if self.in_train: self.set_param()
Let's start with a simple linear schedule going from start to end. It returns a function that takes a pos argument (going from 0 to 1) such that this function goes from start (at pos=0) to end (at pos=1) in a linear fashion.
def sched_lin(start, end):
def _inner(start, end, pos): return start + pos*(end-start)
return partial(_inner, start, end)
This can be refactored with a decorator
#export
def annealer(f):
def _inner(start, end): return partial(f, start, end)
return _inner
@annealer
def sched_lin(start, end, pos): return start + pos*(end-start)
f = sched_lin(1,2)
f(0.3)
More scheduler functions: cosine; no scheduling; exponential scheduling
#export
@annealer
def sched_cos(start, end, pos): return start + (1 + math.cos(math.pi*(1-pos))) * (end-start) / 2
@annealer
def sched_no(start, end, pos): return start
@annealer
def sched_exp(start, end, pos): return start * (end/start) ** pos
Because a PyTorch tensor has no attribute called ndim, we create this attribute for tensors so that they can be used directly as input to matplotlib.
torch.Tensor.ndim = property(lambda x: len(x.shape))
annealings = "NO LINEAR COS EXP".split()
a = torch.arange(0, 100)
p = torch.linspace(0.01,1,100)
fns = [sched_no, sched_lin, sched_cos, sched_exp]
for fn, t in zip(fns, annealings):
f = fn(2, 1e-2)
plt.plot(a, [f(o) for o in p], label=t)
plt.legend();
annealings
annealer
In practice, we'll often want to combine different schedulers; the following function does that: it uses scheds[i] for pcts[i] of the training.
def combine_scheds(pcts, scheds):
assert sum(pcts) == 1.
pcts = tensor([0] + listify(pcts))
assert torch.all(pcts >= 0)
pcts = torch.cumsum(pcts, 0)
def _inner(pos):
idx = (pos >= pcts).nonzero().max()
actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
return scheds[idx](actual_pos)
return _inner
Here is an example: use 30% of the budget to go from 0.3 to 0.6 following a cosine, then the last 70% of the budget to go from 0.6 to 0.2, still following a cosine.
sched = combine_scheds([0.3, 0.7], [sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)])
pcts = tensor([0] + listify([0.3,0.7]))
pcts
plt.plot(a, [sched(o) for o in p])
cbfs = [Recorder,
partial(AvgStatsCallback,accuracy),
partial(ParamScheduler, 'lr', sched)]
learn = create_learner(get_model_func(0.3), loss_func, data)
run = Runner(cb_funcs=cbfs)
run.fit(3,learn)
run.recorder.plot_lr()
run.recorder.plot_loss()
import torch
import matplotlib.pyplot as plt
import ipywidgets as widgets
def f(o): print('hi')
From the ipywidgets docs: the on_click method of the Button can be used to register a function to be called when the button is clicked.
w = widgets.Button(description='Click me')
w
w.on_click(f)
from time import sleep
def slow_calculation():
res = 0
for i in range(5):
res += i*i
sleep(1)
return res
slow_calculation()
def slow_calculation(cb=None):
res = 0
for i in range(5):
res += i*i
if cb: cb(i); print('running callback')
return res
def show_progress(epoch):
print(f"Awesome! We've finished epoch {epoch}!")
slow_calculation(cb=show_progress)
slow_calculation(lambda o:print(f"Awesome! We've finished epoch {o}"))
def show_progress(exclamation, epoch):
print(f"{exclamation}! We've finished epoch {epoch}!")
slow_calculation(lambda o: show_progress("OK I guess",o))
def make_show_progress(exclamation):
_inner = lambda epoch: print(f"{exclamation}! We've finished epoch {epoch}!")
return _inner
slow_calculation(make_show_progress("Nicely done!"))
def make_show_progress(exclamation):
# Leading "_" is generally understood to be "private"
def _inner(epoch): print(f"{exclamation}! We've finished epoch {epoch}!")
return _inner
slow_calculation(make_show_progress("Amazingly done!"))
Here is the same work done by using partial
from functools import partial
def show_progress(exclamation, epoch):
print(f"{exclamation}! We've finished epoch {epoch}!")
slow_calculation(partial(show_progress, "OK I think"))
In this way, show_progress originally takes two inputs (exclamation, epoch). With partial, we provide the first input slot, so that when show_progress is passed to slow_calculation as a callback function, it no longer requires the first input argument and will take i as the input to epoch.
A good way to understand this is to play around with the positioning of the arguments: try switching exclamation and epoch.
def show_progress(epoch , exclamation):
print(f"{exclamation}! We've finished epoch {epoch}!")
slow_calculation(partial(show_progress, "after switching"))
Now we can see that partial always supplies the first input argument, so our printout shows the reversed argument positions.
class ProgressShowingCallback():
def __init__(self, exclamation="Awesome"):
self.exclamation = exclamation
def __call__(self, epoch):
print(f"{self.exclamation}! We've finished epoch {epoch}!")
cb = ProgressShowingCallback("WOW!")
cb(epoch="505")
def f(*args, **kwargs):
print(f"args:{args}; kwargs: {kwargs}")
f(3, 'a', thing1 = "hello")
def slow_calculation(cb=None):
    res = 0
    for i in range(5):
        if cb: cb.before_calc(i)
        res += i*i
        sleep(1)
        if cb: cb.after_calc(i, val=res)
    return res
class PrintStepCallback():
def __init__(self): pass
def before_calc(self, *args, **kwargs): print(f"About to start")
def after_calc(self, *args, **kwargs): print(f"Done step")
slow_calculation(PrintStepCallback())
The example above doesn't actually use the args and kwargs passed in. To use them, we can't simply refer to variables by the names used at the call site:
class PrintStepCallback():
def __init__(self): pass
def before_calc(self, *args, **kwargs): print(f"About to start")
def after_calc(self, *args, **kwargs): print(f"Done step{i}, with res={val}")
slow_calculation(PrintStepCallback())
This results in an error (i and val are not defined inside the methods).
So instead, we have to name those parameters explicitly when defining our functions:
class PrintStepCallback():
def __init__(self): pass
def before_calc(self, epoch, *args, **kwargs): print(f"About to start {epoch}")
def after_calc(self, epoch, val, *args, **kwargs): print(f"Done step{epoch}, with res={val}")
slow_calculation(PrintStepCallback())
Another powerful application of callbacks is to modify the values of variables in the middle of a training process or to perform functions such as "early stopping"
def slow_calculation(cb=None):
    res = 0
    for i in range(5):
        if cb and hasattr(cb, 'before_calc'): cb.before_calc(i)
        res += i*i
        sleep(1)
        if cb and hasattr(cb, 'after_calc'):  # if the callback doesn't define it, don't run it
            if cb.after_calc(i, res):
                print("early stopping")
                break
    return res
Note that here we haven't defined 'before_calc'. Since slow_calculation checks whether each callback method exists before calling it, this doesn't raise an error.
class PrintAfterCallback():
def after_calc(self, epoch, val):
print(f"after {epoch}:{val}")
if val>10: return True
slow_calculation(PrintAfterCallback())
In order for our callbacks to modify the values of certain variables in the training loop, it's good to construct the training function as a class, with the modifiable variable as one of its attributes.
class SlowCalculator():
# here this class has two attributes: 'cb' and 'res'
def __init__(self, cb=None): self.cb, self.res = cb, 0
def callback(self, cb_name, *args):
if not self.cb: return
cb = getattr(self.cb, cb_name, None)
if cb: return cb(self, *args)
def calc(self):
for i in range(5):
self.callback('before_calc', i)
self.res += i*i
sleep(1)
if self.callback('after_calc', i):
print("stopping early")
break
class ModifyingCallback():
def after_calc(self, calc, epoch):
print(f"After {epoch}, the res is {calc.res}")
if calc.res > 10: return True
if calc.res < 3: calc.res = calc.res*2
calculator = SlowCalculator(ModifyingCallback())
calculator.calc()
calculator.res
from nb_05b import *
torch.set_num_threads(2)
x_train, y_train, x_valid, y_valid = get_data()
def normalize_to(train, valid):
m, s = train.mean(), train.std()
return normalize(train, m, s), normalize(valid, m, s)
x_train, x_valid = normalize_to(x_train, x_valid)
train_ds, valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)
Let's check it behaved properly
(f"the mean and std after normalization: {x_train.mean():.2f}, {x_train.std()}")
nh, bs = 50, 512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds,valid_ds,bs),c)
data.c
To refactor layers, it's useful to have a Lambda layer that can take a basic function and convert it to a layer you can put in nn.Sequential.
class Lambda(nn.Module):
def __init__(self, func):
super().__init__()
self.func = func
def forward(self, x): return self.func(x)
def flatten(x): return x.view(x.shape[0], -1)
# This function takes the flat vector of size bsx784 and puts it back as a batch of images of 28 by 28 pixels
def mnist_resize(x):
return x.view(-1,1,28,28)
Define a simple CNN with input of bs x 784:
def get_cnn_model(data):
return nn.Sequential(
Lambda(mnist_resize),
nn.Conv2d( 1, 8, 5, padding=2,stride=2), nn.ReLU(), #14
nn.Conv2d( 8,16, 3, padding=1,stride=2), nn.ReLU(), # 7
nn.Conv2d(16,32, 3, padding=1,stride=2), nn.ReLU(), # 4
nn.Conv2d(32,32, 3, padding=1,stride=2), nn.ReLU(), # 2
nn.AdaptiveAvgPool2d(1),
Lambda(flatten),
nn.Linear(32,data.c)
)
model = get_cnn_model(data)
Get some callbacks
cbfs = [Recorder, partial(AvgStatsCallback,accuracy)]
opt = optim.SGD(model.parameters(), lr=0.4)
learn = Learner(model, opt, loss_func, data)
run = Runner(cb_funcs=cbfs)
%time run.fit(1, learn)
def conv2d(ni, nf, ks=3, stride=2):
return nn.Sequential(
nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), nn.ReLU())
Another thing we can do is refactor the MNIST resize into a Callback:
class BatchTransformXCallback(Callback):
_order=2
def __init__(self, tfm): self.tfm = tfm
def begin_batch(self): self.run.xb = self.tfm(self.xb)
def view_tfm(*size):
def _inner(x): return x.view(*((-1,)+size))
return _inner
mnist_view = view_tfm(1,28,28)
cbfs.append(partial(BatchTransformXCallback, mnist_view))
# with the AdaptiveAvgPool, this model can now work on any size input
nfs = [8,16,32,32]
def get_cnn_layers(data, nfs):
nfs = [1] + nfs
return [
conv2d(nfs[i], nfs[i+1], 5 if i==0 else 3)
for i in range(len(nfs)-1)
] + [nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c)]
def get_cnn_model(data, nfs): return nn.Sequential(*get_cnn_layers(data, nfs))
#export
def get_runner(model, data, lr=0.6, cbs=None, opt_func=None, loss_func = F.cross_entropy):
if opt_func is None: opt_func = optim.SGD
opt = opt_func(model.parameters(), lr=lr)
learn = Learner(model, opt, loss_func, data)
return learn, Runner(cb_funcs=listify(cbs))
model = get_cnn_model(data, nfs)
learn, run = get_runner(model, data, lr=0.4, cbs=cbfs)
model
run.fit(3, learn)
Here we want to do some telemetry, and find the mean and standard deviation of the activations of each layer in the model.
Here is a manual attempt:
class SequentialModel(nn.Module):
def __init__(self, *layers):
super().__init__()
self.layers = nn.ModuleList(layers)
self.act_means = [[] for _ in layers]
self.act_stds = [[] for _ in layers]
def __call__(self, x):
for i,l in enumerate(self.layers):
x = l(x)
self.act_means[i].append(x.data.mean())
self.act_stds [i].append(x.data.std ())
return x
def __iter__(self): return iter(self.layers)
get_cnn_layers(data,nfs)
get_cnn_layers(data,nfs)[0]
model = SequentialModel(*get_cnn_layers(data, nfs))
learn,run = get_runner(model, data, lr=0.9, cbs=cbfs)
run.fit(3, learn)
for l in model.act_means: plt.plot(l)
plt.legend(range(6));
for l in model.act_stds: plt.plot(l)
plt.legend(range(6));
Hooks don't require us to rewrite the model
model = get_cnn_model(data, nfs)
learn, run = get_runner(model, data, lr=0.5, cbs=cbfs)
act_means = [[] for _ in model]
act_stds = [[] for _ in model]
A hook is attached to a layer, and needs to be a function that takes three arguments (module, input, output).
Here we store the mean and std of the output in the correct position of our lists.
def append_stats(i, mod, inp, outp):
act_means[i].append(outp.data.mean())
act_stds[i].append(outp.data.std())
for i , m in enumerate(model): m.register_forward_hook(partial(append_stats,i))
run.fit(1, learn)
for o in act_means: plt.plot(o)
plt.legend(range(5));
def children(m): return list(m.children())
class Hook():
def __init__(self, m, f): self.hook = m.register_forward_hook(partial(f, self))
def remove(self): self.hook.remove()
def __del__(self): self.remove()
def append_stats(hook, mod, inp, outp):
if not hasattr(hook, 'stats'): hook.stats = ([],[])
means, stds = hook.stats
means.append(outp.data.mean())
stds.append(outp.data.std())
model = get_cnn_model(data, nfs)
learn, run = get_runner(model, data, lr=0.5, cbs=cbfs)
hooks = [Hook(l,append_stats) for l in children(model[:4])]
run.fit(1, learn)
for h in hooks:
plt.plot(h.stats[0])
h.remove()
plt.legend(range(4));
from nb_06 import *
x_train,y_train,x_valid,y_valid = get_data()
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)
nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)
mnist_view = view_tfm(1,28,28)
cbfs = [Recorder,
partial(AvgStatsCallback,accuracy),
CudaCallback,
partial(BatchTransformXCallback, mnist_view)]
nfs = [8,16,32,64,64]
learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)
%time run.fit(2, learn)
This is the result of a CNN without batch norm.
class BatchNorm(nn.Module):
def __init__(self, nf, mom=0.1, eps=1e-5):
super().__init__()
# NB: pytorch bn mom is opposite of what you'd expect
self.mom,self.eps = mom,eps
self.mults = nn.Parameter(torch.ones (nf,1,1))
self.adds = nn.Parameter(torch.zeros(nf,1,1))
self.register_buffer('vars', torch.ones(1,nf,1,1))
self.register_buffer('means', torch.zeros(1,nf,1,1))
def update_stats(self, x):
m = x.mean((0,2,3), keepdim=True)
v = x.var ((0,2,3), keepdim=True)
self.means.lerp_(m, self.mom)
self.vars.lerp_ (v, self.mom)
return m,v
def forward(self, x):
if self.training:
with torch.no_grad(): m,v = self.update_stats(x)
else: m,v = self.means,self.vars
x = (x-m) / (v+self.eps).sqrt()
return x*self.mults + self.adds
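As a quick sanity check (an illustrative sketch, not from the original notebook; the batch shape, scale and shift are arbitrary), in training mode the layer should give roughly zero mean and unit std per channel:
# Hedged sketch: per-channel mean ~0 and std ~1 after our BatchNorm in training mode
bn_test = BatchNorm(8)
x_test  = torch.randn(32, 8, 14, 14) * 3 + 5    # arbitrary scale and shift
out_test = bn_test(x_test)
out_test.mean((0,2,3)), out_test.std((0,2,3))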
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
# No bias needed if using bn
layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
GeneralRelu(**kwargs)]
if bn: layers.append(BatchNorm(nf))
return nn.Sequential(*layers)
#export
def init_cnn_(m, f):
if isinstance(m, nn.Conv2d):
f(m.weight, a=0.1)
if getattr(m, 'bias', None) is not None: m.bias.data.zero_()
for l in m.children(): init_cnn_(l, f)
def init_cnn(m, uniform=False):
f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
init_cnn_(m, f)
def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
model = get_cnn_model(data, nfs, layer, **kwargs)
init_cnn(model, uniform=uniform)
return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)
learn, run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs)
with Hooks(learn.model, append_stats) as hooks:
run.fit(1, learn)
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks[:-1]:
ms,ss = h.stats
ax0.plot(ms[:10])
ax1.plot(ss[:10])
h.remove()
plt.legend(range(6));
fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
for h in hooks[:-1]:
ms,ss = h.stats
ax0.plot(ms)
ax1.plot(ss)
from nb_07a import *
datasets.URLs.IMAGENETTE_160
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160)
path
To be able to look at what's inside a directory from a notebook, we add the .ls method to Path with a monkey-patch.
import PIL, os, mimetypes
Path.ls = lambda x: list(x.iterdir())
path.ls()
# There is one directory for each category
(path/'val').ls()
path_tench = path/'val'/'n01440764'
img_fn = path_tench.ls()[0]
img_fn
img = PIL.Image.open(img_fn)
img
import numpy
imga = numpy.array(img)
imga.shape
imga[:10,:10,0]
Just in case there are other files in the directory (models, texts, ...), we want to keep only the images.
Let's not write them out by hand, but instead use what's already on our computer (the MIME types database).
#export
image_extensions = set(k for k,v in mimetypes.types_map.items() if v.startswith('image/'))
' '.join(image_extensions)
def setify(o): return o if isinstance(o, set) else set(listify(o))
test_eq(setify('aa'), {'aa'})
test_eq(setify(['aa',1]), {'aa',1})
test_eq(setify(None), set())
test_eq(setify(1), {1})
test_eq(setify({1}), {1})
def _get_files(p, fs, extensions=None):
p = Path(p)
res = [p/f for f in fs if not f.startswith('.')
and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
return res
t = [o.name for o in os.scandir(path_tench)]
t
t = _get_files(path, t, extensions=image_extensions)
t[:3]
Now, t contains all the Path objects in one directory.
# putting everything together
def get_files(path, extensions=None, recurse=False, include=None):
path = Path(path)
extensions = setify(extensions)
extensions = {e.lower() for e in extensions}
if recurse:
res = []
for p,d,f in os.walk(path): # returns (dirpath, dirnames, filenames)
if include is not None: d[:] = [o for o in d if o in include]
else: d[:] = [o for o in d if not o.startswith('.')]
res += _get_files(p, f, extensions)
return res
else:
f = [o.name for o in os.scandir(path) if o.is_file()]
return _get_files(path, f, extensions)
get_files(path_tench, image_extensions)[:3]
We need the recurse
argument when we start from path
, since the pictures are two levels below in the directories.
get_files(path, image_extensions, recurse=True)[:3]
all_fns = get_files(path, image_extensions, recurse=True)
len(all_fns)
%timeit -n 10 get_files(path, image_extensions, recurse=True)
What we need to do:
We use the ListContainer class from notebook 06 to store our objects in an ItemList. The get method will need to be subclassed to explain how to access an element (open an image for instance), then the private _get method applies any additional transformations to it.
#export
def compose(x, funcs, *args, order_key='_order', **kwargs):
key = lambda o: getattr(o, order_key, 0)
for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
return x
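A tiny illustration (with made-up functions add_one and times_ten) of how compose sorts by _order before applying:
# Illustration with made-up functions: _order controls the application order, not the list order
def add_one(x): return x + 1
def times_ten(x): return x * 10
times_ten._order = 1               # runs after add_one (default _order is 0)
compose(3, [times_ten, add_one])   # (3 + 1) * 10 = 40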
class ItemList(ListContainer):
def __init__(self, items, path='.', tfms=None):
super().__init__(items)
self.path,self.tfms = Path(path),tfms
def __repr__(self): return f'{super().__repr__()}\nPath: {self.path}'
def new(self, items, cls=None):
if cls is None: cls=self.__class__
return cls(items, self.path, tfms=self.tfms)
def get(self, i): return i
def _get(self, i): return compose(self.get(i), self.tfms)
def __getitem__(self, idx):
res = super().__getitem__(idx)
if isinstance(res,list): return [self._get(o) for o in res]
return self._get(res)
class ImageList(ItemList):
@classmethod
def from_files(cls, path, extensions=None, recurse=True, include=None, **kwargs):
if extensions is None: extensions = image_extensions
return cls(get_files(path, extensions, recurse=recurse, include=include), path, **kwargs)
def get(self, fn): return PIL.Image.open(fn)
#export
class Transform(): _order=0
class MakeRGB(Transform):
def __call__(self, item): return item.convert('RGB')
def make_rgb(item): return item.convert('RGB')
il = ImageList.from_files(path, tfms=make_rgb)
il
img = il[0]
img
il[:1]
Here, we need to split the files between those in the folder train and those in the folder val.
fn = il.items[0]
fn
Since our filenames are Path objects, we can find the directory of the file with .parent.
We need to go up two levels, since the last folder is the class name.
fn.parent.parent
fn.parent.parent.name
def grandparent_splitter(fn, valid_name='valid', train_name='train'):
gp = fn.parent.parent.name
return True if gp==valid_name else False if gp == train_name else None
def split_by_func(ds, f):
items = ds.items
mask = [f(o) for o in items]
# 'None' values will be filtered out
train = [o for o,m in zip(items, mask) if m==False]
valid = [o for o,m in zip(items, mask) if m==True]
return train, valid
splitter = partial(grandparent_splitter, valid_name='val')
train, valid = split_by_func(il,splitter)
len(train), len(valid)
Let's create a class that contains them. It just needs two ItemLists to be initialized, and we create a shortcut to all the unknown attributes by trying to grab them from the train ItemList.
Labeling has to be done after splitting, because it uses training set information to apply to the validation set, using a Processor.
A Processor is a transformation that is applied to all the inputs once at initialization, with some state computed on the training set that is then applied without modification on the validation set (and maybe the test set, or at inference time on a single item).
For instance, it could be processing texts to tokenize, then numericalize them. In that case we want the validation set to be numericalized with exactly the same vocabulary as the training set.
Another example is in tabular data, where we fill missing values with (for instance) the median computed on the training set. That statistic is stored in the inner state of the Processor and applied on the validation set.
In our case, we want to convert label strings to numbers in a consistent and reproducible way. So we create a list of possible labels in the training set, and then convert our labels to numbers based on this vocab.
from collections import OrderedDict
def uniqueify(x, sort=False):
res = list(OrderedDict.fromkeys(x).keys())
if sort: res.sort()
return res
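For example (made-up labels):
# uniqueify keeps first-seen order unless sort=True
uniqueify(['dog', 'cat', 'dog', 'fish']), uniqueify(['dog', 'cat', 'dog', 'fish'], sort=True)
# -> (['dog', 'cat', 'fish'], ['cat', 'dog', 'fish'])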
First, let's define the processor. We also define a ProcessedItemList with an obj method that can get the unprocessed items: for instance, a processed label will be an index between 0 and the number of classes - 1, and the corresponding obj will be the name of the class. The first one is needed by the model for the training, but the second one is better for displaying the objects.
class Processor():
    def process(self, items): return items
class CategoryProcessor(Processor):
    def __init__(self): self.vocab = None
    def __call__(self, items):
        # The vocab is defined on the first use.
        if self.vocab is None:
            self.vocab = uniqueify(items)
            self.otoi = {v:k for k,v in enumerate(self.vocab)}
        return [self.proc1(o) for o in items]
    def proc1(self, item): return self.otoi[item]
    def deprocess(self, idxs):
        assert self.vocab is not None
        return [self.deproc1(idx) for idx in idxs]
    def deproc1(self, idx): return self.vocab[idx]
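A small round trip with made-up labels, showing that the vocab is built on the first call and reused by deprocess:
# Made-up labels: the vocab is built on the first call, then reused
cp = CategoryProcessor()
ids = cp(['cat', 'dog', 'cat', 'fish'])
ids, cp.vocab, cp.deprocess(ids)   # ([0, 1, 0, 2], ['cat', 'dog', 'fish'], ['cat', 'dog', 'cat', 'fish'])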
Here we label according to the folders of the images, so simply fn.parent.name. We label the training set first with a newly created CategoryProcessor so that it computes its inner vocab on that set.
Then we label the validation set using the same processor, which means it uses the same vocab. The end result is another SplitData object.
#export
def parent_labeler(fn): return fn.parent.name
def _label_by_func(ds, f, cls=ItemList): return cls([f(o) for o in ds.items], path=ds.path)
# This is slightly different from what was seen during the lesson;
# we'll discuss the changes in lesson 11
class LabeledData():
def process(self, il, proc): return il.new(compose(il.items, proc))
def __init__(self, x, y, proc_x=None, proc_y=None):
self.x,self.y = self.process(x, proc_x),self.process(y, proc_y)
self.proc_x,self.proc_y = proc_x,proc_y
def __repr__(self): return f'{self.__class__.__name__}\nx: {self.x}\ny: {self.y}\n'
def __getitem__(self,idx): return self.x[idx],self.y[idx]
def __len__(self): return len(self.x)
def x_obj(self, idx): return self.obj(self.x, idx, self.proc_x)
def y_obj(self, idx): return self.obj(self.y, idx, self.proc_y)
def obj(self, items, idx, procs):
isint = isinstance(idx, int) or (isinstance(idx,torch.LongTensor) and not idx.ndim)
item = items[idx]
for proc in reversed(listify(procs)):
item = proc.deproc1(item) if isint else proc.deprocess(item)
return item
@classmethod
def label_by_func(cls, il, f, proc_x=None, proc_y=None):
return cls(il, _label_by_func(il, f), proc_x=proc_x, proc_y=proc_y)
def label_by_func(sd, f, proc_x=None, proc_y=None):
train = LabeledData.label_by_func(sd.train, f, proc_x=proc_x, proc_y=proc_y)
valid = LabeledData.label_by_func(sd.valid, f, proc_x=proc_x, proc_y=proc_y)
return SplitData(train,valid)
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())
class ResizeFixed(Transform):
_order=10
    def __init__(self, size):
if isinstance(size, int): size = (size, size)
self.size = size
def __call__(self,item): return item.resize(self.size, PIL.Image.BILINEAR)
def to_byte_tensor(item):
res = torch.ByteTensor(torch.ByteStorage.from_buffer(item.tobytes()))
w, h = item.size
return res.view(h,w,-1).permute(2,0,1)
to_byte_tensor._order=20
def to_float_tensor(item): return item.float().div_(255.)
to_float_tensor._order=30
bs = 64
train_dl, valid_dl = get_dls(ll.train, ll.valid, bs)
from nb_08 import *
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160)
tfms = [make_rgb, ResizeFixed(128), to_byte_tensor, to_float_tensor]
bs=128
il = ImageList.from_files(path, tfms = tfms)
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())
data = ll.to_databunch(bs, c_in=3, c_out=10)
Then a basic model:
nfs = [32,64,128,256]
cbfs = [partial(AvgStatsCallback, accuracy), CudaCallback,\
partial(BatchTransformXCallback, norm_imagenette)]
# Create a baseline training model with vanilla SGD
learn, run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)
run.fit(1,learn)
In PyTorch, the base optimizer in torch.optim is just a dictionary that stores the hyper-parameters and references to the parameters of the model we want to train in parameter groups (different groups can have different learning rates/momentum/weight decay..., which is what lets us do discriminative learning rates).
It contains a method step that will update our parameters with the gradients, and a method zero_grad to detach and zero the gradients of all our parameters.
We build the equivalent from scratch, only ours will be more flexible. In our implementation, the step function loops over all the parameters and executes the step, using stepper functions that we have to provide when initializing the optimizer.
class Optimizer():
def __init__(self, params, steppers, **defaults):
# might be a generator
self.param_groups = list(params)
# ensure params is a list of lists
if not isinstance(self.param_groups[0], list): self.param_groups = [self.param_groups]
self.hypers = [{**defaults} for p in self.param_groups]
self.steppers = listify(steppers)
def grad_params(self):
return [(p,hyper) for pg,hyper in zip(self.param_groups,self.hypers)
for p in pg if p.grad is not None]
def zero_grad(self):
for p,hyper in self.grad_params():
p.grad.detach_()
p.grad.zero_()
def step(self):
for p,hyper in self.grad_params(): compose(p, self.steppers, **hyper)
The **defaults here is a set of default hyper-parameters such as learning rate, momentum, beta, eps in Adam.
The difference from PyTorch is that this step function doesn't actually execute anything by itself; we need to provide stepper functions to complete the update process.
The flexibility this gives us is that we can now write stepper functions in various ways to implement all those fancy optimizers from the papers.
#export
def sgd_step(p, lr, **kwargs):
p.data.add_(-lr, p.grad.data)
return p
opt_func = partial(Optimizer, steppers=[sgd_step])
Now that we have changed the optimizer, we will need to adjust the callbacks that were using properties from the PyTorch optimizer: in particular, the hyper-parameters are in the list of dictionaries opt.hypers (PyTorch has everything in param_groups, a list of dictionaries).
#export
class Recorder(Callback):
def begin_fit(self): self.lrs,self.losses = [],[]
def after_batch(self):
if not self.in_train: return
self.lrs.append(self.opt.hypers[-1]['lr'])
self.losses.append(self.loss.detach().cpu())
def plot_lr (self): plt.plot(self.lrs)
def plot_loss(self): plt.plot(self.losses)
def plot(self, skip_last=0):
losses = [o.item() for o in self.losses]
n = len(losses)-skip_last
plt.xscale('log')
plt.plot(self.lrs[:n], losses[:n])
class ParamScheduler(Callback):
_order=1
def __init__(self, pname, sched_funcs):
self.pname,self.sched_funcs = pname,listify(sched_funcs)
def begin_batch(self):
if not self.in_train: return
fs = self.sched_funcs
if len(fs)==1: fs = fs*len(self.opt.param_groups)
pos = self.n_epochs/self.epochs
for f,h in zip(fs,self.opt.hypers): h[self.pname] = f(pos)
class LR_Find(Callback):
_order=1
def __init__(self, max_iter=100, min_lr=1e-6, max_lr=10):
self.max_iter,self.min_lr,self.max_lr = max_iter,min_lr,max_lr
self.best_loss = 1e9
def begin_batch(self):
if not self.in_train: return
pos = self.n_iter/self.max_iter
lr = self.min_lr * (self.max_lr/self.min_lr) ** pos
for pg in self.opt.hypers: pg['lr'] = lr
def after_step(self):
if self.n_iter>=self.max_iter or self.loss>self.best_loss*10:
raise CancelTrainException()
if self.loss < self.best_loss: self.best_loss = self.loss
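A minimal usage sketch (not an original cell; it assumes the Runner imported via nb_08 stops cleanly on CancelTrainException, as the course's exception-based Runner does): combine LR_Find with the Recorder above, then plot the loss against the learning rate.
# Hedged sketch: LR finder run; uses the custom Optimizer (opt_func) so that opt.hypers exists,
# and relies on the imported Runner catching CancelTrainException
learn, run = get_learn_run(nfs, data, 1e-2, conv_layer, cbs=cbfs + [LR_Find, Recorder], opt_func=opt_func)
run.fit(2, learn)
run.recorder.plot(skip_last=5)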
A sanity check
sched = combine_scheds([0.3, 0.7], [sched_cos(0.3, 0.6), sched_cos(0.6, 0.2)])
cbfs = [partial(AvgStatsCallback,accuracy),
CudaCallback,
Recorder,
partial(ParamScheduler, 'lr', sched)]
learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs, opt_func=opt_func)
%time run.fit(1, learn)
run.recorder.plot_loss()
run.recorder.plot_lr()
To prevent overfitting, we apply some regularization methods in the process.
Weight decay comes from the idea of L2 regularization, which consists in adding to our loss function the sum of all the weights squared. The reason is that when we compute the gradients, it will add a contribution to them that will encourage the weights to be as small as possible.
Limiting our weights from growing too much is going to hinder the training of the model, but it will yield a state where it generalizes better. Going back to the theory a little bit, weight decay (wd) is a parameter that controls that sum of squares we add to our loss:
$loss\_with\_wd = loss + (wd/2)*(weights**2).sum()$
In practice, it would be very inefficient (and numerically unstable) to compute that big sum and add it to the loss.
The derivative of p**2 with respect to p is 2p. So adding that big sum to our loss is exactly the same as doing:
$weight.grad += wd*weight$
for every weight in our model, which in the case of vanilla SGD is equivalent to updating the parameters with:
$weight = weight - lr*(weight.grad + wd*weight)$
This technique is called "weight decay", as each weight is decayed by a factor $lr*wd$, as shown in this last formula.
This only works for standard SGD, as we have seen that with momentum, RMSProp and Adam, the update has some additional formulas around the gradient. In those cases, the formula that comes from L2 regularization:
$weight.grad += wd*weight$
is different from weight decay:
$new\_weight = weight - lr*weight.grad - lr*wd*weight$
Weight decay is subtracting $lr*wd*weight$ from the weights. We need this function to have an attribute _defaults so that we are sure there is a hyper-parameter of the same name in our Optimizer.
def weight_decay(p, lr, wd, **kwargs):
p.data.mul_(1-lr*wd)
return p
weight_decay._defaults = dict(wd=0.)
L2 regularization is adding $wd*weight$ to the gradients.
def l2_reg(p,lr,wd,**kwargs):
p.grad.data.add_(wd, p.data)
return p
l2_reg._defaults = dict(wd=0.)
Let's allow steppers to add to our defaults (which are the default values of all the hyper-parameters). This helper function adds to dest the key/values it finds while going through os and applying f, when there is no key of the same name already.
def maybe_update(os, dest, f):
for o in os:
for k,v in f(o).items():
if k not in dest: dest[k] = v
def get_defaults(d): return getattr(d, '_defaults',{})
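For instance (illustrative values):
# Illustration: pull missing defaults from the steppers' _defaults into a dict of hyper-parameters
hyper_defaults = dict(lr=0.1)
maybe_update([weight_decay, sgd_step], hyper_defaults, get_defaults)
hyper_defaults    # {'lr': 0.1, 'wd': 0.0} -- 'wd' was added from weight_decay, 'lr' left untouched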
#export
class Optimizer():
def __init__(self, params, steppers, **defaults):
self.steppers = listify(steppers)
maybe_update(self.steppers, defaults, get_defaults)
# might be a generator
self.param_groups = list(params)
# ensure params is a list of lists
if not isinstance(self.param_groups[0], list): self.param_groups = [self.param_groups]
self.hypers = [{**defaults} for p in self.param_groups]
def grad_params(self):
return [(p,hyper) for pg,hyper in zip(self.param_groups,self.hypers)
for p in pg if p.grad is not None]
def zero_grad(self):
for p,hyper in self.grad_params():
p.grad.detach_()
p.grad.zero_()
def step(self):
for p,hyper in self.grad_params(): compose(p, self.steppers, **hyper)
sgd_opt = partial(Optimizer,steppers=[weight_decay, sgd_step])
learn, run = get_learn_run(nfs,data, 0.4, conv_layer, cbfs, sgd_opt)
Before trying to train, let's check that the behavior works as intended: when we don't provide a value for wd, we pull the corresponding default from weight_decay.
model = learn.model
opt = sgd_opt(model.parameters(), lr=0.1)
test_eq(opt.hypers[0]['wd'], 0.)
test_eq(opt.hypers[0]['lr'], 0.1)
But if we provide a value, it will override the default
opt = sgd_opt(model.parameters(), lr=0.2, wd=1e-4)
test_eq(opt.hypers[0]['wd'], 0.0001)
test_eq(opt.hypers[0]['lr'], 0.2)
Now, time to fit
cbfs = [partial(AvgStatsCallback, accuracy),
CudaCallback]
learn, run = get_learn_run(nfs, data, 0.3, conv_layer, cbfs, partial(sgd_opt, wd=0.01))
run.fit(1, learn)
Momentum requires us to add some state: we need to save the moving average of the gradients to be able to do the step, and store this inside the optimizer state.
To do this, we introduce statistics. Statistics are objects with two methods:
- init_state, which returns the initial state (a tensor of 0. for the moving average of gradients)
- update, which updates the state with the new gradient value
We also read the _defaults values of those objects, to allow them to provide default values for hyper-parameters.
class StatefulOptimizer(Optimizer):
def __init__(self, params, steppers, stats=None, **defaults):
self.stats = listify(stats)
maybe_update(self.stats, defaults, get_defaults)
super().__init__(params, steppers, **defaults)
self.state={}
def step(self):
for p, hyper in self.grad_params():
            if p not in self.state:
#Create a state for p and call all the statistics to initialize it.
self.state[p] = {}
maybe_update(self.stats, self.state[p], lambda o: o.init_state(p))
state = self.state[p]
for stat in self.stats: state = stat.update(p, state, **hyper)
compose(p, self.steppers, **state, **hyper)
self.state[p] = state
class Stat():
_defaults = {}
def init_state(self, p): raise NotImplementedError
def update(self, p, state, **kwargs): raise NotImplementedError
Here is an example of Stat:
class AverageGrad(Stat):
_defaults = dict(mom=0.9)
def init_state(self, p): return {'grad_avg': torch.zeros_like(p.grad.data)}
def update(self, p, state, mom, **kwargs):
state['grad_avg'].mul_(mom).add_(p.grad.data)
return state
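To see how a Stat and a stepper fit together, here is a hedged sketch of SGD with momentum; momentum_step and sgd_mom_opt are illustrative names, not the course's final definitions.
# Hedged sketch: SGD with momentum built from the pieces above (illustrative, not the course's exact code)
def momentum_step(p, lr, grad_avg, **kwargs):
    p.data.add_(-lr, grad_avg)     # step in the direction of the moving average of the gradients
    return p
sgd_mom_opt = partial(StatefulOptimizer, steppers=[momentum_step, weight_decay],
                      stats=AverageGrad(), wd=0.01)
# e.g. learn, run = get_learn_run(nfs, data, 0.3, conv_layer, cbs=cbfs, opt_func=sgd_mom_opt)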
from nb_09c import *
We start with PIL transforms to resize all our images to the same size. Then, when they are in a batch, we can apply data augmentation to all of them at the same time on the GPU.
make_rgb._order=0
path = datasets.untar_data(datasets.URLs.IMAGENETTE)
tfms = [make_rgb, ResizeFixed(128), to_byte_tensor, to_float_tensor]
def get_il(tfms): return ImageList.from_files(path, tfms=tfms)
il = get_il(tfms)
show_image(il[0])
il[0]
il.items[0]
A side note here: if we index into an ImageList directly, it will return the tensor of the image. However, if we index into ImageList.items, the path to the image will be returned.
PIL.Image.open accepts a path to open the image, hence the example below.
img = PIL.Image.open(il.items[0])
img
img.getbbox()
# Returns the pixel value at a given position.
# :param xy: The coordinate, given as (x, y).
img.getpixel((1,1))
# returns: RGB
import numpy as np
%timeit -n 10 a = np.array(PIL.Image.open(il.items[0]))
Be careful with resampling methods as we might quickly lose some textures
# ANTIALIAS is a good method to use when down-sampling(going from big to small)
img.resize((128,128), resample=PIL.Image.ANTIALIAS)
# The most common
img.resize((128,128), resample=PIL.Image.BILINEAR)
# Nearest neighbours
img.resize((128,128), resample=PIL.Image.NEAREST)
img.resize((256,256), resample=PIL.Image.BICUBIC).resize((128,128), resample=PIL.Image.NEAREST)
data
from nb_10b import *
path = datasets.untar_data(datasets.URLs.IMDB)
path.ls()
We define a subclass of ItemList that will read the texts in the corresponding filenames.
def read_file(fn):
with open(fn, 'r', encoding= 'utf8') as f : return f.read()
class TextList(ItemList):
@classmethod
def from_files(cls, path, extensions='.txt', recurse=True, include=None, **kwargs):
return cls(get_files(path, extensions, recurse=recurse, include=include), path, **kwargs)
def get(self, i):
if isinstance(i, Path): return read_file(i)
return i
We will use data from the train, test and unsupervised (unsup) folders.
il = TextList.from_files(path, include=['train','test','unsup'])
len(il.items)
# take a peek at an example
txt = il[0]
txt
p = path/'train'
p.ls()
# randomly split our data with 10% being hold-off for Validation set
def random_splitter(fn, p_valid): return random.random() < p_valid
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))
# remember, we will get a different result every time we run the line above because of the random split
sd
We need to tokenize the dataset first, which means splitting each sentence into individual tokens.
The library fastai prefers is the same one Dr. Dipanjan uses. We will be using spacy here.
import spacy, html
Before even tokenizing, we will apply a bit of preprocessing on the texts to clean them up.
These rules are applied before we split the sentences in tokens
# These rules are applied before we split the sentences in tokens
# Special tokens
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()
def sub_br(t):
"Replaces the <br /> by \n"
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
return re_br.sub("\n", t)
def spec_add_spaces(t):
"Add spaces around / and #"
return re.sub(r'([/#])', r' \1 ', t)
def rm_useless_spaces(t):
"Remove multiple spaces"
return re.sub(' {2,}', ' ', t)
def replace_rep(t):
"Replace repetitions at the character level: cccc -> TK_REP 4 c"
def _replace_rep(m:Collection[str]) -> str:
c,cc = m.groups()
return f' {TK_REP} {len(cc)+1} {c} '
re_rep = re.compile(r'(\S)(\1{3,})')
return re_rep.sub(_replace_rep, t)
def replace_wrep(t):
"Replace word repetitions: word word word -> TK_WREP 3 word"
def _replace_wrep(m:Collection[str]) -> str:
c,cc = m.groups()
return f' {TK_WREP} {len(cc.split())+1} {c} '
re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
return re_wrep.sub(_replace_wrep, t)
def fixup_text(x):
"Various messy things we've seen in documents"
re1 = re.compile(r' +')
x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
'<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
' @-@ ','-').replace('\\', ' \\ ')
return re1.sub(' ', html.unescape(x))
default_pre_rules = [fixup_text, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]
replace_rep('dddddd')
# needs a minimum of 4 repeated characters to trigger
replace_rep('ddd')
replace_wrep('sup sup sup sup ')
# needs a minimum of 4 repeated words to trigger, so this one is left alone
replace_wrep('sup sup sup ')
These rules are applied after the tokenization on the list of tokens.
# These rules are applied after the tokenization on the list of tokens.
def replace_all_caps(x):
"Replace token in ALL CAPS by their lower version and add 'TK_UP' before"
res = []
for t in x:
if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
else: res.append(t)
return res
def deal_caps(x):
"Replace all capitalized tokens in by their lower version and add 'TK_MAJ' before"
res = []
for t in x:
if t == '': continue
if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
res.append(t.lower())
return res
def add_eos_bos(x): return [BOS] + x + [EOS]
default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]
t = 'TAKE'
t.isupper()
t = 'TakE'
t.isupper()
t[0].isupper()
# one-letter-word (I) will be skipped
replace_all_caps(['I','AM','SHOUTING'])
deal_caps(['My','name','is','Sylar'])
Since tokenizing and applying those rules takes a bit of time, we will parallelize it using ProcessPoolExecutor to go faster.
#export
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor
def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            results = list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results
class TokenizeProcessor(Processor):
def __init__(self, lang='en', chunksize=2000, pre_rules=None, post_rules=None, max_workers=4):
self.chunksize, self.max_workers = chunksize, max_workers
self.tokenizer = spacy.blank(lang).tokenizer
for w in default_spec_tok:
self.tokenizer.add_special_case(w, [{ORTH: w}])
self.pre_rules = default_pre_rules if pre_rules is None else pre_rules
self.post_rules = default_post_rules if post_rules is None else post_rules
def proc_chunk(self, args):
i, chunk = args
chunk = [compose(t, self.pre_rules) for t in chunk]
docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
docs = [compose(t, self.post_rules) for t in docs]
return docs
def __call__(self, items):
toks = []
if isinstance(items[0], Path): items = [read_file(i) for i in items]
chunks = [items[i:i+self.chunksize] for i in range(0, len(items), self.chunksize)]
toks = parallel(self.proc_chunk, chunks, max_workers = self.max_workers)
return sum(toks, [])
    def proc1(self, item): return self.proc_chunk((0, [item]))[0]  # proc_chunk expects an (index, chunk) tuple
def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
def deproc1(self, tok): return " ".join(tok)
tp = TokenizeProcessor()
txt[:250]
'*'.join(tp(il[:10])[0])[:400]
Once we have tokenized our texts, we replace each token by an individual number; this is called numericalizing. Again, we do this with a processor (not so different from the CategoryProcessor).
#export
import collections
from collections import Counter
class NumericalizeProcessor(Processor):
def __init__(self, vocab=None, max_vocab=60000, min_freq=2):
self.vocab,self.max_vocab,self.min_freq = vocab,max_vocab,min_freq
def __call__(self, items):
#The vocab is defined on the first use.
if self.vocab is None:
freq = Counter(p for o in items for p in o)
self.vocab = [o for o,c in freq.most_common(self.max_vocab) if c >= self.min_freq]
for o in reversed(default_spec_tok):
if o in self.vocab: self.vocab.remove(o)
self.vocab.insert(0, o)
if getattr(self, 'otoi', None) is None:
self.otoi = collections.defaultdict(int,{v:k for k,v in enumerate(self.vocab)})
return [self.proc1(o) for o in items]
def proc1(self, item): return [self.otoi[o] for o in item]
def deprocess(self, idxs):
assert self.vocab is not None
return [self.deproc1(idx) for idx in idxs]
def deproc1(self, idx): return [self.vocab[i] for i in idx]
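A tiny round trip on made-up tokens (min_freq lowered so every token is kept):
# Made-up tokens; min_freq=1 so every token makes it into the vocab
np_demo = NumericalizeProcessor(min_freq=1)
ids = np_demo([['xxbos', 'the', 'cat'], ['xxbos', 'the', 'dog']])
ids, np_demo.deprocess(ids)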
c = Counter('abcbcbcbcaacccccc')
c.most_common
c['c']
default_spec_tok
print( [i for i in reversed(default_spec_tok)])
When we do language modeling, we will infer the labels from the text during training, so there's no need to label.
The training loop expects labels however, so we need to add dummy ones.
proc_tok, proc_num = TokenizeProcessor(max_workers=8), NumericalizeProcessor()
%time ll = label_by_func(sd, lambda x: 0, proc_x=[proc_tok, proc_num])
Once the items have been processed, they become lists of numbers; we can still access the underlying raw data with x_obj (or y_obj for the targets, but we don't have meaningful ones here).
ll.train.x_obj(0)
ll.train
To convert our LabelList to a DataBunch, we need to solve the issue of batching IMDb reviews of different lengths. We want to stream through all the texts concatenated. We also have to prepare the targets, which are the next words in the text.
All of this is done with the next object, called LM_PreLoader.
At the beginning of each epoch, it will shuffle the articles (if shuffle=True) and create one big stream by concatenating all of them. We divide this big stream into bs smaller streams, which we will read in chunks of bptt length.
from IPython.display import display, HTML
import pandas as pd
Let's say our stream is:
stream = """
In this notebook, we will go back over the example of classifying movie reviews we studied in part 1 and dig deeper under the surface.
First we will look at the processing steps necessary to convert text into numbers and how to customize it. By doing this, we'll have another example of the Processor used in the data block API.
Then we will study how we build a language model and train it.\n
"""
tokens = np.array(tp([stream])[0])
Then if we split it in 6 batches it would give something like this:
bs, seq_len = 6, 15
d_tokens = np.array([tokens[i*seq_len: (i+1)*seq_len] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False,header=None)))
Then if we have a bptt
of 5, we would go over those in three batches. (df.shape[1]/bptt
)
bs, bptt = 6,5
for k in range(int(df.shape[1]/bptt)):
d_tokens = np.array([tokens[i*seq_len + k*bptt:i*seq_len + (k+1)*bptt] for i in range(bs)])
df = pd.DataFrame(d_tokens)
display(HTML(df.to_html(index=False, header=None)))
df.shape
class LM_PreLoader():
def __init__(self, data, bs=64, bptt=70, shuffle=False):
self.data,self.bs,self.bptt,self.shuffle = data,bs,bptt,shuffle
total_len = sum([len(t) for t in data.x])
self.n_batch = total_len // bs
self.batchify()
def __len__(self): return ((self.n_batch-1) // self.bptt) * self.bs
def __getitem__(self, idx):
source = self.batched_data[idx % self.bs]
seq_idx = (idx // self.bs) * self.bptt
return source[seq_idx:seq_idx+self.bptt],source[seq_idx+1:seq_idx+self.bptt+1]
def batchify(self):
texts = self.data.x
if self.shuffle: texts = texts[torch.randperm(len(texts))]
stream = torch.cat([tensor(t) for t in texts])
self.batched_data = stream[:self.n_batch * self.bs].view(self.bs, self.n_batch)
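To make the indexing arithmetic concrete, here is an illustrative check with a made-up _FakeLL object standing in for a LabeledData (just two short token streams):
# Tiny illustrative check of the indexing
class _FakeLL():
    x = [list(range(0, 20)), list(range(100, 120))]   # two made-up "numericalized texts"
toy = LM_PreLoader(_FakeLL(), bs=4, bptt=3)
x0, y0 = toy[0]
x0, y0    # x0 is the first 3 tokens of stream 0; y0 is the same window shifted by one token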
15//4
15 % 2
dl = DataLoader(LM_PreLoader(ll.valid, shuffle=True), batch_size=64)
Let's check it all works ok: each batch should be bs by bptt.
iter_dl = iter(dl)
x1, y1 = next(iter_dl)
x2, y2 = next(iter_dl)
x1.size(), y1.size()
vocab = proc_num.vocab
" ".join(vocab[o] for o in x1[0])
# check if it's shifted by 1 place from x1
" ".join(vocab[o] for o in y1[0])
" ".join(vocab[o] for o in x2[0])
# some convenient functions to quickly do the above
def get_lm_dls(train_ds, valid_ds, bs, bptt, **kwargs):
return (DataLoader(LM_PreLoader(train_ds, bs, bptt, shuffle=True), batch_size=bs, **kwargs),
DataLoader(LM_PreLoader(valid_ds, bs, bptt, shuffle=False), batch_size=2*bs, **kwargs))
def lm_databunchify(sd, bs, bptt, **kwargs):
return DataBunch(*get_lm_dls(sd.train, sd.valid, bs, bptt, **kwargs))
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)
When we want to tackle classification, gathering the data will be a bit different: we will use a Sampler to sort our samples by length.