Changed the solver interface to take the learning rate and the layer details
object as inputs.  This allows the solvers to exhibit more complex behavior
that depends on the specific layer.  It also removes the learning rate from the
solver's parameter set and pushes it entirely into the core training code.
This also removes the need for the separate "step size", which previously was
multiplied with the output of the solvers.

Most of the code is still the same, and in the core and trainer the step_size
variables have just been renamed to learning_rate.  The dnn_trainer's relevant
member functions have also been renamed.

The examples have been updated to reflect these API changes.  I also cleaned up
the resnet definition and added better downsampling.
This commit is contained in:
Davis King 2016-05-14 20:30:45 -04:00
parent 8421f213ad
commit 66166c674d
8 changed files with 256 additions and 226 deletions

View File

@ -825,16 +825,16 @@ namespace dlib
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0)
{
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
tt::add(1,details.get_layer_params(), step_size, step);
const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
tt::add(details.get_layer_params(), details.get_layer_params(), step);
}
subnetwork->update_parameters(solvers.pop(), step_size);
subnetwork->update_parameters(solvers.pop(), learning_rate);
}
const tensor& get_parameter_gradient(
@ -1175,13 +1175,14 @@ namespace dlib
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
DLIB_CASSERT(solvers.size()>=num_computational_layers,"");
// Don't try to adjust the parameters if this layer doesn't have any.
if (params_grad.size() != 0) {
const tensor& step = solvers.top()(details.get_layer_params(), static_cast<const tensor&>(params_grad));
tt::add(1,details.get_layer_params(), step_size, step);
if (params_grad.size() != 0)
{
const tensor& step = solvers.top()(learning_rate, details, static_cast<const tensor&>(params_grad));
tt::add(details.get_layer_params(), details.get_layer_params(), step);
}
}
@ -1401,9 +1402,9 @@ namespace dlib
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
subnetwork.update_parameters(solvers, step_size);
subnetwork.update_parameters(solvers, learning_rate);
}
const tensor& get_parameter_gradient(
@ -1687,11 +1688,11 @@ namespace dlib
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
for (size_t i = 0; i < details.size(); ++i)
details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),step_size);
subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),step_size);
details[i].update_parameters(solvers.pop(comp_layers_in_each_group*i),learning_rate);
subnetwork.update_parameters(solvers.pop(comp_layers_in_each_group*details.size()),learning_rate);
}
const subnet_type& subnet() const { return subnetwork; }
@ -1905,7 +1906,7 @@ namespace dlib
}
template <typename solver_type>
void update_parameters(sstack<solver_type> /*solvers*/, double /*step_size*/)
void update_parameters(sstack<solver_type> /*solvers*/, double /*learning_rate*/)
{
// nothing to do
}
@ -2248,10 +2249,10 @@ namespace dlib
template <typename solver_type>
void update_parameters (
sstack<solver_type> solvers,
double step_size
double learning_rate
)
{
subnetwork.update_parameters(solvers, step_size);
subnetwork.update_parameters(solvers, learning_rate);
}
const subnet_type& subnet() const { return subnetwork; }
@ -2542,9 +2543,9 @@ namespace dlib
}
template <typename solver_type>
void update_parameters(sstack<solver_type> solvers, double step_size)
void update_parameters(sstack<solver_type> solvers, double learning_rate)
{
subnetwork.update_parameters(solvers, step_size);
subnetwork.update_parameters(solvers, learning_rate);
}
const tensor& get_parameter_gradient(

View File

@ -506,7 +506,7 @@ namespace dlib
template <typename solver_type>
void update_parameters(
sstack<solver_type> solvers,
double step_size
double learning_rate
);
/*!
requires
@ -517,13 +517,14 @@ namespace dlib
if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
- 0 < learning_rate <= 1
ensures
- Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add
step_size times that vector to the layer's parameters.
solver object. This produces a parameter delta vector which we add to
the layer's parameters.
- The solvers use the given learning rate.
!*/
void clean(
@ -944,7 +945,7 @@ namespace dlib
template <typename solver_type>
void update_parameters (
sstack<solver_type> solvers,
double step_size
double learning_rate
);
/*!
requires
@ -955,13 +956,14 @@ namespace dlib
is, if you want to call update_parameters() on some other neural network
object then you must NOT reuse the same solvers object.
- solvers.size() >= num_computational_layers
- 0 < step_size <= 1
- 0 < learning_rate <= 1
ensures
- Updates all the parameters in the network. In particular, we pass each
layer's parameter gradient (i.e. the tensor returned by the layer's
get_parameter_gradient() member) through that layer's corresponding
solver object. This produces a parameter delta vector and we add
step_size times that vector to the layer's parameters.
solver object. This produces a parameter delta vector which we add to
the layer's parameters.
- The solvers use the given learning rate.
!*/
// -------------

View File

@ -14,30 +14,34 @@ namespace dlib
public:
sgd(
float learning_rate_ = 0.01,
float weight_decay_ = 0.0005,
float momentum_ = 0.9
float weight_decay_,
float momentum_
)
{
weight_decay = weight_decay_;
learning_rate = learning_rate_;
momentum = momentum_;
}
sgd(
) : sgd(0.0005, 0.9)
{
}
float get_momentum (
) const { return momentum; }
float get_weight_decay (
) const { return weight_decay; }
float get_learning_rate (
) const { return learning_rate; }
template <typename layer_type>
const tensor& operator() (
const tensor& params,
const float learning_rate,
const layer_type& l,
const tensor& params_grad
)
{
const tensor& params = l.get_layer_params();
DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0)
{
@ -54,10 +58,9 @@ namespace dlib
friend void serialize(const sgd& item, std::ostream& out)
{
serialize("sgd", out);
serialize("sgd2", out);
serialize(item.v, out);
serialize(item.weight_decay, out);
serialize(item.learning_rate, out);
serialize(item.momentum, out);
}
@ -65,18 +68,16 @@ namespace dlib
{
std::string version;
deserialize(version, in);
if (version != "sgd")
if (version != "sgd2")
throw serialization_error("Unexpected version found while deserializing dlib::sgd.");
deserialize(item.v, in);
deserialize(item.weight_decay, in);
deserialize(item.learning_rate, in);
deserialize(item.momentum, in);
}
private:
resizable_tensor v;
float weight_decay;
float learning_rate;
float momentum;
};
@ -87,19 +88,21 @@ namespace dlib
public:
adam(
float learning_rate_ = 0.001,
float weight_decay_ = 0.0005,
float momentum1_ = 0.9,
float momentum2_ = 0.999
float weight_decay_,
float momentum1_,
float momentum2_
)
{
weight_decay = weight_decay_;
learning_rate = learning_rate_;
momentum1 = momentum1_;
momentum2 = momentum2_;
t = 0;
}
adam(
) : adam(0.0005, 0.9, 0.999)
{}
float get_momentum1 (
) const { return momentum1; }
@ -109,14 +112,14 @@ namespace dlib
float get_weight_decay (
) const { return weight_decay; }
float get_learning_rate (
) const { return learning_rate; }
template <typename layer_type>
const tensor& operator() (
const tensor& params,
const float learning_rate,
const layer_type& l,
const tensor& params_grad
)
{
const tensor& params = l.get_layer_params();
DLIB_CASSERT(params.size() != 0,"");
if (v.size() == 0)
{
@ -136,12 +139,11 @@ namespace dlib
friend void serialize(const adam& item, std::ostream& out)
{
serialize("adam", out);
serialize("adam2", out);
serialize(item.m, out);
serialize(item.v, out);
serialize(item.s, out);
serialize(item.weight_decay, out);
serialize(item.learning_rate, out);
serialize(item.momentum1, out);
serialize(item.momentum2, out);
serialize(item.t, out);
@ -151,13 +153,12 @@ namespace dlib
{
std::string version;
deserialize(version, in);
if (version != "adam")
if (version != "adam2")
throw serialization_error("Unexpected version found while deserializing dlib::adam.");
deserialize(item.m, in);
deserialize(item.v, in);
deserialize(item.s, in);
deserialize(item.weight_decay, in);
deserialize(item.learning_rate, in);
deserialize(item.momentum1, in);
deserialize(item.momentum2, in);
deserialize(item.t, in);
@ -168,7 +169,6 @@ namespace dlib
resizable_tensor v;
resizable_tensor s;
float weight_decay;
float learning_rate;
float momentum1;
float momentum2;
float t;

View File

@ -33,22 +33,28 @@ namespace dlib
EXAMPLE_SOLVER(
);
template <typename layer_type>
const tensor& operator() (
const tensor& params,
const float learning_rate,
const layer_type& l,
const tensor& params_grad
)
/*!
requires
- params.size() != 0
- have_same_dimensions(params, params_grad) == true.
- l.get_layer_params().size() != 0
- have_same_dimensions(l.get_layer_params(), params_grad) == true.
- When this function is invoked on a particular solver instance, it is
always supplied with parameters from the same layer instance. That is,
the solver is allowed to remember things from one invocation to another
and to assume that it is being serially applied to optimize the same
always supplied with the same layer instance, l. That is, the solver is
allowed to remember things from one invocation to another and to assume
that it is being serially applied to optimize the same layer's
parameters.
ensures
- Returns a step vector V that is intended to be used to update the
parameters by adding V to params.
parameters by adding V to l.get_layer_params().
- This function will use the given "learning rate" to compute V. How the
learning rate is used is solver dependent. But in general the learning
rate should be used to select the step size, i.e. to somehow determine
the magnitude of V.
!*/
};
@ -68,32 +74,34 @@ namespace dlib
WHAT THIS OBJECT REPRESENTS
This object implements the EXAMPLE_SOLVER interface defined above. It is a
basic stochastic gradient descent solver which uses momentum and weight
decay. In particular, it performs the following update each time the
solver is invoked:
v = momentum*v - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
l.get_layer_params() += v;
Here v is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
decay. In particular, it computes the update vector V according to:
V = momentum*V - weight_decay*learning_rate*l.get_layer_params() - learning_rate*params_grad;
Here V is a momentum term that is remembered by the solver from one
invocation of operator() to the next.
!*/
public:
sgd(
float learning_rate = 0.01,
float weight_decay = 0.0005,
float momentum = 0.9
);
/*!
ensures
- #get_weight_decay() == 0.0005
- #get_momentum() == 0.9
!*/
sgd(
float weight_decay,
float momentum
);
/*!
requires
- learning_rate > 0
- weight_decay >= 0
- momentum >= 0
ensures
- #get_learning_rate() == learning_rate
- #get_weight_decay() == weight_decay
- #get_momentum() == momentum
!*/
float get_learning_rate () const;
float get_weight_decay () const;
float get_momentum () const;
};
@ -120,25 +128,30 @@ namespace dlib
public:
adam(
float learning_rate = 0.001,
float weight_decay = 0.0005,
float momentum1 = 0.9,
float momentum2 = 0.999
);
/*!
ensures
- #get_weight_decay() == 0.0005
- #get_momentum1() == 0.9
- #get_momentum2() == 0.999
!*/
adam(
float weight_decay,
float momentum1,
float momentum2
);
/*!
requires
- learning_rate > 0
- weight_decay >= 0
- 0 <= momentum1 < 1
- 0 <= momentum2 < 1
ensures
- #get_learning_rate() == learning_rate
- #get_weight_decay() == weight_decay
- #get_momentum1() == momentum1
- #get_momentum2() == momentum2
!*/
float get_learning_rate () const;
float get_weight_decay () const;
float get_momentum1 () const;
float get_momentum2 () const;

View File

@ -193,7 +193,7 @@ namespace dlib
{
last_time = now_time;
std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
@ -219,7 +219,7 @@ namespace dlib
{
last_time = now_time;
std::cout << "step#: " << rpad(cast_to_string(train_one_step_calls),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
@ -243,13 +243,13 @@ namespace dlib
// instead use class members is so we can include the state of the loops in the
// stuff written by sync_to_disk()
for (;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate;
++epoch_iteration)
{
using namespace std::chrono;
last_time = system_clock::now();
clear_average_loss();
for (; epoch_pos < data.size() && step_size >= min_step_size; epoch_pos += mini_batch_size)
for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size)
{
if (verbose)
{
@ -259,7 +259,7 @@ namespace dlib
last_time = now_time;
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
@ -279,7 +279,7 @@ namespace dlib
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
@ -305,13 +305,13 @@ namespace dlib
// instead use class members is so we can include the state of the loops in the
// stuff written by sync_to_disk()
for (;
epoch_iteration < max_num_epochs && step_size >= min_step_size;
epoch_iteration < max_num_epochs && learning_rate >= min_learning_rate;
++epoch_iteration)
{
using namespace std::chrono;
last_time = system_clock::now();
clear_average_loss();
for (; epoch_pos < data.size() && step_size >= min_step_size; epoch_pos += mini_batch_size)
for (; epoch_pos < data.size() && learning_rate >= min_learning_rate; epoch_pos += mini_batch_size)
{
if (verbose)
{
@ -321,7 +321,7 @@ namespace dlib
last_time = now_time;
auto iter = epoch_iteration + epoch_pos/(double)data.size();
std::cout << "epoch: " << rpad(cast_to_string(iter),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
@ -340,7 +340,7 @@ namespace dlib
// Capitalize the E in Epoch so it's easy to grep out the lines that
// are for full epoch status statements.
std::cout << "Epoch: " << rpad(cast_to_string(epoch_iteration+1),epoch_string_pad) << " "
<< "step size: " << rpad(cast_to_string(step_size),ss_string_pad) << " "
<< "learning rate: " << rpad(cast_to_string(learning_rate),lr_string_pad) << " "
<< "average loss: " << rpad(cast_to_string(get_average_loss()),string_pad) << " "
<< "steps without apparent progress: " << steps_without_progress
<< std::endl;
@ -380,35 +380,35 @@ namespace dlib
rs.clear();
}
void set_step_size (
double ss
void set_learning_rate (
double lr
)
{
DLIB_CASSERT(ss > 0,"");
DLIB_CASSERT(lr > 0,"");
wait_for_thread_to_pause();
if (step_size != ss)
if (learning_rate != lr)
previous_loss_values.clear();
step_size = ss;
learning_rate = lr;
}
double get_step_size(
double get_learning_rate(
) const
{
return step_size;
return learning_rate;
}
void set_min_step_size (
double ss
void set_min_learning_rate (
double lr
)
{
DLIB_CASSERT(ss > 0,"");
min_step_size = ss;
DLIB_CASSERT(lr > 0,"");
min_learning_rate = lr;
}
double get_min_step_size (
double get_min_learning_rate (
) const
{
return min_step_size;
return min_learning_rate;
}
void set_iterations_without_progress_threshold (
@ -424,18 +424,18 @@ namespace dlib
return iter_without_progress_thresh;
}
void set_step_size_shrink_amount (
void set_learning_rate_shrink_amount (
double shrink
)
{
DLIB_CASSERT(0 < shrink && shrink <= 1,"");
step_size_shrink = shrink;
learning_rate_shrink = shrink;
}
double get_step_size_shrink (
double get_learning_rate_shrink (
) const
{
return step_size_shrink;
return learning_rate_shrink;
}
private:
@ -490,7 +490,7 @@ namespace dlib
{
auto&& dev = *devices[device];
dlib::cuda::set_device(dev.device_id);
dev.net.update_parameters(make_sstack(dev.solvers), step_size);
dev.net.update_parameters(make_sstack(dev.solvers), learning_rate);
}
void thread() try
@ -592,18 +592,18 @@ namespace dlib
}
// If we have been running for a while then check if the loss is still
// dropping. If it isn't then we will reduce the step size. Note that we
// dropping. If it isn't then we will reduce the learning rate. Note that we
// have a "budget" that prevents us from calling
// count_steps_without_decrease() every iteration. We do this because
// it can be expensive to compute when previous_loss_values is large.
if (gradient_check_budget > iter_without_progress_thresh && step_size_shrink != 1)
if (gradient_check_budget > iter_without_progress_thresh && learning_rate_shrink != 1)
{
gradient_check_budget = 0;
steps_without_progress = count_steps_without_decrease(previous_loss_values);
if (steps_without_progress >= iter_without_progress_thresh)
{
// optimization has flattened out, so drop the learning rate.
step_size = step_size_shrink*step_size;
learning_rate = learning_rate_shrink*learning_rate;
steps_without_progress = 0;
previous_loss_values.clear();
}
@ -623,18 +623,18 @@ namespace dlib
const static long string_pad = 10;
const static long epoch_string_pad = 4;
const static long ss_string_pad = 4;
const static long lr_string_pad = 4;
void init()
{
max_num_epochs = 10000;
mini_batch_size = 128;
verbose = false;
step_size = 1;
min_step_size = 1e-3;
learning_rate = 1e-2;
min_learning_rate = 1e-5;
iter_without_progress_thresh = 2000;
steps_without_progress = 0;
step_size_shrink = 0.1;
learning_rate_shrink = 0.1;
epoch_iteration = 0;
epoch_pos = 0;
train_one_step_calls = 0;
@ -661,11 +661,11 @@ namespace dlib
serialize(item.verbose, out);
serialize(item.net, out);
serialize(item.devices[0]->solvers, out);
serialize(item.step_size.load(), out);
serialize(item.min_step_size, out);
serialize(item.learning_rate.load(), out);
serialize(item.min_learning_rate, out);
serialize(item.iter_without_progress_thresh.load(), out);
serialize(item.steps_without_progress.load(), out);
serialize(item.step_size_shrink.load(), out);
serialize(item.learning_rate_shrink.load(), out);
serialize(item.epoch_iteration, out);
serialize(item.epoch_pos, out);
serialize(item.train_one_step_calls, out);
@ -697,11 +697,11 @@ namespace dlib
deserialize(item.verbose, in);
deserialize(item.net, in);
deserialize(item.devices[0]->solvers, in);
deserialize(dtemp, in); item.step_size = dtemp;
deserialize(item.min_step_size, in);
deserialize(dtemp, in); item.learning_rate = dtemp;
deserialize(item.min_learning_rate, in);
deserialize(ltemp, in); item.iter_without_progress_thresh = ltemp;
deserialize(ltemp, in); item.steps_without_progress = ltemp;
deserialize(dtemp, in); item.step_size_shrink = dtemp;
deserialize(dtemp, in); item.learning_rate_shrink = dtemp;
deserialize(item.epoch_iteration, in);
deserialize(item.epoch_pos, in);
deserialize(item.train_one_step_calls, in);
@ -845,11 +845,11 @@ namespace dlib
size_t mini_batch_size;
bool verbose;
net_type& net;
std::atomic<double> step_size;
double min_step_size;
std::atomic<double> learning_rate;
double min_learning_rate;
std::atomic<unsigned long> iter_without_progress_thresh;
std::atomic<unsigned long> steps_without_progress;
std::atomic<double> step_size_shrink;
std::atomic<double> learning_rate_shrink;
std::chrono::time_point<std::chrono::system_clock> last_sync_time;
std::string sync_filename;
std::chrono::seconds time_between_syncs;

View File

@ -68,10 +68,10 @@ namespace dlib
provided solver instance.
- #get_max_num_epochs() == 10000
- #get_mini_batch_size() == 128
- #get_step_size() == 1
- #get_min_step_size() == 1e-3
- #get_learning_rate() == 1e-2
- #get_min_learning_rate() == 1e-5
- #get_iterations_without_progress_threshold() == 2000
- #get_step_size_shrink() == 0.1
- #get_learning_rate_shrink() == 0.1
- if (cuda_extra_devices.size() > 0) then
- This object will use multiple graphics cards to run the learning
algorithms. In particular, it will always use whatever device is
@ -102,6 +102,8 @@ namespace dlib
get_net(). In particular, the first layer's solver is
get_solvers()[0], the second layer's solver is
get_solvers()[1], and so on.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
unsigned long get_mini_batch_size (
@ -142,54 +144,51 @@ namespace dlib
- #get_max_num_epochs() == num
!*/
void set_step_size (
double ss
void set_learning_rate (
double lr
);
/*!
requires
- ss > 0
- lr > 0
ensures
- #get_step_size() == ss
- #get_learning_rate() == lr
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
double get_step_size(
double get_learning_rate(
) const;
/*!
ensures
- During each training step, a solver tells us how to modify the parameters
of each layer in the network. It does this by outputting a step vector,
that when added to the parameters, will hopefully result in improved
network performance. In our case, at each step, we multiply the step
vector from the solver by get_step_size() before adding it to the
parameters. Therefore, get_step_size() controls the "learning rate" used
during training.
It should be emphasized that this learning rate applied by dnn_trainer is
independent from any learning rate scheduling a solver might itself apply
to the step vector it outputs. That is, the dnn_trainer doesn't know
what the solver is doing. It just takes the output from a solver and
multiplies it by get_step_size() before applying the step vector.
of each layer in the network. It does this by outputting a step vector
that, when added to the parameters, will hopefully result in improved
network performance. The learning rate is one of the inputs to the
solver and influences the size of this step vector.
!*/
void set_min_step_size (
double ss
void set_min_learning_rate (
double lr
);
/*!
requires
- ss > 0
- lr > 0
ensures
- #get_min_step_size() == ss
- #get_min_learning_rate() == lr
!*/
double get_min_step_size (
double get_min_learning_rate (
) const;
/*!
ensures
- During training, this object will test if progress is still being made
and if it isn't then it will reduce get_step_size() by setting it to
get_step_size()*get_step_size_shrink(). However, it will not reduce it
below get_min_step_size(). Once this minimum step size is crossed the
training will terminate.
- During training via this->train(), this object will test if progress is
still being made and if it isn't then it will reduce get_learning_rate()
by setting it to get_learning_rate()*get_learning_rate_shrink().
However, it will not reduce it below get_min_learning_rate(). Once this
minimum learning rate is crossed the training will terminate.
- get_min_learning_rate() doesn't apply if you are using train_one_step().
You can keep calling train_one_step() as many times as you want and the
learning rate will drop arbitrarily close to 0 if you run long enough.
!*/
void set_iterations_without_progress_threshold (
@ -209,33 +208,33 @@ namespace dlib
get_iterations_without_progress_threshold() mini-batch results and
applying the statistical test defined by the running_gradient object to
see if the training error is getting smaller. If it isn't being reduced
then get_step_size() is made smaller by a factor of get_step_size_shrink().
then get_learning_rate() is made smaller by a factor of get_learning_rate_shrink().
Therefore, get_iterations_without_progress_threshold() should always be
set to something sensibly large so that this test can be done with
reasonably high confidence. Think of this test as saying "if the loss
hasn't decreased for the previous get_iterations_without_progress_threshold()
then shrink the step size".
then shrink the learning rate".
!*/
void set_step_size_shrink_amount (
void set_learning_rate_shrink_amount (
double shrink
);
/*!
requires
- 0 < shrink && shrink <= 1
ensures
- #get_step_size_shrink() == shrink
- #get_learning_rate_shrink() == shrink
!*/
double get_step_size_shrink (
double get_learning_rate_shrink (
) const;
/*!
ensures
- Whenever the training routine thinks it isn't making progress anymore it
will reduce get_step_size() by multiplying it by get_step_size_shrink().
- You can disable the automatic step size reduction by setting
get_step_size_shrink() to 1.
will reduce get_learning_rate() by multiplying it by get_learning_rate_shrink().
- You can disable the automatic learning rate reduction by setting
get_learning_rate_shrink() to 1.
!*/
void be_verbose (
@ -283,8 +282,8 @@ namespace dlib
- Trains a supervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end(), labels.begin()).
- The optimizer will run until get_step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- The optimizer will run until get_learning_rate() < get_min_learning_rate()
or get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- Each call to train DOES NOT reinitialize the state of get_net() or
@ -309,8 +308,8 @@ namespace dlib
- Trains an unsupervised neural network based on the given training data.
The goal of training is to find the network parameters that minimize
get_net().compute_loss(data.begin(), data.end()).
- The optimizer will run until get_step_size() < get_min_step_size() or
get_max_num_epochs() training epochs have been executed.
- The optimizer will run until get_learning_rate() < get_min_learning_rate()
or get_max_num_epochs() training epochs have been executed.
- Each layer in the network will be optimized by its corresponding solver
in get_solvers().
- Each call to train DOES NOT reinitialize the state of get_net() or
@ -381,6 +380,8 @@ namespace dlib
- Note that, if be_verbose() has been called, then this object will
automatically call clear_average_loss() periodically when it logs the
loss to the console.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
void clear_average_loss (
@ -393,6 +394,8 @@ namespace dlib
applied during training. Calling clear_average_loss() resets the
running_stats object so it forgets about all previous loss values
observed.
- This function blocks until all threads inside the dnn_trainer have
stopped touching the net.
!*/
};

View File

@ -33,23 +33,27 @@ using namespace dlib;
// It exists solely so other layers can refer to it. In this case, the
// add_prev1 layer looks for the tag1 layer and will take the tag1 output and
// add it to the input of the add_prev1 layer. This combination allows us to
// implement skip and residual style networks.
template <int stride, typename SUBNET>
using base_res = relu<add_prev1<bn_con<con<8,3,3,1,1,relu<bn_con<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;
// implement skip and residual style networks. We have also made base_res
// parameterized by BN, which will let us insert different batch normalization
// layers.
template <template <typename> class BN, typename SUBNET>
using base_res = relu<add_prev1<BN<con<8,3,3,1,1,relu<BN<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;
// Let's also define the same block but with all the batch normalization layers
// replaced with affine transform layers. We will use this type of construction
// when testing our networks.
template <int stride, typename SUBNET>
using base_ares = relu<add_prev1<affine<con<8,3,3,1,1,relu<affine<con<8,3,3,stride,stride,tag1<SUBNET>>>>>>>>;
// We also want a residual block that begins by doing downsampling. We can
// reuse base_res to define it like this:
template <template <typename> class BN, typename SUBNET>
using base_res_down = base_res<BN,avg_pool<1,1,2,2,SUBNET>>;
// And of course we can define more alias templates based on previously defined
// alias templates. The _down versions downsample the inputs by a factor of 2
// while the res and ares layer types don't.
template <typename SUBNET> using res = base_res<1,SUBNET>;
template <typename SUBNET> using res_down = base_res<2,SUBNET>;
template <typename SUBNET> using ares = base_ares<1,SUBNET>;
template <typename SUBNET> using ares_down = base_ares<2,SUBNET>;
// Now we can define 4 different residual blocks we will use in this example.
// The first two are non-downsampling residual blocks while the last two
// downsample. Also, res and res_down use batch normalization while ares and
// ares_down have had the batch normalization replaced with simple affine
// layers. We will use the affine version of the layers when testing our
// networks.
template <typename SUBNET> using res = base_res<bn_con,SUBNET>;
template <typename SUBNET> using ares = base_res<affine,SUBNET>;
template <typename SUBNET> using res_down = base_res_down<bn_con,SUBNET>;
template <typename SUBNET> using ares_down = base_res_down<affine,SUBNET>;
@ -141,37 +145,39 @@ int main(int argc, char** argv) try
// These print statements will output this (I've truncated it since it's
// long, but you get the idea):
/*
The pnet has 125 layers in it.
layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10)
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev
layer<5> bn_con
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
layer<10> tag1
The pnet has 127 layers in it.
layer<0> loss_multiclass_log
layer<1> fc (num_outputs=10)
layer<2> avg_pool (nr=0, nc=0, stride_y=1, stride_x=1, padding_y=0, padding_x=0)
layer<3> prelu (initial_param_value=0.2)
layer<4> add_prev
layer<5> bn_con
layer<6> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<7> prelu (initial_param_value=0.25)
layer<8> bn_con
layer<9> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<10> tag1
...
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
layer<34> tag1
layer<35> tag4
layer<36> prelu (initial_param_value=0.3)
layer<37> add_prev
layer<38> bn_con
layer<33> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<34> tag1
layer<35> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<36> tag4
layer<37> prelu (initial_param_value=0.3)
layer<38> add_prev
layer<39> bn_con
...
layer<114> con (num_filters=8, nr=3, nc=3, stride_y=2, stride_x=2)
layer<115> tag1
layer<116> relu
layer<117> add_prev
layer<118> bn_con
layer<119> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
layer<120> relu
layer<121> bn_con
layer<122> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1)
layer<123> tag1
layer<124> input<matrix>
layer<115> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<116> tag1
layer<117> avg_pool (nr=1, nc=1, stride_y=2, stride_x=2, padding_y=0, padding_x=0)
layer<118> relu
layer<119> add_prev
layer<120> bn_con
layer<121> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<122> relu
layer<123> bn_con
layer<124> con (num_filters=8, nr=3, nc=3, stride_y=1, stride_x=1, padding_y=1, padding_x=1)
layer<125> tag1
layer<126> input<matrix>
*/
// Now that we know the index numbers for each layer, we can access them
@ -189,7 +195,7 @@ int main(int argc, char** argv) try
// parts of your network and access them by layer<tag>(). You can also
// index relative to a tag. So for example, to access the layer immediately
// after tag4 you can say:
layer<tag4,1>(pnet); // Equivalent to layer<35+1>(pnet).
layer<tag4,1>(pnet); // Equivalent to layer<36+1>(pnet).
// Or to access the layer 2 layers after tag4:
layer<tag4,2>(pnet);
@ -203,23 +209,26 @@ int main(int argc, char** argv) try
// talk about training networks!
// The dnn_trainer will use SGD by default, but you can tell it to use
// different solvers like adam.
dnn_trainer<net_type,adam> trainer(net,adam(0.001));
// different solvers like adam with a weight decay of 0.0005 and the given
// momentum parameters.
dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999));
// Also, if you have multiple graphics cards you can tell the trainer to use
// them together to make the training faster. For example, replacing the
// above constructor call with this one would cause it to use GPU cards 0
// and 1.
//dnn_trainer<net_type,adam> trainer(net,adam(0.001), {0,1});
//dnn_trainer<net_type,adam> trainer(net,adam(0.0005, 0.9, 0.999), {0,1});
trainer.be_verbose();
trainer.set_synchronization_file("mnist_resnet_sync", std::chrono::seconds(100));
// While the trainer is running it keeps an eye on the training error. If
// it looks like the error hasn't decreased for the last 2000 iterations it
// will automatically reduce the step size by 0.1. You can change these
// will automatically multiply the learning rate by 0.1. You can change these
// default parameters to some other values by calling these functions. Or
// disable them entirely by setting the shrink amount to 1.
// disable the automatic shrinking entirely by setting the shrink amount to 1.
trainer.set_iterations_without_progress_threshold(2000);
trainer.set_step_size_shrink_amount(0.1);
trainer.set_learning_rate_shrink_amount(0.1);
// The learning rate will start at 1e-3.
trainer.set_learning_rate(1e-3);
// Now, what if your training dataset is so big it doesn't fit in RAM? You
@ -230,10 +239,10 @@ int main(int argc, char** argv) try
std::vector<matrix<unsigned char>> mini_batch_samples;
std::vector<unsigned long> mini_batch_labels;
dlib::rand rnd(time(0));
// Loop until the trainer's automatic shrinking has shrunk the step size by
// 1e-3. For the default shrink amount of 0.1 this means stop after it
// shrinks it 3 times.
while(trainer.get_step_size() >= 1e-3)
// Loop until the trainer's automatic shrinking has shrunk the learning rate to 1e-6.
// Given our settings, this means it will stop training after it has shrunk the
// learning rate 3 times.
while(trainer.get_learning_rate() >= 1e-6)
{
mini_batch_samples.clear();
mini_batch_labels.clear();

View File

@ -89,7 +89,9 @@ int main(int argc, char** argv) try
net_type net;
// And then train it using the MNIST data. The code below uses mini-batch stochastic
// gradient descent with an initial learning rate of 0.01 to accomplish this.
dnn_trainer<net_type> trainer(net,sgd(0.01));
dnn_trainer<net_type> trainer(net);
trainer.set_learning_rate(0.01);
trainer.set_min_learning_rate(0.00001);
trainer.set_mini_batch_size(128);
trainer.be_verbose();
// Since DNN training can take a long time, we can ask the trainer to save its state to
@ -97,11 +99,11 @@ int main(int argc, char** argv) try
// start it again it will begin where it left off rather than restarting the training
// from scratch.
trainer.set_synchronization_file("mnist_sync", std::chrono::seconds(20));
// Finally, this line begins training. By default, it runs SGD with our specified step
// size until the loss stops decreasing. Then it reduces the step size by a factor of
// 10 and continues running until the loss stops decreasing again. It will reduce the
// step size 3 times and then terminate. For a longer discussion, see the documentation
// of the dnn_trainer object.
// Finally, this line begins training. By default, it runs SGD with our specified
// learning rate until the loss stops decreasing. Then it reduces the learning rate by
// a factor of 10 and continues running until the loss stops decreasing again. It will
// keep doing this until the learning rate has dropped below the min learning rate
// defined above or the maximum number of epochs has been executed (defaulted to 10000).
trainer.train(training_images, training_labels);
// At this point our net object should have learned how to classify MNIST images. But