Added visit_layer_parameter_gradients() and also fixed a silly synchronization error in the multi-gpu training code.
Davis King 2016-04-19 06:45:35 -04:00
parent d31723ff45
commit 168574bd6c
3 changed files with 138 additions and 5 deletions

dlib/dnn/core.h

@@ -3049,6 +3049,73 @@ namespace dlib
impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v);
}
// ----------------------------------------------------------------------------------------
namespace impl
{
template <size_t i, size_t num>
struct vlpg_loop
{
template <typename T, typename U>
static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
{
// intentionally left empty
}
template <typename T, typename U>
static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v , size_t& comp_i, U&& l )
{
v(comp_i, l.get_parameter_gradient());
++comp_i;
}
template <
typename net_type,
typename visitor
>
static void visit(
size_t comp_i,
net_type& net,
visitor&& v
)
{
invoke_functor(v, comp_i, layer<i>(net));
vlpg_loop<i+1, num>::visit(comp_i, net, v);
}
};
template <size_t num>
struct vlpg_loop<num,num>
{
template <
typename net_type,
typename visitor
>
static void visit(
size_t,
net_type&,
visitor&&
)
{
// Base case of recursion. Don't do anything.
}
};
}
template <
typename net_type,
typename visitor
>
void visit_layer_parameter_gradients(
net_type& net,
visitor v
)
{
size_t comp_i = 0;
impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v);
}
// ----------------------------------------------------------------------------------------
}
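A note on the pattern used above: the two `invoke_functor()` overloads are selected at compile time via `std::enable_if` on `is_add_layer`, so only computational layers reach the visitor and the index it sees stays dense. Below is a minimal standalone sketch of that dispatch technique; the `has_params` trait and toy layer types are invented stand-ins for dlib's `is_add_layer` and layer objects, not dlib code:

```cpp
#include <cstddef>
#include <iostream>
#include <type_traits>

// Toy stand-ins for dlib's layer objects (hypothetical, for illustration only).
struct computational_layer { float params = 0; };
struct tag_layer {};

// Stand-in for dlib's is_add_layer trait.
template <typename T> struct has_params : std::false_type {};
template <>           struct has_params<computational_layer> : std::true_type {};

// Selected when the layer has parameters: call the visitor and advance the index.
template <typename V, typename T>
typename std::enable_if<has_params<typename std::decay<T>::type>::value>::type
invoke(V&& v, std::size_t& idx, T&& l)
{
    v(idx, l.params);
    ++idx;
}

// Selected otherwise: do nothing, so non-computational layers are skipped
// without leaving gaps in the index sequence the visitor observes.
template <typename V, typename T>
typename std::enable_if<!has_params<typename std::decay<T>::type>::value>::type
invoke(V&&, std::size_t&, T&&)
{
}

int main()
{
    computational_layer a, c;
    tag_layer b;
    std::size_t idx = 0;
    auto v = [](std::size_t i, float& p) { std::cout << "layer " << i << ": " << p << '\n'; };
    invoke(v, idx, a);  // visited as index 0
    invoke(v, idx, b);  // skipped, idx unchanged
    invoke(v, idx, c);  // visited as index 1
}
```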

dlib/dnn/core_abstract.h

@@ -1348,6 +1348,40 @@ namespace dlib
- When v() is called, the first argument is always < net_type::num_computational_layers.
!*/
// ----------------------------------------------------------------------------------------
template <
typename net_type,
typename visitor
>
void visit_layer_parameter_gradients(
net_type& net,
visitor v
);
/*!
requires
- net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
add_tag_layer.
- v is a function object with a signature equivalent to:
v(size_t idx, tensor& t)
ensures
- Loops over all the computational layers (i.e. layers with parameters, as
opposed to loss, tag, or input layers) in net and passes their parameter
gradients to v(). To be specific, this function essentially performs the
following:
size_t computational_layer_idx = 0;
for (size_t i = 0; i < net_type::num_layers; ++i)
{
if (layer<i>(net) is a computational layer)
{
v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
++computational_layer_idx;
}
}
- When v() is called, the first argument is always < net_type::num_computational_layers.
!*/
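A hedged usage sketch of the routine documented above: the toy net_type and the idea of printing gradient norms are assumptions for illustration; only visit_layer_parameter_gradients() and its visitor signature come from this spec.

```cpp
#include <dlib/dnn.h>
#include <iostream>
#include <cmath>

using namespace dlib;

// Hypothetical toy network; any dlib DNN would do here.
using net_type = loss_multiclass_log<fc<10, relu<fc<50, input<matrix<float>>>>>>;

int main()
{
    net_type net;
    // ... after a forward/backward pass (e.g. compute_parameter_gradients()
    // inside a trainer) the parameter gradient tensors are populated ...

    visit_layer_parameter_gradients(net, [](size_t idx, tensor& t)
    {
        // Gradients that haven't been allocated yet show up as empty tensors.
        if (t.size() != 0)
            std::cout << "computational layer " << idx
                      << " gradient L2 norm: "
                      << std::sqrt(sum(squared(mat(t)))) << std::endl;
    });
}
```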
// ----------------------------------------------------------------------------------------
struct layer_test_results

dlib/dnn/trainer.h

@@ -501,9 +501,12 @@ namespace dlib
std::vector<std::future<double>> losses(devices.size());
std::vector<std::future<void>> update_futs(devices.size());
std::vector<matrix<float>> param_buffer(net_type::num_computational_layers);
std::vector<matrix<float>> param_grad_buffer(net_type::num_computational_layers);
size_t iteration = 0;
while(job_pipe.dequeue(next_job))
{
++iteration;
// Call compute_parameter_gradients() and update_parameters() but pick the
// right version for unsupervised or supervised training based on the type
// of label_type.
@@ -517,28 +520,57 @@
// gradient updates between devices. So we do that now.
if (devices.size() > 1)
{
-for (auto&& p : param_buffer)
+for (auto&& p : param_grad_buffer)
p = 0;
// now average all the parameter gradients
for (size_t i = 0; i < devices.size(); ++i)
{
-visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
+visit_layer_parameter_gradients(devices[i]->net, [&param_grad_buffer](size_t j, tensor& t) {
if (t.size() != 0)
-param_buffer[j] += mat(t);
+param_grad_buffer[j] += mat(t);
});
}
// and then assign the parameter gradients back to all the networks
const float scale = 1.0f/devices.size();
for (size_t i = 0; i < devices.size(); ++i)
{
-visit_layer_parameters(devices[i]->net, [scale,&param_buffer](size_t j, tensor& t) {
+visit_layer_parameter_gradients(devices[i]->net, [scale,&param_grad_buffer](size_t j, tensor& t) {
if (t.size() != 0)
{
-t = param_buffer[j]*scale;
+t = param_grad_buffer[j]*scale;
t.async_copy_to_device();
}
});
}
// Every now and then force all the parameters to be the same just to
// make sure they aren't drifting apart due to any non-deterministic
// behavior on the GPU.
if (iteration%5000 == 1)
{
for (auto&& p : param_buffer)
p = 0;
// now average all the parameters
for (size_t i = 0; i < devices.size(); ++i)
{
visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
if (t.size() != 0)
param_buffer[j] += mat(t);
});
}
// and then assign the parameters back to all the networks.
const float scale = 1.0f/devices.size();
for (size_t i = 0; i < devices.size(); ++i)
{
visit_layer_parameters(devices[i]->net, [scale,&param_buffer](size_t j, tensor& t) {
if (t.size() != 0)
{
t = param_buffer[j]*scale;
t.async_copy_to_device();
}
});
}
}
}
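Putting the trainer change in one place: gradients are now averaged through visit_layer_parameter_gradients() on every iteration, while the parameters themselves are only re-averaged every 5000 iterations to stop the replicas drifting apart under non-deterministic GPU arithmetic. A simplified, dlib-free sketch of that structure follows; the device_net type and flat float storage are invented for illustration.

```cpp
#include <cstddef>
#include <vector>

// Invented stand-in: each "device" holds per-layer parameters and gradients.
struct device_net
{
    std::vector<float> params;
    std::vector<float> grads;
};

void sync_devices(std::vector<device_net>& devices, std::size_t iteration)
{
    const std::size_t num_layers = devices[0].params.size();
    const float scale = 1.0f / devices.size();

    // Every iteration: average the parameter GRADIENTS across devices so every
    // device applies the same update.  (The bug fixed above was averaging via
    // visit_layer_parameters, i.e. the parameters, at this step.)
    std::vector<float> grad_buffer(num_layers, 0.0f);
    for (auto& d : devices)
        for (std::size_t j = 0; j < num_layers; ++j)
            grad_buffer[j] += d.grads[j];
    for (auto& d : devices)
        for (std::size_t j = 0; j < num_layers; ++j)
            d.grads[j] = grad_buffer[j] * scale;

    // Every 5000 iterations: also force the PARAMETERS back to their mean so
    // accumulated floating-point differences can't make the replicas diverge.
    if (iteration % 5000 == 1)
    {
        std::vector<float> param_buffer(num_layers, 0.0f);
        for (auto& d : devices)
            for (std::size_t j = 0; j < num_layers; ++j)
                param_buffer[j] += d.params[j];
        for (auto& d : devices)
            for (std::size_t j = 0; j < num_layers; ++j)
                d.params[j] = param_buffer[j] * scale;
    }
}

int main()
{
    std::vector<device_net> devices(2);
    for (auto& d : devices)
    {
        d.params = {1.0f, 2.0f};
        d.grads  = {0.5f, 0.25f};
    }
    sync_devices(devices, 1);
}
```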