Mirror of https://github.com/davisking/dlib.git (synced 2024-11-01 10:14:53 +08:00)
Added visit_layer_parameter_gradients() and also fixed a silly synchronization
error in the multi-gpu training code.
This commit is contained in:
parent d31723ff45
commit 168574bd6c
@@ -3049,6 +3049,73 @@ namespace dlib
         impl::vlp_loop<0, net_type::num_layers>::visit(comp_i, net, v);
     }
 
+// ----------------------------------------------------------------------------------------
+
+    namespace impl
+    {
+        template <size_t i, size_t num>
+        struct vlpg_loop
+        {
+            template <typename T, typename U>
+            static typename std::enable_if<!is_add_layer<U>::value>::type invoke_functor(T&& , size_t& , U&& )
+            {
+                // intentionally left empty
+            }
+
+            template <typename T, typename U>
+            static typename std::enable_if<is_add_layer<U>::value>::type invoke_functor(T&& v, size_t& comp_i, U&& l)
+            {
+                v(comp_i, l.get_parameter_gradient());
+                ++comp_i;
+            }
+
+            template <
+                typename net_type,
+                typename visitor
+                >
+            static void visit(
+                size_t comp_i,
+                net_type& net,
+                visitor&& v
+            )
+            {
+                invoke_functor(v, comp_i, layer<i>(net));
+                vlpg_loop<i+1, num>::visit(comp_i, net, v);
+            }
+        };
+
+        template <size_t num>
+        struct vlpg_loop<num,num>
+        {
+            template <
+                typename net_type,
+                typename visitor
+                >
+            static void visit(
+                size_t,
+                net_type&,
+                visitor&&
+            )
+            {
+                // Base case of recursion.  Don't do anything.
+            }
+        };
+
+    }
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layer_parameter_gradients(
+        net_type& net,
+        visitor v
+    )
+    {
+        size_t comp_i = 0;
+        impl::vlpg_loop<0, net_type::num_layers>::visit(comp_i, net, v);
+    }
+
 // ----------------------------------------------------------------------------------------
 
 }
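For reference, a minimal sketch of how the new visitor can be used once a network's gradients have been populated by backpropagation. The tiny net_type below is purely illustrative (any dlib network built from the layer templates works); the only API assumed is the visit_layer_parameter_gradients() function added in this commit.

    #include <dlib/dnn.h>
    #include <iostream>

    using namespace dlib;

    // A small, purely illustrative network: input -> fc -> relu -> fc -> loss.
    using net_type = loss_multiclass_log<fc<10, relu<fc<50, input<matrix<float>>>>>>;

    int main()
    {
        net_type net;
        // ... run backpropagation (e.g. via dnn_trainer) so the parameter
        // gradients are populated, then inspect them per computational layer:
        visit_layer_parameter_gradients(net, [](size_t idx, tensor& grad) {
            std::cout << "computational layer " << idx
                      << " has " << grad.size() << " gradient elements\n";
        });
        return 0;
    }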
@@ -1348,6 +1348,40 @@ namespace dlib
             - When v() is called, the first argument is always < net_type::num_computational_layers.
     !*/
 
+// ----------------------------------------------------------------------------------------
+
+    template <
+        typename net_type,
+        typename visitor
+        >
+    void visit_layer_parameter_gradients(
+        net_type& net,
+        visitor v
+    );
+    /*!
+        requires
+            - net_type is an object of type add_layer, add_loss_layer, add_skip_layer, or
+              add_tag_layer.
+            - v is a function object with a signature equivalent to:
+                v(size_t idx, tensor& t)
+        ensures
+            - Loops over all the computational layers (i.e. layers with parameters, as
+              opposed to loss, tag, or input layers) in net and passes their parameter
+              gradients to v().  To be specific, this function essentially performs the
+              following:
+
+                size_t computational_layer_idx = 0;
+                for (size_t i = 0; i < net_type::num_layers; ++i)
+                {
+                    if (layer<i>(net) is a computational layer)
+                    {
+                        v(computational_layer_idx, layer<i>(net).get_parameter_gradient());
+                        ++computational_layer_idx;
+                    }
+                }
+            - When v() is called, the first argument is always < net_type::num_computational_layers.
+    !*/
+
 // ----------------------------------------------------------------------------------------
 
     struct layer_test_results
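Because the contract above guarantees that idx only counts computational layers and is always less than net_type::num_computational_layers, it can be used directly to index a per-layer buffer. A short hedged sketch of that pattern (the helper name and the idea of collecting per-layer gradient L2 norms are illustrative, not part of dlib):

    #include <dlib/dnn.h>
    #include <cmath>
    #include <vector>

    template <typename net_type>
    std::vector<double> per_layer_gradient_norms(net_type& net)
    {
        // One slot per computational layer, as guaranteed by the spec above.
        std::vector<double> grad_norms(net_type::num_computational_layers, 0.0);
        dlib::visit_layer_parameter_gradients(net, [&grad_norms](size_t idx, dlib::tensor& g) {
            if (g.size() != 0)
                grad_norms[idx] = std::sqrt(dlib::sum(dlib::squared(dlib::mat(g))));
        });
        return grad_norms;
    }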
@@ -501,9 +501,12 @@ namespace dlib
         std::vector<std::future<double>> losses(devices.size());
         std::vector<std::future<void>> update_futs(devices.size());
 
         std::vector<matrix<float>> param_buffer(net_type::num_computational_layers);
+        std::vector<matrix<float>> param_grad_buffer(net_type::num_computational_layers);
 
+        size_t iteration = 0;
         while(job_pipe.dequeue(next_job))
         {
+            ++iteration;
             // Call compute_parameter_gradients() and update_parameters() but pick the
             // right version for unsupervised or supervised training based on the type
             // of label_type.
@@ -517,28 +520,57 @@
             // gradient updates between devices.  So we do that now.
             if (devices.size() > 1)
             {
-                for (auto&& p : param_buffer)
+                for (auto&& p : param_grad_buffer)
                     p = 0;
                 // now average all the parameter gradients
                 for (size_t i = 0; i < devices.size(); ++i)
                 {
-                    visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
+                    visit_layer_parameter_gradients(devices[i]->net, [&param_grad_buffer](size_t j, tensor& t) {
                         if (t.size() != 0)
-                            param_buffer[j] += mat(t);
+                            param_grad_buffer[j] += mat(t);
                     });
                 }
                 // and then assign the parameter gradients back to all the networks
                 const float scale = 1.0f/devices.size();
                 for (size_t i = 0; i < devices.size(); ++i)
                 {
-                    visit_layer_parameters(devices[i]->net, [scale,&param_buffer](size_t j, tensor& t) {
+                    visit_layer_parameter_gradients(devices[i]->net, [scale,&param_grad_buffer](size_t j, tensor& t) {
                         if (t.size() != 0)
                         {
-                            t = param_buffer[j]*scale;
+                            t = param_grad_buffer[j]*scale;
                             t.async_copy_to_device();
                         }
                     });
                 }
+
+                // Every now and then force all the parameters to be the same just to
+                // make sure they aren't drifting apart due to any non-deterministic
+                // behavior on the GPU.
+                if (iteration%5000 == 1)
+                {
+                    for (auto&& p : param_buffer)
+                        p = 0;
+                    // now average all the parameters
+                    for (size_t i = 0; i < devices.size(); ++i)
+                    {
+                        visit_layer_parameters(devices[i]->net, [&param_buffer](size_t j, tensor& t) {
+                            if (t.size() != 0)
+                                param_buffer[j] += mat(t);
+                        });
+                    }
+                    // and then assign the parameters back to all the networks.
+                    const float scale = 1.0f/devices.size();
+                    for (size_t i = 0; i < devices.size(); ++i)
+                    {
+                        visit_layer_parameters(devices[i]->net, [scale,&param_buffer](size_t j, tensor& t) {
+                            if (t.size() != 0)
+                            {
+                                t = param_buffer[j]*scale;
+                                t.async_copy_to_device();
+                            }
+                        });
+                    }
+                }
             }
 
 
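The substance of the fix: the old loop re-averaged the networks' parameters after every step, whereas the corrected loop averages the parameter gradients every step (via the new visit_layer_parameter_gradients() visitor) and only every 5000 iterations forces the parameters themselves back into agreement, guarding against drift from non-deterministic GPU arithmetic. A standalone sketch of the accumulate-then-rescale pattern used in both passes, with plain std::vector buffers standing in for the per-device tensors (an illustration only, not the trainer's actual data structures):

    #include <cstddef>
    #include <vector>

    // Average one buffer across devices and write the result back to every
    // device, mirroring the scale = 1/devices.size() logic in the diff above.
    void average_across_devices(std::vector<std::vector<float>>& per_device_buf)
    {
        if (per_device_buf.size() < 2)
            return;
        const float scale = 1.0f / per_device_buf.size();
        std::vector<float> sum(per_device_buf[0].size(), 0.0f);
        for (const auto& b : per_device_buf)        // accumulate every device's copy
            for (size_t j = 0; j < sum.size(); ++j)
                sum[j] += b[j];
        for (auto& b : per_device_buf)              // broadcast the average back
            for (size_t j = 0; j < sum.size(); ++j)
                b[j] = sum[j] * scale;
    }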