To avoid a GPU memory leak, allow passing thread pools to dnn_trainer from outside (#2027)

* Problem: The CUDA runtime allocates resources for each thread, and apparently those resources are not freed when the corresponding threads terminate. Therefore, each instantiation of dnn_trainer leaks a bit of GPU memory.

Solution: Add the possibility to pass thread pools from outside. This way, subsequent dnn_trainer instances can use the same threads, and there's no memory leak.

* Add helpful comments
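
For context, here is a minimal caller-side sketch of how the new parameter might be used. The toy network, the synthetic data, and the loop count are illustrative only and are not part of this commit; the only piece taken from the new API is the shared thread pools vector passed as the trainer's fourth constructor argument.

#include <dlib/dnn.h>

using namespace dlib;

// Toy network, purely for illustration.
using toy_net = loss_multiclass_log<fc<2, relu<fc<8, input<matrix<float>>>>>>;

int main()
{
    // One shared vector of thread pools, reused by every trainer created below,
    // so the CUDA runtime keeps reusing the same threads instead of allocating
    // fresh per-thread resources for each dnn_trainer instance.
    auto pools = std::make_shared<std::vector<std::shared_ptr<thread_pool>>>();

    // Tiny synthetic dataset, just so train_one_step() has something to do.
    matrix<float> sample = ones_matrix<float>(4, 1);
    std::vector<matrix<float>> samples(16, sample);
    std::vector<unsigned long> labels(16, 0);

    for (int run = 0; run < 1000; ++run)
    {
        toy_net net;
        // Passing the same pools to every trainer is what avoids the leak.
        dnn_trainer<toy_net> trainer(net, sgd(), {}, pools);
        trainer.train_one_step(samples, labels);
    }
    return 0;
}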
Juha Reunanen 2020-03-19 13:38:43 +02:00 committed by GitHub
parent 6fc503d242
commit 74123841bb
2 changed files with 31 additions and 5 deletions

dlib/dnn/trainer.h

@@ -75,6 +75,7 @@ namespace dlib
typedef typename net_type::input_type input_type;
const static size_t num_computational_layers = net_type::num_computational_layers;
const static size_t num_layers = net_type::num_layers;
using threads = std::vector<std::shared_ptr<thread_pool>>;
private:
typedef impl::dnn_job_t<training_label_type> job_t;
public:
@@ -104,8 +105,9 @@ namespace dlib
dnn_trainer(
net_type& net_,
const solver_type& solver_,
const std::vector<int>& cuda_extra_devices
) : job_pipe(0), net(net_)
const std::vector<int>& cuda_extra_devices,
std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
) : job_pipe(0), net(net_), thread_pools(thread_pools_)
{
devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
@@ -667,6 +669,14 @@ namespace dlib
std::vector<tensor*> reference_params;
visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });
// If no external thread pools vector was passed, then create one that will
// be automatically destructed as soon as the dnn_trainer object goes out of
// scope.
if (!thread_pools)
thread_pools = std::make_shared<threads>();
auto& tp = *thread_pools;
// We make separate thread pools with just one thread in them because we want
// to make sure each device is always executed on the same thread. We care
// about this because there are thread_local context variables for some cuda
@@ -674,8 +684,7 @@
// So if we make sure the same device always uses the same thread this will
// reduce the number of contexts we allocate from num_devices*num_devices to
// just num_devices.
std::vector<std::shared_ptr<thread_pool>> tp;
for (size_t i = 0; i < devices.size(); ++i)
while (tp.size() < devices.size())
tp.push_back(std::make_shared<thread_pool>(1));
@@ -1274,6 +1283,7 @@ namespace dlib
std::vector<std::shared_ptr<device_data>> devices;
dlib::pipe<job_t> job_pipe;
std::shared_ptr<threads> thread_pools;
job_t job;
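
As an aside, the comment in the hunk above relies on a property of dlib::thread_pool that is easy to demonstrate in isolation: a pool created with a single thread runs every task it is given on that same thread, so any thread_local state (such as per-thread CUDA contexts) is set up once per pool rather than once per calling thread. A tiny standalone sketch, not part of this diff:

#include <dlib/threads.h>
#include <iostream>

int main()
{
    dlib::thread_pool tp(1);  // exactly one worker thread

    // Both tasks execute on the same worker thread, so thread_local state
    // touched inside them is created only once.
    tp.add_task_by_value([]() { std::cout << "task 1\n"; });
    tp.add_task_by_value([]() { std::cout << "task 2\n"; });
    tp.wait_for_all_tasks();
    return 0;
}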

dlib/dnn/trainer_abstract.h

@@ -58,6 +58,8 @@ namespace dlib
typedef typename net_type::input_type input_type;
const static size_t num_computational_layers = net_type::num_computational_layers;
using threads = std::vector<std::shared_ptr<thread_pool>>;
dnn_trainer() = delete;
dnn_trainer(const dnn_trainer&) = delete;
dnn_trainer& operator=(const dnn_trainer&) = delete;
@@ -65,7 +67,8 @@ namespace dlib
dnn_trainer(
net_type& net,
const solver_type& solver = solver_type(),
const std::vector<int>& cuda_extra_devices = {}
const std::vector<int>& cuda_extra_devices = {},
std::shared_ptr<threads> thread_pools = std::shared_ptr<threads>()
);
/*!
requires
@@ -96,6 +99,19 @@ namespace dlib
cudaGetDevice()). In addition, you can ask to use additional
devices, which you do by putting their device numbers into
cuda_extra_devices.
- if (thread_pools.get() != nullptr) then
- Any new threads spun up by the trainer will execute within the
given thread pools vector. This means that the same threads can be
re-used across different dnn_trainer instances. Otherwise, the
CUDA runtime may leak memory. This, however, is relevant only if
your program is going to instantiate a large number of trainers
and generally stay up and running for a very long time. If not,
then you need not worry about this.
NB: Any particular thread pools vector should be passed to at most
one trainer instance at a time.
NB: The leak mentioned above isn't caused by anything dlib does or
doesn't do. Instead, it is a limitation of the CUDA runtime that
dlib has no control over.
!*/
net_type& get_net (