Mirror of https://github.com/davisking/dlib.git, synced 2024-11-01 10:14:53 +08:00
To avoid a GPU memory leak, allow passing thread pools to dnn_trainer from outside (#2027)
* Problem: The CUDA runtime allocates resources for each thread, and apparently those resources are not freed when the corresponding threads terminate. Therefore, each instantiation of dnn_trainer leaks a bit of GPU memory.

  Solution: Add the possibility to pass thread pools in from outside. This way, subsequent dnn_trainer instances can reuse the same threads, and there is no memory leak.

* Add helpful comments
This commit is contained in:
parent 6fc503d242, commit 74123841bb
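As a usage sketch of the new parameter (not part of this commit; the toy network, the `pools` name, and the loop shape are illustrative assumptions), a long-running program that repeatedly creates trainers could share one thread pools vector across all of them:

    #include <dlib/dnn.h>

    using namespace dlib;

    // Hypothetical toy network; any dlib network type works the same way.
    using net_type = loss_multiclass_log<fc<10, relu<fc<84, input<matrix<float>>>>>>;

    int main()
    {
        // One shared vector of thread pools, kept alive for the whole process.
        // dnn_trainer<net_type>::threads is std::vector<std::shared_ptr<thread_pool>>.
        auto pools = std::make_shared<dnn_trainer<net_type>::threads>();

        for (int round = 0; round < 100; ++round)
        {
            net_type net;
            // Every trainer reuses the same threads, so the CUDA runtime
            // allocates its per-thread resources only once instead of
            // leaking a little GPU memory per trainer instance.
            dnn_trainer<net_type> trainer(net, sgd(), {}, pools);
            trainer.set_learning_rate(0.01);
            // ... load data and call trainer.train_one_step(...) as usual ...
        }
    }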
@@ -75,6 +75,7 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;
         const static size_t num_layers = net_type::num_layers;
+        using threads = std::vector<std::shared_ptr<thread_pool>>;
     private:
         typedef impl::dnn_job_t<training_label_type> job_t;
     public:
@@ -104,8 +105,9 @@ namespace dlib
         dnn_trainer(
             net_type& net_,
             const solver_type& solver_,
-            const std::vector<int>& cuda_extra_devices
-        ) : job_pipe(0), net(net_)
+            const std::vector<int>& cuda_extra_devices,
+            std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
+        ) : job_pipe(0), net(net_), thread_pools(thread_pools_)
         {
             devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
@@ -667,6 +669,14 @@ namespace dlib
             std::vector<tensor*> reference_params;
             visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });

+            // If no external thread pools vector was passed, then create one that will
+            // be automatically destructed as soon as the dnn_trainer object goes out of
+            // scope.
+            if (!thread_pools)
+                thread_pools = std::make_shared<threads>();
+
+            auto& tp = *thread_pools;
+
             // We make separate thread pools with just one thread in them because we want
             // to make sure each device is always executed on the same thread.  We care
             // about this because there are thread_local context variables for some cuda
@@ -674,8 +684,7 @@ namespace dlib
             // So if we make sure the same device always uses the same thread this will
             // reduce the number of contexts we allocate from num_devices*num_devices to
             // just num_devices.
-            std::vector<std::shared_ptr<thread_pool>> tp;
-            for (size_t i = 0; i < devices.size(); ++i)
+            while (tp.size() < devices.size())
                 tp.push_back(std::make_shared<thread_pool>(1));
@@ -1274,6 +1283,7 @@ namespace dlib

         std::vector<std::shared_ptr<device_data>> devices;
         dlib::pipe<job_t> job_pipe;
+        std::shared_ptr<threads> thread_pools;
         job_t job;
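The change above relies on the fact that a dlib::thread_pool runs every task it is given on its own fixed worker threads. A minimal standalone sketch of that pinning idea (my illustration, independent of the trainer, using only dlib's documented thread_pool API):

    #include <dlib/threads.h>
    #include <iostream>
    #include <thread>

    int main()
    {
        // One worker thread per pool, just like the trainer uses per device.
        dlib::thread_pool pool(1);

        for (int i = 0; i < 3; ++i)
        {
            // Every task added to this pool runs on the same single worker
            // thread, so thread_local state (such as a CUDA context) is
            // created once and then reused.
            pool.add_task_by_value([]()
            {
                std::cout << "task on thread " << std::this_thread::get_id() << std::endl;
            });
        }
        pool.wait_for_all_tasks();
    }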
@@ -58,6 +58,8 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;

+        using threads = std::vector<std::shared_ptr<thread_pool>>;
+
         dnn_trainer() = delete;
         dnn_trainer(const dnn_trainer&) = delete;
         dnn_trainer& operator=(const dnn_trainer&) = delete;
@@ -65,7 +67,8 @@ namespace dlib
         dnn_trainer(
             net_type& net,
             const solver_type& solver = solver_type(),
-            const std::vector<int>& cuda_extra_devices = {}
+            const std::vector<int>& cuda_extra_devices = {},
+            std::shared_ptr<threads> thread_pools = std::shared_ptr<threads>()
         );
         /*!
             requires
@@ -96,6 +99,19 @@ namespace dlib
                  cudaGetDevice()).  In addition, you can ask to use additional
                  devices, which you do by putting their device numbers into
                  cuda_extra_devices.
+                - if (thread_pools.get() != nullptr) then
+                    - Any new threads spun within the trainer will execute within the
+                      passed thread pools vector.  This means that the same threads can
+                      be re-used across different dnn_trainer instances.  Otherwise, the
+                      CUDA runtime may leak memory.  This, however, is relevant only if
+                      your program is going to instantiate a large number of trainers
+                      and generally stay up and running for a very long time.  If not,
+                      then you need not worry about this.
+                      NB: Any particular thread pools vector should be passed to at
+                          most one trainer instance at a time.
+                      NB: The mentioned leak is not caused by anything dlib does or
+                          fails to do.  Instead, it is a limitation of the CUDA
+                          runtime that dlib has no control over.
        !*/

        net_type& get_net (
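To make the "at most one trainer at a time" requirement concrete, here is a hedged sketch (the scoping pattern and net_type are illustrative, not from the commit) where each trainer is destroyed before the next one borrows the same pools:

    #include <dlib/dnn.h>

    using namespace dlib;
    using net_type = loss_multiclass_log<fc<2, input<matrix<float>>>>;

    int main()
    {
        auto pools = std::make_shared<dnn_trainer<net_type>::threads>();
        net_type net1, net2;
        {
            dnn_trainer<net_type> t1(net1, sgd(), {}, pools);
            // ... train with t1 ...
        }   // t1 is destroyed here; pools and their threads live on.
        {
            // Safe: t1 no longer exists, so t2 may reuse the same threads.
            dnn_trainer<net_type> t2(net2, sgd(), {}, pools);
            // ... train with t2 ...
        }
    }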