diff --git a/dlib/dnn/trainer.h b/dlib/dnn/trainer.h
index eb96af8dd..401f36b7c 100644
--- a/dlib/dnn/trainer.h
+++ b/dlib/dnn/trainer.h
@@ -75,6 +75,7 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;
         const static size_t num_layers = net_type::num_layers;
+        using threads = std::vector<std::shared_ptr<thread_pool>>;
     private:
         typedef impl::dnn_job_t<training_label_type> job_t;
     public:
@@ -104,8 +105,9 @@ namespace dlib
         dnn_trainer(
             net_type& net_,
             const solver_type& solver_,
-            const std::vector<int>& cuda_extra_devices
-        ) : job_pipe(0), net(net_)
+            const std::vector<int>& cuda_extra_devices,
+            std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
+        ) : job_pipe(0), net(net_), thread_pools(thread_pools_)
         {
             devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
@@ -667,6 +669,14 @@ namespace dlib
             std::vector<tensor*> reference_params;
             visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });
 
+            // If no external thread pools vector was passed, then create one that will
+            // be automatically destructed as soon as the dnn_trainer object goes out of
+            // scope.
+            if (!thread_pools)
+                thread_pools = std::make_shared<threads>();
+
+            auto& tp = *thread_pools;
+
             // We make separate thread pools with just one thread in them because we want
             // to make sure each device is always executed on the same thread.  We care
             // about this because there are thread_local context variables for some cuda
@@ -674,8 +684,7 @@ namespace dlib
             // So if we make sure the same device always uses the same thread this will
             // reduce the number of contexts we allocate from num_devices*num_devices to
             // just num_devices.
-            std::vector<std::shared_ptr<thread_pool>> tp;
-            for (size_t i = 0; i < devices.size(); ++i)
+            while (tp.size() < devices.size())
                 tp.push_back(std::make_shared<thread_pool>(1));
 
@@ -1274,6 +1283,7 @@ namespace dlib
 
         std::vector<std::shared_ptr<device_data>> devices;
         dlib::pipe<job_t> job_pipe;
+        std::shared_ptr<threads> thread_pools;
         job_t job;
 
diff --git a/dlib/dnn/trainer_abstract.h b/dlib/dnn/trainer_abstract.h
index 3bfb6dc99..7d3c1e5f1 100644
--- a/dlib/dnn/trainer_abstract.h
+++ b/dlib/dnn/trainer_abstract.h
@@ -58,6 +58,8 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;
 
+        using threads = std::vector<std::shared_ptr<thread_pool>>;
+
         dnn_trainer() = delete;
         dnn_trainer(const dnn_trainer&) = delete;
         dnn_trainer& operator=(const dnn_trainer&) = delete;
@@ -65,7 +67,8 @@ namespace dlib
         dnn_trainer(
             net_type& net,
             const solver_type& solver = solver_type(),
-            const std::vector<int>& cuda_extra_devices = {}
+            const std::vector<int>& cuda_extra_devices = {},
+            std::shared_ptr<threads> thread_pools = std::shared_ptr<threads>()
         );
         /*!
             requires
@@ -96,6 +99,19 @@ namespace dlib
                   cudaGetDevice()).  In addition, you can ask to use additional
                   devices, which you do by putting their device numbers into
                   cuda_extra_devices.
+                - if (thread_pools.get() != nullptr) then
+                    - Any new threads spawned within the trainer will execute within
+                      the passed thread pools vector.  This means that the same
+                      threads can be re-used across different dnn_trainer instances.
+                      Otherwise, the CUDA runtime may leak memory.  This, however, is
+                      relevant only if your program is going to instantiate a large
+                      number of trainers and generally stay up and running for a very
+                      long time.  If not, then you need not worry about this.
+                      NB: Any particular thread pools vector should be passed to at
+                      most one trainer instance at a time.
+                      NB: The mentioned leak is not caused by anything dlib does or
+                      fails to do.  Instead, it is a limitation of the CUDA runtime
+                      that dlib has no control over.
         !*/

         net_type& get_net (
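For illustration, a minimal usage sketch of the new thread_pools parameter follows. It is not part of the patch: the toy network, the training data, and the train_many_models helper are hypothetical placeholders; only the dnn_trainer::threads typedef and the four-argument constructor come from the diff above.

#include <dlib/dnn.h>

using namespace dlib;

// Hypothetical toy network, only here to make the sketch self-contained.
using net_type = loss_multiclass_log<fc<10, input<matrix<float>>>>;

void train_many_models(
    const std::vector<matrix<float>>& samples,
    const std::vector<unsigned long>& labels
)
{
    // One shared thread pools vector, handed to every trainer below, so all
    // trainers re-use the same threads (and hence the same thread_local CUDA
    // contexts) instead of spawning fresh ones per trainer.
    auto pools = std::make_shared<dnn_trainer<net_type>::threads>();

    for (int i = 0; i < 100; ++i)
    {
        net_type net;
        // Default sgd solver, no extra CUDA devices, shared thread pools.
        // Per the spec above, a pools vector goes to at most one trainer at
        // a time; here each trainer is destroyed before the next is created.
        dnn_trainer<net_type> trainer(net, sgd(), {}, pools);
        trainer.train(samples, labels);
    }
}

Without the shared pools, each loop iteration would create (and on some CUDA versions leak resources for) a fresh set of device threads; with them, the long-running process keeps a fixed number of CUDA thread contexts.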