Mirror of https://github.com/davisking/dlib.git, synced 2024-11-01 10:14:53 +08:00
To avoid a GPU memory leak, allow passing thread pools to dnn_trainer from outside (#2027)
* Problem: The CUDA runtime allocates resources for each thread, and apparently those resources are not freed when the corresponding threads terminate. Therefore, each instantiation of dnn_trainer leaks a bit of GPU memory.

  Solution: Add the possibility to pass thread pools in from outside. This way, subsequent dnn_trainer instances can reuse the same threads, and there is no memory leak.

* Add helpful comments
This commit is contained in:
parent 6fc503d242, commit 74123841bb
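As a usage sketch of the new parameter (not part of this commit; the toy network, the `pools` name, and the loop shape are illustrative assumptions), a long-running program that repeatedly creates trainers could share one thread pools vector across all of them:

    #include <dlib/dnn.h>

    using namespace dlib;

    // Hypothetical toy network; any dlib network type works the same way.
    using net_type = loss_multiclass_log<fc<10, relu<fc<84, input<matrix<float>>>>>>;

    int main()
    {
        // One shared vector of thread pools, kept alive for the whole process.
        // dnn_trainer<net_type>::threads is std::vector<std::shared_ptr<thread_pool>>.
        auto pools = std::make_shared<dnn_trainer<net_type>::threads>();

        for (int round = 0; round < 100; ++round)
        {
            net_type net;
            // Every trainer reuses the same threads, so the CUDA runtime
            // allocates its per-thread resources only once instead of
            // leaking a little GPU memory per trainer instance.
            dnn_trainer<net_type> trainer(net, sgd(), {}, pools);
            trainer.set_learning_rate(0.01);
            // ... load data and call trainer.train_one_step(...) as usual ...
        }
    }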
@@ -75,6 +75,7 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;
         const static size_t num_layers = net_type::num_layers;
+        using threads = std::vector<std::shared_ptr<thread_pool>>;
     private:
         typedef impl::dnn_job_t<training_label_type> job_t;
     public:
@@ -104,8 +105,9 @@ namespace dlib
         dnn_trainer(
             net_type& net_,
             const solver_type& solver_,
-            const std::vector<int>& cuda_extra_devices
-        ) : job_pipe(0), net(net_)
+            const std::vector<int>& cuda_extra_devices,
+            std::shared_ptr<threads> thread_pools_ = std::shared_ptr<threads>()
+        ) : job_pipe(0), net(net_), thread_pools(thread_pools_)
         {
             devices.push_back(std::make_shared<device_data>(dlib::cuda::get_device(), net, solver_));
@@ -667,6 +669,14 @@ namespace dlib
             std::vector<tensor*> reference_params;
             visit_layer_parameters(devices[0]->net, [&](size_t, tensor& t) { reference_params.push_back(&t); });

+            // If no external thread pools vector was passed, then create one that will
+            // be automatically destructed as soon as the dnn_trainer object goes out of
+            // scope.
+            if (!thread_pools)
+                thread_pools = std::make_shared<threads>();
+
+            auto& tp = *thread_pools;
+
             // We make separate thread pools with just one thread in them because we want
             // to make sure each device is always executed on the same thread.  We care
             // about this because there are thread_local context variables for some cuda
@@ -674,8 +684,7 @@ namespace dlib
             // So if we make sure the same device always uses the same thread this will
             // reduce the number of contexts we allocate from num_devices*num_devices to
             // just num_devices.
-            std::vector<std::shared_ptr<thread_pool>> tp;
-            for (size_t i = 0; i < devices.size(); ++i)
+            while (tp.size() < devices.size())
                 tp.push_back(std::make_shared<thread_pool>(1));
@@ -1274,6 +1283,7 @@ namespace dlib

         std::vector<std::shared_ptr<device_data>> devices;
         dlib::pipe<job_t> job_pipe;
+        std::shared_ptr<threads> thread_pools;
         job_t job;
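The change above relies on the fact that a dlib::thread_pool runs every task it is given on its own fixed worker threads. A minimal standalone sketch of that pinning idea (my illustration, independent of the trainer, using only dlib's documented thread_pool API):

    #include <dlib/threads.h>
    #include <iostream>
    #include <thread>

    int main()
    {
        // One worker thread per pool, just like the trainer uses per device.
        dlib::thread_pool pool(1);

        for (int i = 0; i < 3; ++i)
        {
            // Every task added to this pool runs on the same single worker
            // thread, so thread_local state (such as a CUDA context) is
            // created once and then reused.
            pool.add_task_by_value([]()
            {
                std::cout << "task on thread " << std::this_thread::get_id() << std::endl;
            });
        }
        pool.wait_for_all_tasks();
    }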
@@ -58,6 +58,8 @@ namespace dlib
         typedef typename net_type::input_type input_type;
         const static size_t num_computational_layers = net_type::num_computational_layers;

+        using threads = std::vector<std::shared_ptr<thread_pool>>;
+
         dnn_trainer() = delete;
         dnn_trainer(const dnn_trainer&) = delete;
         dnn_trainer& operator=(const dnn_trainer&) = delete;
@@ -65,7 +67,8 @@ namespace dlib
         dnn_trainer(
             net_type& net,
             const solver_type& solver = solver_type(),
-            const std::vector<int>& cuda_extra_devices = {}
+            const std::vector<int>& cuda_extra_devices = {},
+            std::shared_ptr<threads> thread_pools = std::shared_ptr<threads>()
         );
         /*!
             requires
@@ -96,6 +99,19 @@ namespace dlib
                  cudaGetDevice()).  In addition, you can ask to use additional
                  devices, which you do by putting their device numbers into
                  cuda_extra_devices.
+                - if (thread_pools.get() != nullptr) then
+                    - Any new threads spun within the trainer will execute within the
+                      passed thread pools vector.  This means that the same threads can
+                      be re-used across different dnn_trainer instances.  Otherwise, the
+                      CUDA runtime may leak memory.  This, however, is relevant only if
+                      your program is going to instantiate a large number of trainers
+                      and generally stay up and running for a very long time.  If not,
+                      then you need not worry about this.
+                      NB: Any particular thread pools vector should be passed to at
+                          most one trainer instance at a time.
+                      NB: The mentioned leak is not caused by anything dlib does or
+                          fails to do.  Instead, it is a limitation of the CUDA
+                          runtime that dlib has no control over.
        !*/

        net_type& get_net (
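To make the "at most one trainer at a time" requirement concrete, here is a hedged sketch (the scoping pattern and net_type are illustrative, not from the commit) where each trainer is destroyed before the next one borrows the same pools:

    #include <dlib/dnn.h>

    using namespace dlib;
    using net_type = loss_multiclass_log<fc<2, input<matrix<float>>>>;

    int main()
    {
        auto pools = std::make_shared<dnn_trainer<net_type>::threads>();
        net_type net1, net2;
        {
            dnn_trainer<net_type> t1(net1, sgd(), {}, pools);
            // ... train with t1 ...
        }   // t1 is destroyed here; pools and their threads live on.
        {
            // Safe: t1 no longer exists, so t2 may reuse the same threads.
            dnn_trainer<net_type> t2(net2, sgd(), {}, pools);
            // ... train with t2 ...
        }
    }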