From 45dd580bf4d2f267cd1d5d16ca6f556288eda3e1 Mon Sep 17 00:00:00 2001
From: Davis King <davis@dlib.net>
Date: Wed, 2 Nov 2016 10:06:36 -0400
Subject: [PATCH] Wrote replacements for set_tensor() and scale_tensor() since
 the previous versions were calling into cuDNN; however, the cuDNN functions
 for doing this are horrifically slow, well over 100x slower than they should
 be, which is surprising since these functions are so trivial.

---
 dlib/dnn/cuda_dlib.cu      | 32 ++++++++++++++++++++++++++++++++
 dlib/dnn/cuda_dlib.h       | 12 ++++++++++++
 dlib/dnn/cudnn_dlibapi.cpp | 26 --------------------------
 dlib/dnn/cudnn_dlibapi.h   | 20 --------------------
 dlib/dnn/tensor.h          | 16 ++++++++++++++++
 dlib/test/dnn.cpp          |  8 ++++++++
 6 files changed, 68 insertions(+), 46 deletions(-)

diff --git a/dlib/dnn/cuda_dlib.cu b/dlib/dnn/cuda_dlib.cu
index e9f546bb6..36d7edc71 100644
--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@@ -864,6 +864,38 @@ namespace dlib
             launch_kernel(_add_bias_gradient,max_jobs(grad.size()),grad.device(), gradient_input.device(), grad.size(), gradient_input.size());
         }
 
+    // ----------------------------------------------------------------------------------------
+
+        // Kernel: writes val to out[i] for each i yielded by grid_stride_range(0, n).
+        __global__ void _set_tensor(float* out, size_t n, const float val)
+        {
+            for (auto idx : grid_stride_range(0, n)) out[idx] = val;
+        }
+
+        // Sets every element of t equal to value.
+        void set_tensor (tensor& t, float value)
+        {
+            // The kernel overwrites every element and reads none, so request the
+            // write-only device pointer (the cuDNN-based set_tensor did the same).
+            launch_kernel(_set_tensor, max_jobs(t.size()), t.device_write_only(), t.size(), value);
+        }
+
+    // ----------------------------------------------------------------------------------------
+
+        // Kernel: multiplies out[i] by val for each i yielded by grid_stride_range(0, n).
+        __global__ void _scale_tensor(float* out, size_t n, const float val)
+        {
+            for (auto idx : grid_stride_range(0, n)) out[idx] = out[idx]*val;
+        }
+
+        // In-place scaling: every element E of t becomes E*value.
+        void scale_tensor (tensor& t, float value)
+        {
+            const auto num = t.size();
+            // t.device() rather than a write-only pointer: the kernel reads t's data.
+            launch_kernel(_scale_tensor, max_jobs(num), t.device(), num, value);
+        }
+
     // -----------------------------------------------------------------------------------
     // -----------------------------------------------------------------------------------
 
diff --git a/dlib/dnn/cuda_dlib.h b/dlib/dnn/cuda_dlib.h
index dd0690417..75c418d6d 100644
--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@@ -141,6 +141,18 @@ namespace dlib
             const tensor& v2
         );
 
+    // ------------------------------------------------------------------------------------
+
+        void set_tensor (tensor& t, float value);
+        /*!
+            ensures: sets all elements in t equal to value.
+        !*/
+
+        void scale_tensor (tensor& t, float value);
+        /*!
+            ensures: for every element E in t, performs E = E*value.
+        !*/
+
     // ------------------------------------------------------------------------------------
 
         void multiply (
diff --git a/dlib/dnn/cudnn_dlibapi.cpp b/dlib/dnn/cudnn_dlibapi.cpp
index 151d9c2ce..93d542500 100644
--- a/dlib/dnn/cudnn_dlibapi.cpp
+++ b/dlib/dnn/cudnn_dlibapi.cpp
@@ -289,32 +289,6 @@ namespace dlib
                                     dest.device()));
         }
 
-        void set_tensor (
-            tensor& t,
-            float value
-        )
-        {
-            if (t.size() == 0)
-                return;
-            CHECK_CUDNN(cudnnSetTensor(context(),
-                                 descriptor(t),
-                                 t.device_write_only(),
-                                 &value));
-        }
-
-        void scale_tensor (
-            tensor& t,
-            float value
-        )
-        {
-            if (t.size() == 0)
-                return;
-            CHECK_CUDNN(cudnnScaleTensor(context(),
-                                   descriptor(t),
-                                   t.device(),
-                                   &value));
-        }
-
         void assign_conv_bias_gradient (
             tensor& grad,
             const tensor& gradient_input
diff --git a/dlib/dnn/cudnn_dlibapi.h b/dlib/dnn/cudnn_dlibapi.h
index 3e3aa3921..a4769c344 100644
--- a/dlib/dnn/cudnn_dlibapi.h
+++ b/dlib/dnn/cudnn_dlibapi.h
@@ -89,26 +89,6 @@ namespace dlib
                   add into the dest tensor.
         !*/
 
-        void set_tensor (
-            tensor& t,
-            float value
-        );
-        /*!
-            ensures
-                - sets all elements in t equal to value.
-        !*/
-
-        void scale_tensor (
-            tensor& t,
-            float value
-        );
-        /*!
-            ensures
-                - scales all elements of t by the given value.  I.e. for all elements E in
-                  t, this function performs:
-                    - E = E*value
-        !*/
-
     // ------------------------------------------------------------------------------------
 
         void assign_conv_bias_gradient (
diff --git a/dlib/dnn/tensor.h b/dlib/dnn/tensor.h
index 46d35c601..5b842ba98 100644
--- a/dlib/dnn/tensor.h
+++ b/dlib/dnn/tensor.h
@@ -14,6 +14,22 @@
 
 namespace dlib
 {
+
+// ----------------------------------------------------------------------------------------
+
+    class tensor; // forward declaration; lets the signatures below name tensor
+    namespace cuda
+    {
+        // Declared here, ahead of the tensor class definition below.
+        // NOTE(review): presumably so tensor's own members can call these - confirm.
+
+        // Sets all elements in t equal to value.
+        void set_tensor (tensor& t, float value);
+
+        // Scales all elements of t by value, i.e. E = E*value for each element E.
+        void scale_tensor (tensor& t, float value);
+    }
+
 // ----------------------------------------------------------------------------------------
 
     class tensor
diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index 9161b6631..e4dda2599 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -339,6 +339,14 @@ namespace
         dlog << LINFO << mat(dest);
         matrix<float> truth1(3,4), truth2(3,4);
 
+        truth1 = 2;
+        DLIB_TEST(max(abs(truth1-mat(src))) < 1e-5);
+        src *= 2;
+        truth1 = 4;
+        DLIB_TEST(max(abs(truth1-mat(src))) < 1e-5);
+        src = 2;
+
+
         truth1 = 7;
         truth2 = 7, 10,  7,  7,
         7, 10,  7,  7,