Added affine_transform_range() and another overload of affine_transform()

2024-11-01 10:14:53 +08:00 · 2016-05-21 23:22:26 -04:00 · 2016-05-21 23:22:26 -04:00 · c3a74c7c1c
commit c3a74c7c1c
parent 15b2d7b5d8
6 changed files with 164 additions and 1 deletions
--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@ -385,6 +385,30 @@ namespace dlib
                d[i] = A*s1[i] + B*s2[i] + C*s3[i] + D;
        }

+        void affine_transform_range(
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        )
+        {
+            DLIB_CASSERT(dest.size()==src1.size(),"");
+            DLIB_CASSERT(dest.size()==src2.size(),"");
+            DLIB_CASSERT(dest.size()==src3.size(),"");
+            DLIB_CASSERT(begin <= end && end <= dest.size(),"");
+            const auto d = dest.host();
+            const auto s1 = src1.host();
+            const auto s2 = src2.host();
+            const auto s3 = src3.host();
+            for (size_t i = begin; i < end; ++i)
+                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+        }
+
    // -----------------------------------------------------------------------------------

        void affine_transform(
--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@ -81,6 +81,18 @@ namespace dlib
            const float D
        );

+        void affine_transform_range(
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
+
    // -----------------------------------------------------------------------------------

        void affine_transform(
--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@ -504,6 +504,40 @@ namespace dlib
                src2.device(), src3.device(), dest.size(), A, B, C, D);
        }

+    // ----------------------------------------------------------------------------------------
+
+        __global__ void _cuda_affine_transform_range(
+            float* d, const float* s1, const float* s2, const float* s3, size_t begin, size_t end, float A, float B, float C
+        )
+        {
+            for (auto i : grid_stride_range(begin, end))
+            {
+                d[i] = A*s1[i] + B*s2[i] + C*s3[i];
+            }
+        }
+
+
+        void affine_transform_range(
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        )
+        {
+            DLIB_CASSERT(dest.size()==src1.size(),"");
+            DLIB_CASSERT(dest.size()==src2.size(),"");
+            DLIB_CASSERT(dest.size()==src3.size(),"");
+            DLIB_CASSERT(begin <= end && end <= dest.size(),"");
+            launch_kernel(_cuda_affine_transform_range,max_jobs(end-begin),
+                dest.device(), src1.device(),
+                src2.device(), src3.device(), begin, end, A, B, C);
+        }
+
    // -----------------------------------------------------------------------------------

        __global__ void _cuda_affine_transform2(float* d, const float* s, size_t n, const float* A, const float* B)
--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@ -164,6 +164,18 @@ namespace dlib
            const float D
        );

+        void affine_transform_range(
+            size_t begin,
+            size_t end,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2,
+            const tensor& src3,
+            const float A,
+            const float B,
+            const float C
+        );
+
        // Note that this function isn't in the tt:: namespace because add_scaled() is
        // called by cuda::add() so we don't need a tt:: version of add_scaled().  
        void add_scaled(
--- a/dlib/dnn/tensor_tools.cpp
+++ b/dlib/dnn/tensor_tools.cpp
@ -240,6 +240,42 @@ namespace dlib { namespace tt
 #endif
    }

+    void affine_transform_range(
+        size_t begin,
+        size_t end,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#else
+        cpu::affine_transform_range(begin, end, dest,src1,src2,src3,A,B,C);
+#endif
+    }
+
+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#else
+        cpu::affine_transform_range(0,dest.size(),dest,src1,src2,src3,A,B,C);
+#endif
+    }
+
 // ----------------------------------------------------------------------------------------

    void affine_transform(
--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@ -229,13 +229,58 @@ namespace dlib { namespace tt
        const float D
    );
    /*!
-        requires - dest.size()==src1.size()
+        requires 
+            - dest.size()==src1.size()
            - dest.size()==src2.size()
            - dest.size()==src3.size()
        ensures
            - #dest == A*src1 + B*src2 + C*src3 + D
    !*/

+    void affine_transform(
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires 
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+            - dest.size()==src3.size()
+        ensures
+            - #dest == A*src1 + B*src2 + C*src3
+    !*/
+
+    void affine_transform_range(
+        size_t begin,
+        size_t end,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2,
+        const tensor& src3,
+        const float A,
+        const float B,
+        const float C
+    );
+    /*!
+        requires 
+            - dest.size()==src1.size()
+            - dest.size()==src2.size()
+            - dest.size()==src3.size()
+            - begin <= end <= dest.size()
+        ensures
+            - This function operates much like
+              affine_transform(dest,src1,src2,src3,A,B,C,0), except that it runs over only
+              the half open range [begin,end) rather than processing the entire tensor.
+              Specifically, it does this:
+                - for i in the range [begin, end):
+                    - #dest.host()[i] == A*src1.host()[i] + B*src2.host()[i] + C*src3.host()[i]
+    !*/
+
 // ----------------------------------------------------------------------------------------

    void affine_transform(