diff --git a/dlib/dnn/cpu_dlib.cpp b/dlib/dnn/cpu_dlib.cpp
index cdeb6d9a4..6ba2ac4da 100644
--- a/dlib/dnn/cpu_dlib.cpp
+++ b/dlib/dnn/cpu_dlib.cpp
@@ -265,6 +265,79 @@ namespace dlib
             }
         }
 
+    // ----------------------------------------------------------------------------------------
+
+        void multiply_zero_padded (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        )
+        {
+            auto d = dest.host();
+            auto s1 = src1.host();
+            auto s2 = src2.host();
+
+            // Do the simple and fast version if everything has the same dimensions
+            if (have_same_dimensions(dest, src1) &&
+                have_same_dimensions(dest, src2))
+            {
+                if (add_to)
+                {
+                    for (size_t i = 0; i < dest.size(); ++i)
+                        d[i] += s1[i] * s2[i];
+                }
+                else
+                {
+                    for (size_t i = 0; i < dest.size(); ++i)
+                        d[i] = s1[i] * s2[i];
+                }
+                return;
+            }
+
+            // Otherwise, do the more complex version with bounds checking.
+            for (long n = 0; n < dest.num_samples(); ++n)
+            {
+                for (long k = 0; k < dest.k(); ++k)
+                {
+                    for (long r = 0; r < dest.nr(); ++r)
+                    {
+                        for (long c = 0; c < dest.nc(); ++c)
+                        {
+                            float v1 = 0;
+                            float v2 = 0;
+
+                            // if this index is inside src1
+                            if (n < src1.num_samples() &&
+                                k < src1.k() &&
+                                r < src1.nr() &&
+                                c < src1.nc() )
+                            {
+                                const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
+                                v1 = s1[s_idx];
+                            }
+
+                            // if this index is inside src2
+                            if (n < src2.num_samples() &&
+                                k < src2.k() &&
+                                r < src2.nr() &&
+                                c < src2.nc() )
+                            {
+                                const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
+                                v2 = s2[s_idx];
+                            }
+
+                            if (add_to)
+                                *d += v1 * v2;
+                            else
+                                *d = v1 * v2;
+                            ++d;
+                        }
+                    }
+                }
+            }
+        }
+
     // ----------------------------------------------------------------------------------------
 
         void assign_bias_gradient (
diff --git a/dlib/dnn/cpu_dlib.h b/dlib/dnn/cpu_dlib.h
index 97a9aaca3..df8d20002 100644
--- a/dlib/dnn/cpu_dlib.h
+++ b/dlib/dnn/cpu_dlib.h
@@ -30,6 +30,13 @@ namespace dlib
             const tensor& src2
         );
 
+        void multiply_zero_padded (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
         void add(
             float beta,
             tensor& dest,
diff --git a/dlib/dnn/cuda_dlib.cu b/dlib/dnn/cuda_dlib.cu
index b6bfc7834..7512f9c72 100644
--- a/dlib/dnn/cuda_dlib.cu
+++ b/dlib/dnn/cuda_dlib.cu
@@ -513,6 +513,134 @@ namespace dlib
         }
 
 
+    // ------------------------------------------------------------------------------------
+
+        __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] = s1[i]*s2[i];
+            }
+        }
+
+        __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
+        {
+            for (auto i : grid_stride_range(0, n))
+            {
+                d[i] += s1[i]*s2[i];
+            }
+        }
+
+        __global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
+                                    size_t dn, size_t dk, size_t dr, size_t dc,
+                                    size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+                                    size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+        {
+            for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+            {
+                size_t n,k,r,c;
+                unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+                float v1 = 0;
+                float v2 = 0;
+
+                if (n < s1n &&
+                    k < s1k &&
+                    r < s1r &&
+                    c < s1c )
+                {
+                    v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+                }
+
+                if (n < s2n &&
+                    k < s2k &&
+                    r < s2r &&
+                    c < s2c )
+                {
+                    v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+                }
+
+                d[i] = v1*v2;
+            }
+        }
+
+        __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
+                                           size_t dn, size_t dk, size_t dr, size_t dc,
+                                           size_t s1n, size_t s1k, size_t s1r, size_t s1c,
+                                           size_t s2n, size_t s2k, size_t s2r, size_t s2c)
+        {
+            for (auto i : grid_stride_range(0, dn*dk*dr*dc))
+            {
+                size_t n,k,r,c;
+                unpack_idx(i, dk,dr,dc, n,k,r,c);
+
+                float v1 = 0;
+                float v2 = 0;
+
+                if (n < s1n &&
+                    k < s1k &&
+                    r < s1r &&
+                    c < s1c )
+                {
+                    v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
+                }
+
+                if (n < s2n &&
+                    k < s2k &&
+                    r < s2r &&
+                    c < s2c )
+                {
+                    v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
+                }
+
+                d[i] += v1*v2;
+            }
+        }
+
+        void multiply_zero_padded (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        )
+        {
+            if (dest.size() == 0)
+                return;
+
+            // Do the simple and fast version if everything has the same dimensions
+            if (have_same_dimensions(dest, src1) &&
+                have_same_dimensions(dest, src2))
+            {
+                if (add_to)
+                    launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+                else
+                    launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
+            }
+            else
+            {
+                if (add_to)
+                {
+                    // Otherwise, do the more complex version with bounds checking.
+                    launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()),
+                        dest.device(), src1.device(), src2.device(),
+                        dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+                        src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+                        src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+                        );
+                }
+                else
+                {
+                    // Otherwise, do the more complex version with bounds checking.
+                    launch_kernel(_cuda_mult2,max_jobs(dest.size()),
+                        dest.device(), src1.device(), src2.device(),
+                        dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
+                        src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
+                        src2.num_samples(), src2.k(), src2.nr(), src2.nc()
+                        );
+                }
+            }
+        }
+
     // ------------------------------------------------------------------------------------
 
         __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)
diff --git a/dlib/dnn/cuda_dlib.h b/dlib/dnn/cuda_dlib.h
index 545615e37..dbf1bbd69 100644
--- a/dlib/dnn/cuda_dlib.h
+++ b/dlib/dnn/cuda_dlib.h
@@ -185,6 +185,13 @@ namespace dlib
             const tensor& src2
        );
 
+        void multiply_zero_padded (
+            bool add_to,
+            tensor& dest,
+            const tensor& src1,
+            const tensor& src2
+        );
+
         void add (
             tensor& dest,
             const tensor& src1,
diff --git a/dlib/dnn/tensor_tools.cpp b/dlib/dnn/tensor_tools.cpp
index fc8a91340..7fc016fd1 100644
--- a/dlib/dnn/tensor_tools.cpp
+++ b/dlib/dnn/tensor_tools.cpp
@@ -304,6 +304,20 @@ namespace dlib { namespace tt
 #endif
     }
 
+    void multiply_zero_padded (
+        bool add_to,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    )
+    {
+#ifdef DLIB_USE_CUDA
+        cuda::multiply_zero_padded(add_to, dest, src1, src2);
+#else
+        cpu::multiply_zero_padded(add_to, dest, src1, src2);
+#endif
+    }
+
 // ----------------------------------------------------------------------------------------
 
     void affine_transform(
diff --git a/dlib/dnn/tensor_tools.h b/dlib/dnn/tensor_tools.h
index 40e27433e..3b2133954 100644
--- a/dlib/dnn/tensor_tools.h
+++ b/dlib/dnn/tensor_tools.h
@@ -306,6 +306,23 @@ namespace dlib { namespace tt
             - Instead of assigning the result to dest, this function adds the result to dest.
     !*/
 
+    void multiply_zero_padded (
+        bool add_to,
+        tensor& dest,
+        const tensor& src1,
+        const tensor& src2
+    );
+    /*!
+        ensures
+            - if (add_to) then
+                - performs: dest += src1 * src2
+            - else
+                - performs: dest = src1 * src2
+            - In either case, the multiplication happens pointwise according to 4D tensor
+              arithmetic.  If the dimensions don't match then missing elements are presumed
+              to be equal to 0.
+    !*/
+
 // ----------------------------------------------------------------------------------------
 
     void affine_transform(
diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index b8f16c538..a4a938367 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -904,6 +904,64 @@ namespace
         DLIB_TEST_MSG(max(abs(mat(v)-mat(vv))) < 1e-6, max(abs(mat(v)-mat(vv))));
     }
 
+    void test_multiply_zero_padded()
+    {
+        print_spinner();
+        dlib::rand rnd;
+        tt::tensor_rand trnd;
+        for (int iter = 0; iter < 300; ++iter)
+        {
+            resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
+                                   rnd.get_random_32bit_number()%4+1,
+                                   rnd.get_random_32bit_number()%4+1,
+                                   rnd.get_random_32bit_number()%4+1);
+            resizable_tensor dest2;
+            dest2.copy_size(dest1);
+            resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
+                                  rnd.get_random_32bit_number()%4+1,
+                                  rnd.get_random_32bit_number()%4+1,
+                                  rnd.get_random_32bit_number()%4+1);
+            resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
+                                  rnd.get_random_32bit_number()%4+1,
+                                  rnd.get_random_32bit_number()%4+1,
+                                  rnd.get_random_32bit_number()%4+1);
+
+            trnd.fill_uniform(dest1);
+            trnd.fill_uniform(dest2);
+            trnd.fill_uniform(src1);
+            trnd.fill_uniform(src2);
+            cpu::multiply_zero_padded(false, dest1, src1, src2);
+            cuda::multiply_zero_padded(false, dest2, src1, src2);
+            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
+
+            cpu::multiply_zero_padded(true, dest1, src1, src2);
+            cuda::multiply_zero_padded(true, dest2, src1, src2);
+            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
+        }
+
+        // make sure we have a test for the case where all tensors have the same
+        // dimensions.
+        resizable_tensor dest1(3,4,5,6);
+        resizable_tensor dest2;
+        resizable_tensor src1;
+        resizable_tensor src2;
+        dest2.copy_size(dest1);
+        src1.copy_size(dest1);
+        src2.copy_size(dest1);
+
+        trnd.fill_uniform(dest1);
+        trnd.fill_uniform(dest2);
+        trnd.fill_uniform(src1);
+        trnd.fill_uniform(src2);
+        cpu::multiply_zero_padded(false, dest1, src1, src2);
+        cuda::multiply_zero_padded(false, dest2, src1, src2);
+        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
+
+        cpu::multiply_zero_padded(true, dest1, src1, src2);
+        cuda::multiply_zero_padded(true, dest2, src1, src2);
+        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
+    }
+
     void test_add()
     {
         print_spinner();
@@ -2606,6 +2664,7 @@ namespace
             compare_bn_gpu_and_cpu();
             compare_bn_conv_gpu_and_cpu();
             test_add();
+            test_multiply_zero_padded();
             compare_adam();
             test_copy_tensor_gpu();
 #endif
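
Usage note (not part of the patch above): the sketch below shows one way the new tt::multiply_zero_padded routine could be called from user code. It is a minimal illustration assuming a dlib build where dlib/dnn.h is available; the tensor shapes are arbitrary example values chosen here to demonstrate the zero-padding behavior, not anything taken from the patch.

    #include <dlib/dnn.h>
    #include <iostream>

    int main()
    {
        using namespace dlib;

        // dest is larger than src1 and src2 in some dimensions.  Elements of
        // src1/src2 that are "missing" relative to dest are treated as 0, so the
        // corresponding entries of dest become 0 when add_to is false, and are
        // left unchanged when add_to is true (adding a zero product).
        resizable_tensor dest(1, 3, 4, 4);
        resizable_tensor src1(1, 2, 4, 4);
        resizable_tensor src2(1, 3, 2, 2);

        tt::tensor_rand rnd;
        rnd.fill_uniform(src1);
        rnd.fill_uniform(src2);

        // dest = src1 * src2, pointwise with zero padding of the smaller tensors.
        tt::multiply_zero_padded(false, dest, src1, src2);

        // dest += src1 * src2
        tt::multiply_zero_padded(true, dest, src1, src2);

        std::cout << "sum of dest: " << sum(mat(dest)) << std::endl;
        return 0;
    }

This mirrors what the new test in dlib/test/dnn.cpp checks: the CPU and CUDA paths dispatched through tt::multiply_zero_padded should agree for both the add_to=false and add_to=true cases, whether or not the tensors share the same dimensions.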