Added multiply_zero_padded()

Davis King 2017-08-11 16:39:00 -04:00
parent 46a02d9447
commit f7310f4bbc
7 changed files with 305 additions and 0 deletions

dlib/dnn/cpu_dlib.cpp

@@ -265,6 +265,79 @@ namespace dlib
        }
    }

// ----------------------------------------------------------------------------------------

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    )
    {
        auto d = dest.host();
        auto s1 = src1.host();
        auto s2 = src2.host();

        // Do the simple and fast version if everything has the same dimensions
        if (have_same_dimensions(dest, src1) &&
            have_same_dimensions(dest, src2))
        {
            if (add_to)
            {
                for (size_t i = 0; i < dest.size(); ++i)
                    d[i] += s1[i] * s2[i];
            }
            else
            {
                for (size_t i = 0; i < dest.size(); ++i)
                    d[i] = s1[i] * s2[i];
            }
            return;
        }

        // Otherwise, do the more complex version with bounds checking.
        for (long n = 0; n < dest.num_samples(); ++n)
        {
            for (long k = 0; k < dest.k(); ++k)
            {
                for (long r = 0; r < dest.nr(); ++r)
                {
                    for (long c = 0; c < dest.nc(); ++c)
                    {
                        float v1 = 0;
                        float v2 = 0;

                        // if this index is inside src1
                        if (n < src1.num_samples() &&
                            k < src1.k() &&
                            r < src1.nr() &&
                            c < src1.nc() )
                        {
                            const auto s_idx = ((n*src1.k() + k)*src1.nr() + r)*src1.nc() + c;
                            v1 = s1[s_idx];
                        }

                        // if this index is inside src2
                        if (n < src2.num_samples() &&
                            k < src2.k() &&
                            r < src2.nr() &&
                            c < src2.nc() )
                        {
                            const auto s_idx = ((n*src2.k() + k)*src2.nr() + r)*src2.nc() + c;
                            v2 = s2[s_idx];
                        }

                        if (add_to)
                            *d += v1 * v2;
                        else
                            *d = v1 * v2;
                        ++d;
                    }
                }
            }
        }
    }

// ----------------------------------------------------------------------------------------

    void assign_bias_gradient (

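The bounds-checked path above walks every (n,k,r,c) coordinate of dest and maps it to a flat offset with the usual row-major formula ((n*K + k)*NR + r)*NC + c, substituting 0 whenever the coordinate falls outside a source tensor. Below is a minimal standalone sketch of that logic on plain buffers; it is illustrative only (the Dims struct and the multiply_zero_padded_ref name are invented for this example, not dlib code):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Multiply two row-major 4D arrays pointwise, treating any element
    // that falls outside a source's dimensions as 0.
    struct Dims { long n, k, nr, nc; };

    std::vector<float> multiply_zero_padded_ref (
        const Dims& dd,
        const std::vector<float>& s1, const Dims& d1,
        const std::vector<float>& s2, const Dims& d2
    )
    {
        std::vector<float> out(dd.n*dd.k*dd.nr*dd.nc);
        size_t i = 0;
        for (long n = 0; n < dd.n; ++n)
        for (long k = 0; k < dd.k; ++k)
        for (long r = 0; r < dd.nr; ++r)
        for (long c = 0; c < dd.nc; ++c)
        {
            float v1 = 0, v2 = 0;
            // Same flat index formula as the loops above.
            if (n < d1.n && k < d1.k && r < d1.nr && c < d1.nc)
                v1 = s1[((n*d1.k + k)*d1.nr + r)*d1.nc + c];
            if (n < d2.n && k < d2.k && r < d2.nr && c < d2.nc)
                v2 = s2[((n*d2.k + k)*d2.nr + r)*d2.nc + c];
            out[i++] = v1*v2;
        }
        return out;
    }

    int main()
    {
        // dest is 1x1x2x2 but src2 is only 1x1x1x2, so src2's missing
        // second row behaves as zeros and wipes out that row of the product.
        Dims dd{1,1,2,2}, d1{1,1,2,2}, d2{1,1,1,2};
        std::vector<float> s1{1,2,3,4}, s2{10,20};
        for (float v : multiply_zero_padded_ref(dd, s1, d1, s2, d2))
            std::cout << v << ' ';   // prints: 10 40 0 0
        std::cout << '\n';
    }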
dlib/dnn/cpu_dlib.h

@@ -30,6 +30,13 @@ namespace dlib
        const tensor& src2
    );

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    );

    void add(
        float beta,
        tensor& dest,

dlib/dnn/cuda_dlib.cu

@@ -513,6 +513,134 @@ namespace dlib
    }

// ------------------------------------------------------------------------------------

    __global__ void _cuda_mult1(float* d, const float* s1, const float* s2, size_t n)
    {
        for (auto i : grid_stride_range(0, n))
        {
            d[i] = s1[i]*s2[i];
        }
    }

    __global__ void _cuda_mult1_add_to(float* d, const float* s1, const float* s2, size_t n)
    {
        for (auto i : grid_stride_range(0, n))
        {
            d[i] += s1[i]*s2[i];
        }
    }

    __global__ void _cuda_mult2(float* d, const float* s1, const float* s2,
                                size_t dn, size_t dk, size_t dr, size_t dc,
                                size_t s1n, size_t s1k, size_t s1r, size_t s1c,
                                size_t s2n, size_t s2k, size_t s2r, size_t s2c)
    {
        for (auto i : grid_stride_range(0, dn*dk*dr*dc))
        {
            size_t n,k,r,c;
            unpack_idx(i, dk,dr,dc, n,k,r,c);

            float v1 = 0;
            float v2 = 0;

            if (n < s1n &&
                k < s1k &&
                r < s1r &&
                c < s1c )
            {
                v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
            }

            if (n < s2n &&
                k < s2k &&
                r < s2r &&
                c < s2c )
            {
                v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
            }

            d[i] = v1*v2;
        }
    }

    __global__ void _cuda_mult2_add_to(float* d, const float* s1, const float* s2,
                                       size_t dn, size_t dk, size_t dr, size_t dc,
                                       size_t s1n, size_t s1k, size_t s1r, size_t s1c,
                                       size_t s2n, size_t s2k, size_t s2r, size_t s2c)
    {
        for (auto i : grid_stride_range(0, dn*dk*dr*dc))
        {
            size_t n,k,r,c;
            unpack_idx(i, dk,dr,dc, n,k,r,c);

            float v1 = 0;
            float v2 = 0;

            if (n < s1n &&
                k < s1k &&
                r < s1r &&
                c < s1c )
            {
                v1 = s1[pack_idx(s1k,s1r,s1c, n,k,r,c)];
            }

            if (n < s2n &&
                k < s2k &&
                r < s2r &&
                c < s2c )
            {
                v2 = s2[pack_idx(s2k,s2r,s2c, n,k,r,c)];
            }

            d[i] += v1*v2;
        }
    }

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    )
    {
        if (dest.size() == 0)
            return;

        // Do the simple and fast version if everything has the same dimensions
        if (have_same_dimensions(dest, src1) &&
            have_same_dimensions(dest, src2))
        {
            if (add_to)
                launch_kernel(_cuda_mult1_add_to,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
            else
                launch_kernel(_cuda_mult1,max_jobs(dest.size()), dest.device(), src1.device(), src2.device(), dest.size());
        }
        else
        {
            // Otherwise, do the more complex version with bounds checking.
            if (add_to)
            {
                launch_kernel(_cuda_mult2_add_to,max_jobs(dest.size()),
                              dest.device(), src1.device(), src2.device(),
                              dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
                              src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
                              src2.num_samples(), src2.k(), src2.nr(), src2.nc()
                );
            }
            else
            {
                launch_kernel(_cuda_mult2,max_jobs(dest.size()),
                              dest.device(), src1.device(), src2.device(),
                              dest.num_samples(), dest.k(), dest.nr(), dest.nc(),
                              src1.num_samples(), src1.k(), src1.nr(), src1.nc(),
                              src2.num_samples(), src2.k(), src2.nr(), src2.nc()
                );
            }
        }
    }

// ------------------------------------------------------------------------------------

    __global__ void _cuda_add1(float* d, const float* s1, const float* s2, size_t n)

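The kernels above rely on dlib's unpack_idx()/pack_idx() device helpers, defined elsewhere in dlib's CUDA code, to convert between a flat grid-stride index and (n,k,r,c) tensor coordinates. A host-side sketch of the equivalent arithmetic (re-derived here for illustration, not dlib's actual definitions):

    #include <cassert>
    #include <cstddef>

    // A flat row-major index over an N x K x NR x NC tensor decomposes as
    // i = ((n*K + k)*NR + r)*NC + c, so unpacking is repeated div/mod from
    // the fastest-varying dimension outward.
    inline void unpack_idx (size_t i, size_t K, size_t NR, size_t NC,
                            size_t& n, size_t& k, size_t& r, size_t& c)
    {
        c = i % NC;  i /= NC;
        r = i % NR;  i /= NR;
        k = i % K;   i /= K;
        n = i;
    }

    inline size_t pack_idx (size_t K, size_t NR, size_t NC,
                            size_t n, size_t k, size_t r, size_t c)
    {
        return ((n*K + k)*NR + r)*NC + c;
    }

    int main()
    {
        // Round trip: every flat index in a 2x3x4x5 tensor unpacks and
        // repacks to itself.
        const size_t K = 3, NR = 4, NC = 5;
        for (size_t i = 0; i < 2*K*NR*NC; ++i)
        {
            size_t n, k, r, c;
            unpack_idx(i, K, NR, NC, n, k, r, c);
            assert(pack_idx(K, NR, NC, n, k, r, c) == i);
        }
    }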
dlib/dnn/cuda_dlib.h

@@ -185,6 +185,13 @@ namespace dlib
        const tensor& src2
    );

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    );

    void add (
        tensor& dest,
        const tensor& src1,

dlib/dnn/tensor_tools.cpp

@@ -304,6 +304,20 @@ namespace dlib { namespace tt
#endif
    }

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    )
    {
#ifdef DLIB_USE_CUDA
        cuda::multiply_zero_padded(add_to, dest, src1, src2);
#else
        cpu::multiply_zero_padded(add_to, dest, src1, src2);
#endif
    }

// ----------------------------------------------------------------------------------------

    void affine_transform(

dlib/dnn/tensor_tools.h

@@ -306,6 +306,23 @@ namespace dlib { namespace tt
            - Instead of assigning the result to dest, this function adds the result to dest.
    !*/

    void multiply_zero_padded (
        bool add_to,
        tensor& dest,
        const tensor& src1,
        const tensor& src2
    );
    /*!
        ensures
            - if (add_to) then
                - performs: dest += src1 * src2
            - else
                - performs: dest = src1 * src2
            - In either case, the multiplication happens pointwise according to 4D tensor
              arithmetic.  If the dimensions don't match then missing elements are presumed
              to be equal to 0.
    !*/

// ----------------------------------------------------------------------------------------

    void affine_transform(

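As a quick illustration of the contract above, a minimal usage sketch (assuming a standard dlib build; the tensor sizes are arbitrary). Here src2 is one row short of dest, so that row of the product comes out as zeros:

    #include <dlib/dnn.h>
    #include <iostream>

    int main()
    {
        using namespace dlib;

        resizable_tensor dest(1,1,2,2);
        resizable_tensor src1(1,1,2,2);
        resizable_tensor src2(1,1,1,2);   // missing the second row

        src1 = 1;   // fill every element with 1
        src2 = 2;   // fill every element with 2

        // dest = src1 * src2, with src2's missing row treated as zeros.
        tt::multiply_zero_padded(false, dest, src1, src2);
        std::cout << mat(dest) << std::endl;   // elements: 2 2 0 0
    }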
dlib/test/dnn.cpp

@@ -904,6 +904,64 @@ namespace
        DLIB_TEST_MSG(max(abs(mat(v)-mat(vv))) < 1e-6, max(abs(mat(v)-mat(vv))));
    }

    void test_multiply_zero_padded()
    {
        print_spinner();
        dlib::rand rnd;
        tt::tensor_rand trnd;
        for (int iter = 0; iter < 300; ++iter)
        {
            resizable_tensor dest1(rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1,
                                   rnd.get_random_32bit_number()%4+1);
            resizable_tensor dest2;
            dest2.copy_size(dest1);
            resizable_tensor src1(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);
            resizable_tensor src2(rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1,
                                  rnd.get_random_32bit_number()%4+1);

            trnd.fill_uniform(dest1);
            trnd.fill_uniform(dest2);
            trnd.fill_uniform(src1);
            trnd.fill_uniform(src2);

            cpu::multiply_zero_padded(false, dest1, src1, src2);
            cuda::multiply_zero_padded(false, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);

            cpu::multiply_zero_padded(true, dest1, src1, src2);
            cuda::multiply_zero_padded(true, dest2, src1, src2);
            DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
        }

        // make sure we have a test for the case where all tensors have the same
        // dimensions.
        resizable_tensor dest1(3,4,5,6);
        resizable_tensor dest2;
        resizable_tensor src1;
        resizable_tensor src2;
        dest2.copy_size(dest1);
        src1.copy_size(dest1);
        src2.copy_size(dest1);

        trnd.fill_uniform(dest1);
        trnd.fill_uniform(dest2);
        trnd.fill_uniform(src1);
        trnd.fill_uniform(src2);

        cpu::multiply_zero_padded(false, dest1, src1, src2);
        cuda::multiply_zero_padded(false, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);

        cpu::multiply_zero_padded(true, dest1, src1, src2);
        cuda::multiply_zero_padded(true, dest2, src1, src2);
        DLIB_TEST(max(abs(mat(dest1) - mat(dest2))) < 1e-5);
    }

    void test_add()
    {
        print_spinner();

@@ -2606,6 +2664,7 @@ namespace
        compare_bn_gpu_and_cpu();
        compare_bn_conv_gpu_and_cpu();
        test_add();
        test_multiply_zero_padded();
        compare_adam();
        test_copy_tensor_gpu();
#endif