From b6418e349e3f8073310a334ad833317638f8ff99 Mon Sep 17 00:00:00 2001 From: Juha Reunanen Date: Tue, 16 May 2023 15:03:58 +0300 Subject: [PATCH] Problem: CUDA error having `code: 716, reason: misaligned address` (#2796) Why does this happen: `float` data should be aligned to 4 bytes [1], but this is not guaranteed when the truth data is `uint16_t` and is in the buffer before the weights data Solution: re-arrange the buffer so that the weights data (type `float`) comes before the truth data (type `uint16_t`), essentially making it so that the `float` data is correctly aligned to 4 bytes [1] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses --- dlib/cuda/cuda_dlib.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlib/cuda/cuda_dlib.h b/dlib/cuda/cuda_dlib.h index 17785eefb..a9a351772 100644 --- a/dlib/cuda/cuda_dlib.h +++ b/dlib/cuda/cuda_dlib.h @@ -707,7 +707,7 @@ namespace dlib cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1); buf = buf+sizeof(float); - const auto weights_offset = subnetwork_output.num_samples() * bytes_per_plane; + const auto truth_offset = subnetwork_output.num_samples() * weight_bytes_per_plane; // copy the truth data into a cuda buffer. 
for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth) { @@ -722,13 +722,13 @@ namespace dlib weights(r, c) = t(r, c).weight; } } - memcpy(buf + i*bytes_per_plane, &labels(0,0), bytes_per_plane); - memcpy(buf + weights_offset + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane); + memcpy(buf + truth_offset + i*bytes_per_plane, &labels(0,0), bytes_per_plane); + memcpy(buf + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane); } - auto truth_buf = static_pointer_cast<uint16_t>(buf, subnetwork_output.num_samples()*image_size); - buf = buf+weights_offset; auto weights_buf = static_pointer_cast<float>(buf, subnetwork_output.num_samples()*image_size); + buf = buf+truth_offset; + auto truth_buf = static_pointer_cast<uint16_t>(buf, subnetwork_output.num_samples()*image_size); do_work(loss_buf, truth_buf, weights_buf, subnetwork_output, gradient, loss); }