Problem: CUDA error having code: 716, reason: misaligned address (#2796)

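Why this happens: `float` data must be aligned to 4 bytes [1], but this is not guaranteed when the truth data (type `uint16_t`) is placed in the buffer before the weights data: if the truth data occupies a number of bytes that is not a multiple of 4 (e.g. an odd number of 2-byte `uint16_t` elements), the weights data that follows it starts at a misaligned address.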

Solution: rearrange the buffer so that the weights data (type `float`) comes before the truth data (type `uint16_t`), so that the `float` data is always correctly aligned to 4 bytes.

[1] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses
This commit is contained in:
Juha Reunanen 2023-05-16 15:03:58 +03:00 committed by GitHub
parent decdef12f5
commit b6418e349e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -707,7 +707,7 @@ namespace dlib
cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
buf = buf+sizeof(float);
const auto weights_offset = subnetwork_output.num_samples() * bytes_per_plane;
const auto truth_offset = subnetwork_output.num_samples() * weight_bytes_per_plane;
// copy the truth data into a cuda buffer.
for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
{
@ -722,13 +722,13 @@ namespace dlib
weights(r, c) = t(r, c).weight;
}
}
memcpy(buf + i*bytes_per_plane, &labels(0,0), bytes_per_plane);
memcpy(buf + weights_offset + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane);
memcpy(buf + truth_offset + i*bytes_per_plane, &labels(0,0), bytes_per_plane);
memcpy(buf + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane);
}
auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);
buf = buf+weights_offset;
auto weights_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);
buf = buf+truth_offset;
auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);
do_work(loss_buf, truth_buf, weights_buf, subnetwork_output, gradient, loss);
}