mirror of
https://github.com/davisking/dlib.git
synced 2024-11-01 10:14:53 +08:00
Problem: CUDA error with code 716, reason: misaligned address (#2796)
Why does this happen: `float` data should be aligned to 4 bytes [1], but this is not guaranteed when the truth data is `uint16_t` and is placed in the buffer before the weights data. A block of `uint16_t` values has a byte length that is a multiple of 2 but not necessarily of 4, so the `float` data that follows it can start at a misaligned address.
Solution: re-arrange the buffer so that the weights data (type `float`) comes before the truth data (type `uint16_t`), which ensures that the `float` data is correctly aligned to 4 bytes.
[1] https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses
This commit is contained in:
parent
decdef12f5
commit
b6418e349e
@ -707,7 +707,7 @@ namespace dlib
|
||||
|
||||
cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
|
||||
buf = buf+sizeof(float);
|
||||
const auto weights_offset = subnetwork_output.num_samples() * bytes_per_plane;
|
||||
const auto truth_offset = subnetwork_output.num_samples() * weight_bytes_per_plane;
|
||||
// copy the truth data into a cuda buffer.
|
||||
for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
|
||||
{
|
||||
@ -722,13 +722,13 @@ namespace dlib
|
||||
weights(r, c) = t(r, c).weight;
|
||||
}
|
||||
}
|
||||
memcpy(buf + i*bytes_per_plane, &labels(0,0), bytes_per_plane);
|
||||
memcpy(buf + weights_offset + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane);
|
||||
memcpy(buf + truth_offset + i*bytes_per_plane, &labels(0,0), bytes_per_plane);
|
||||
memcpy(buf + i*weight_bytes_per_plane, &weights(0, 0), weight_bytes_per_plane);
|
||||
}
|
||||
|
||||
auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);
|
||||
buf = buf+weights_offset;
|
||||
auto weights_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);
|
||||
buf = buf+truth_offset;
|
||||
auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);
|
||||
|
||||
do_work(loss_buf, truth_buf, weights_buf, subnetwork_output, gradient, loss);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user