Add new loss layer for binary loss per pixel (#1976)

* Add new loss layer for binary loss per pixel
pull/1985/head
Juha Reunanen 5 years ago committed by Davis E. King
parent 6bdd289f73
commit bd6994cc66

@ -1681,6 +1681,48 @@ namespace dlib
}
}
// ----------------------------------------------------------------------------------------
__device__ float cuda_log1pexp(float x)
{
if (x <= -18)
return std::exp(x);
else if (-18 < x && x <= 9)
return std::log1p(std::exp(x));
else if (9 < x && x <= 16)
return x + std::exp(-x);
else
return x;
}
__global__ void _cuda_compute_loss_binary_log_per_pixel(float* loss_out, float* g, const float* truth, const float* out_data, size_t n, const float scale)
{
float loss = 0;
for(auto i : grid_stride_range(0, n))
{
const float y = truth[i];
if (y > 0.f)
{
const float temp = cuda_log1pexp(-out_data[i]);
loss += y*temp;
g[i] = y*scale*(g[i]-1);
}
else if (y < 0.f)
{
const float temp = -(-out_data[i]-cuda_log1pexp(-out_data[i]));
loss += -y*temp;
g[i] = -y*scale*g[i];
}
else
{
g[i] = 0.f;
}
}
warp_reduce_atomic_add(*loss_out, loss);
}
// ----------------------------------------------------------------------------------------
__device__ float cuda_safe_log(float x, float epsilon = 1e-10)
@ -1720,29 +1762,52 @@ namespace dlib
warp_reduce_atomic_add(*loss_out, loss);
}
// ----------------------------------------------------------------------------------------
void compute_loss_binary_log_per_pixel::
do_work(
cuda_data_ptr<float> loss_work_buffer,
cuda_data_ptr<const float> truth_buffer,
const tensor& subnetwork_output,
tensor& gradient,
double& loss
)
{
CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
sigmoid(gradient, subnetwork_output);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc());
launch_kernel(_cuda_compute_loss_binary_log_per_pixel, max_jobs(gradient.size()),
loss_work_buffer.data(), gradient.device(), truth_buffer.data(), subnetwork_output.device(), gradient.size(), scale);
float floss;
dlib::cuda::memcpy(&floss, loss_work_buffer);
loss = scale*floss;
}
void compute_loss_multiclass_log_per_pixel::
do_work(
float* loss_cuda_work_buffer,
const uint16_t* truth_buffer,
cuda_data_ptr<float> loss_work_buffer,
cuda_data_ptr<const uint16_t> truth_buffer,
const tensor& subnetwork_output,
tensor& gradient,
double& loss
)
{
CHECK_CUDA(cudaMemset(loss_cuda_work_buffer, 0, sizeof(float)));
CHECK_CUDA(cudaMemset(loss_work_buffer, 0, sizeof(float)));
softmax(gradient, subnetwork_output);
static const uint16_t label_to_ignore = std::numeric_limits<uint16_t>::max();
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0 / (subnetwork_output.num_samples() * subnetwork_output.nr() * subnetwork_output.nc());
launch_kernel(_cuda_compute_loss_multiclass_log_per_pixel, max_jobs(gradient.size()),
loss_cuda_work_buffer, gradient.device(), truth_buffer, gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), label_to_ignore, scale);
loss_work_buffer.data(), gradient.device(), truth_buffer.data(), gradient.size(), gradient.nr()*gradient.nc(), gradient.nr()*gradient.nc()*gradient.k(), gradient.k(), label_to_ignore, scale);
float floss;
CHECK_CUDA(cudaMemcpy(&floss, loss_cuda_work_buffer, sizeof(float), cudaMemcpyDefault));
dlib::cuda::memcpy(&floss, loss_work_buffer);
loss = scale*floss;
}

@ -424,11 +424,70 @@ namespace dlib
// ----------------------------------------------------------------------------------------
class compute_loss_binary_log_per_pixel
{
/*!
The point of this class is to compute the loss computed by
loss_binary_log_per_pixel_, but to do so with CUDA.
!*/
public:
compute_loss_binary_log_per_pixel(
)
{
}
template <
typename const_label_iterator
>
void operator() (
const_label_iterator truth,
const tensor& subnetwork_output,
tensor& gradient,
double& loss
) const
{
const auto image_size = subnetwork_output.nr()*subnetwork_output.nc();
const size_t bytes_per_plane = image_size*sizeof(float);
// Allocate a cuda buffer to store all the truth images and also one float
// for the scalar loss output.
buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
buf = buf+sizeof(float);
// copy the truth data into a cuda buffer.
for (long i = 0; i < subnetwork_output.num_samples(); ++i, ++truth)
{
const matrix<float>& t = *truth;
DLIB_ASSERT(t.nr() == subnetwork_output.nr());
DLIB_ASSERT(t.nc() == subnetwork_output.nc());
memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
}
auto truth_buf = static_pointer_cast<const float>(buf, subnetwork_output.num_samples()*image_size);
do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
}
private:
static void do_work(
cuda_data_ptr<float> loss_work_buffer,
cuda_data_ptr<const float> truth_buffer,
const tensor& subnetwork_output,
tensor& gradient,
double& loss
);
mutable cuda_data_void_ptr buf;
};
class compute_loss_multiclass_log_per_pixel
{
/*!
The point of this class is to compute the loss computed by
loss_multiclass_log_per_pixel, but to do so with CUDA.
loss_multiclass_log_per_pixel_, but to do so with CUDA.
!*/
public:
@ -447,12 +506,13 @@ namespace dlib
double& loss
) const
{
const size_t bytes_per_plane = subnetwork_output.nr()*subnetwork_output.nc()*sizeof(uint16_t);
const auto image_size = subnetwork_output.nr()*subnetwork_output.nc();
const size_t bytes_per_plane = image_size*sizeof(uint16_t);
// Allocate a cuda buffer to store all the truth images and also one float
// for the scalar loss output.
buf = device_global_buffer(subnetwork_output.num_samples()*bytes_per_plane + sizeof(float));
cuda_data_void_ptr loss_buf = buf;
cuda_data_ptr<float> loss_buf = static_pointer_cast<float>(buf, 1);
buf = buf+sizeof(float);
// copy the truth data into a cuda buffer.
@ -464,14 +524,16 @@ namespace dlib
memcpy(buf + i*bytes_per_plane, &t(0,0), bytes_per_plane);
}
do_work(static_cast<float*>(loss_buf.data()), static_cast<uint16_t*>(buf.data()), subnetwork_output, gradient, loss);
auto truth_buf = static_pointer_cast<const uint16_t>(buf, subnetwork_output.num_samples()*image_size);
do_work(loss_buf, truth_buf, subnetwork_output, gradient, loss);
}
private:
static void do_work(
float* loss_cuda_work_buffer,
const uint16_t* truth_buffer,
cuda_data_ptr<float> loss_work_buffer,
cuda_data_ptr<const uint16_t> truth_buffer,
const tensor& subnetwork_output,
tensor& gradient,
double& loss

@ -2481,6 +2481,164 @@ namespace dlib
template <typename SUBNET>
using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>;
// ----------------------------------------------------------------------------------------
class loss_binary_log_per_pixel_
{
public:
typedef matrix<float> training_label_type;
typedef matrix<float> output_label_type;
template <
typename SUB_TYPE,
typename label_iterator
>
static void to_label (
const tensor& input_tensor,
const SUB_TYPE& sub,
label_iterator iter
)
{
DLIB_CASSERT(sub.sample_expansion_factor() == 1);
const tensor& output_tensor = sub.get_output();
DLIB_CASSERT(output_tensor.k() == 1);
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
const float* const out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++iter)
{
iter->set_size(output_tensor.nr(), output_tensor.nc());
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
iter->operator()(r, c) = out_data[tensor_index(output_tensor, i, r, c)];
}
}
}
}
template <
typename const_label_iterator,
typename SUBNET
>
double compute_loss_value_and_gradient (
const tensor& input_tensor,
const_label_iterator truth,
SUBNET& sub
) const
{
const tensor& output_tensor = sub.get_output();
tensor& grad = sub.get_gradient_input();
DLIB_CASSERT(sub.sample_expansion_factor() == 1);
DLIB_CASSERT(input_tensor.num_samples() != 0);
DLIB_CASSERT(input_tensor.num_samples()%sub.sample_expansion_factor() == 0);
DLIB_CASSERT(input_tensor.num_samples() == grad.num_samples());
DLIB_CASSERT(input_tensor.num_samples() == output_tensor.num_samples());
DLIB_CASSERT(output_tensor.k() == 1);
DLIB_CASSERT(output_tensor.nr() == grad.nr() &&
output_tensor.nc() == grad.nc() &&
output_tensor.k() == grad.k());
for (long idx = 0; idx < output_tensor.num_samples(); ++idx)
{
const_label_iterator truth_matrix_ptr = (truth + idx);
DLIB_CASSERT(truth_matrix_ptr->nr() == output_tensor.nr() &&
truth_matrix_ptr->nc() == output_tensor.nc(),
"truth size = " << truth_matrix_ptr->nr() << " x " << truth_matrix_ptr->nc() << ", "
"output size = " << output_tensor.nr() << " x " << output_tensor.nc());
}
#ifdef DLIB_USE_CUDA
double loss;
cuda_compute(truth, output_tensor, grad, loss);
return loss;
#else
tt::sigmoid(grad, output_tensor);
// The loss we output is the average loss over the mini-batch, and also over each element of the matrix output.
const double scale = 1.0/(output_tensor.num_samples()*output_tensor.nr()*output_tensor.nc());
double loss = 0;
float* const g = grad.host();
const float* const out_data = output_tensor.host();
for (long i = 0; i < output_tensor.num_samples(); ++i, ++truth)
{
for (long r = 0; r < output_tensor.nr(); ++r)
{
for (long c = 0; c < output_tensor.nc(); ++c)
{
const float y = truth->operator()(r, c);
const size_t idx = tensor_index(output_tensor, i, r, c);
if (y > 0.f)
{
const float temp = log1pexp(-out_data[idx]);
loss += y*scale*temp;
g[idx] = y*scale*(g[idx]-1);
}
else if (y < 0.f)
{
const float temp = -(-out_data[idx]-log1pexp(-out_data[idx]));
loss += -y*scale*temp;
g[idx] = -y*scale*g[idx];
}
else
{
g[idx] = 0.f;
}
}
}
}
return loss;
#endif
}
friend void serialize(const loss_binary_log_per_pixel_& , std::ostream& out)
{
serialize("loss_binary_log_per_pixel_", out);
}
friend void deserialize(loss_binary_log_per_pixel_& , std::istream& in)
{
std::string version;
deserialize(version, in);
if (version != "loss_binary_log_per_pixel_")
throw serialization_error("Unexpected version found while deserializing dlib::loss_binary_log_per_pixel_.");
}
friend std::ostream& operator<<(std::ostream& out, const loss_binary_log_per_pixel_& )
{
out << "loss_binary_log_per_pixel";
return out;
}
friend void to_xml(const loss_binary_log_per_pixel_& /*item*/, std::ostream& out)
{
out << "<loss_binary_log_per_pixel/>";
}
private:
static size_t tensor_index(const tensor& t, long sample, long row, long column)
{
DLIB_ASSERT(t.k() == 1);
// See: https://github.com/davisking/dlib/blob/4dfeb7e186dd1bf6ac91273509f687293bd4230a/dlib/dnn/tensor_abstract.h#L38
return (sample * t.nr() + row) * t.nc() + column;
}
#ifdef DLIB_USE_CUDA
cuda::compute_loss_binary_log_per_pixel cuda_compute;
#endif
};
template <typename SUBNET>
using loss_binary_log_per_pixel = add_loss_layer<loss_binary_log_per_pixel_, SUBNET>;
// ----------------------------------------------------------------------------------------
class loss_multiclass_log_per_pixel_

@ -1283,6 +1283,68 @@ namespace dlib
template <typename SUBNET>
using loss_mean_squared_multioutput = add_loss_layer<loss_mean_squared_multioutput_, SUBNET>;
// ----------------------------------------------------------------------------------------
class loss_binary_log_per_pixel_
{
/*!
WHAT THIS OBJECT REPRESENTS
This object implements the loss layer interface defined above by
EXAMPLE_LOSS_LAYER_. In particular, it implements the log loss, which is
appropriate for binary classification problems. It is basically just like
loss_binary_log_ except that it lets you define matrix outputs instead
of scalar outputs. It should be useful, for example, in segmentation
where we want to classify each pixel of an image, and also get at least
some sort of confidence estimate for each pixel.
!*/
public:
typedef matrix<float> training_label_type;
typedef matrix<float> output_label_type;
template <
typename SUB_TYPE,
typename label_iterator
>
void to_label (
const tensor& input_tensor,
const SUB_TYPE& sub,
label_iterator iter
) const;
/*!
This function has the same interface as EXAMPLE_LOSS_LAYER_::to_label() except
it has the additional calling requirements that:
- sub.get_output().num_samples() == input_tensor.num_samples()
- sub.sample_expansion_factor() == 1
and the output label is the raw score for each classified object. If the score
is > 0 then the classifier is predicting the +1 class, otherwise it is
predicting the -1 class.
!*/
template <
typename const_label_iterator,
typename SUBNET
>
double compute_loss_value_and_gradient (
const tensor& input_tensor,
const_label_iterator truth,
SUBNET& sub
) const;
/*!
This function has the same interface as EXAMPLE_LOSS_LAYER_::compute_loss_value_and_gradient()
except it has the additional calling requirements that:
- sub.get_output().num_samples() == input_tensor.num_samples()
- sub.sample_expansion_factor() == 1
- all pixel values pointed to by truth correspond to the desired target values.
Nominally they should be +1 or -1, each indicating the desired class label,
or 0 to indicate that the corresponding pixel is to be ignored.
!*/
};
template <typename SUBNET>
using loss_binary_log_per_pixel = add_loss_layer<loss_binary_log_per_pixel_, SUBNET>;
// ----------------------------------------------------------------------------------------
class loss_multiclass_log_per_pixel_

@ -2587,6 +2587,202 @@ namespace
DLIB_TEST_MSG(error_after < error_before, "multi channel error increased after training");
}
// ----------------------------------------------------------------------------------------
void test_loss_binary_log_per_pixel_learned_params_on_trivial_two_pixel_task()
{
print_spinner();
::std::vector<matrix<float>> x({ matrix<float,2,1>({ -1, 1 }) });
::std::vector<matrix<float>> y({ matrix<float,2,1>({ -1, 1 }) });
using net_type = loss_binary_log_per_pixel<con<1,1,1,1,1,input<matrix<float>>>>;
net_type net;
dnn_trainer<net_type> trainer(net, sgd(0,0));
trainer.set_learning_rate(1e7);
trainer.set_max_num_epochs(1);
trainer.train(x, y);
const tensor& learned_params = layer<1>(net).layer_details().get_layer_params();
const float* learned_params_data = learned_params.host();
DLIB_TEST(learned_params_data[0] > 1e5);
DLIB_TEST(abs(learned_params_data[1]) < 1);
}
// ----------------------------------------------------------------------------------------
void test_loss_binary_log_per_pixel_outputs_on_trivial_task()
{
print_spinner();
constexpr int input_height = 7;
constexpr int input_width = 5;
constexpr int output_height = input_height;
constexpr int output_width = input_width;
constexpr int num_samples = 7;
constexpr int filter_height = 3;
constexpr int filter_width = 3;
::std::vector<matrix<double>> x(num_samples);
::std::vector<matrix<float>> y(num_samples);
matrix<double> xtmp(input_height, input_width);
matrix<float> ytmp(output_height, output_width);
::std::default_random_engine generator(16);
::std::normal_distribution<double> n01(0);
const auto z = 0.674490; // This should give us a 50/50 split between the classes
// Generate training data: random inputs x, and the corresponding target outputs y
for (int ii = 0; ii < num_samples; ++ii) {
for (int jj = 0; jj < input_height; ++jj) {
for (int kk = 0; kk < input_width; ++kk) {
xtmp(jj, kk) = n01(generator);
ytmp(jj, kk) = std::abs(xtmp(jj, kk)) > z ? 1.f : -1.f;
}
}
x[ii] = xtmp;
y[ii] = ytmp;
}
using net_type = loss_binary_log_per_pixel<con<1,1,1,1,1,relu<con<10,1,1,1,1,input<matrix<double>>>>>>;
net_type net;
dnn_trainer<net_type> trainer(net, sgd(0, 0.9));
trainer.set_learning_rate(1);
trainer.set_max_num_epochs(800);
trainer.train(x, y);
// The learning task is easy, so the net should have no problem
// getting all the outputs right.
const auto response = net(x);
for (int ii = 0; ii < num_samples; ++ii)
for (int jj = 0; jj < output_height; ++jj)
for (int kk = 0; kk < output_width; ++kk)
DLIB_TEST((response[ii](jj,kk) > 0) == (y[ii](jj,kk) > 0));
}
// ----------------------------------------------------------------------------------------
void test_loss_binary_log_per_pixel_with_noise_and_pixels_to_ignore()
{
// Test learning when some pixels are to be ignored, etc.
print_spinner();
constexpr int input_height = 5;
constexpr int input_width = 7;
constexpr int output_height = input_height;
constexpr int output_width = input_width;
const int num_samples = 1000;
const double ignore_probability = 0.5;
const double noise_probability = 0.05;
::std::default_random_engine generator(16);
::std::bernoulli_distribution ignore(ignore_probability);
::std::bernoulli_distribution noise_occurrence(noise_probability);
::std::bernoulli_distribution noisy_label(0.5);
::std::vector<matrix<double>> x(num_samples);
::std::vector<matrix<float>> y(num_samples);
::std::vector<int> truth_histogram(2);
matrix<double> xtmp(input_height, input_width);
matrix<float> ytmp(output_height, output_width);
// The function to be learned.
const auto ground_truth = [](const matrix<double>& x, int row, int column) {
double sum = 0.0;
const int first_column = std::max(0, column - 1);
const int last_column = std::min(static_cast<int>(x.nc() - 1), column + 1);
for (int c = first_column; c <= last_column; ++c) {
sum += x(row, c);
}
DLIB_TEST(sum < 2.0 * (last_column - first_column + 1));
return sum > (last_column - first_column + 1);
};
for ( int ii = 0; ii < num_samples; ++ii ) {
for ( int jj = 0; jj < input_height; ++jj ) {
for ( int kk = 0; kk < input_width; ++kk ) {
// Generate numbers between 0 and 2.
double value = static_cast<double>(ii + jj + kk) / 10.0;
value -= (static_cast<int>(value) / 2) * 2;
DLIB_TEST(value >= 0.0 && value < 2.0);
xtmp(jj, kk) = value;
}
}
x[ii] = xtmp;
for ( int jj = 0; jj < output_height; ++jj ) {
for ( int kk = 0; kk < output_width; ++kk ) {
const bool truth = ground_truth(x[ii], jj, kk);
++truth_histogram[truth];
if (ignore(generator)) {
ytmp(jj, kk) = 0.f;
}
else if (noise_occurrence(generator)) {
ytmp(jj, kk) = noisy_label(generator) ? 1.f : -1.f;
}
else {
ytmp(jj, kk) = truth ? 1.f : -1.f;
}
}
}
y[ii] = ytmp;
}
const int num_total_elements = num_samples * output_height * output_width;
{ // Require a reasonably balanced truth histogram in order to make sure that a trivial classifier is not enough
const int required_min_histogram_value = static_cast<int>(::std::ceil(num_total_elements / 2.0 * 0.375));
for (auto histogram_value : truth_histogram) {
DLIB_TEST_MSG(histogram_value >= required_min_histogram_value,
"Histogram value = " << histogram_value << ", required = " << required_min_histogram_value);
}
}
using net_type = loss_binary_log_per_pixel<con<1,1,input_width,1,1,input<matrix<double>>>>;
net_type net;
sgd defsolver(0,0.9);
dnn_trainer<net_type> trainer(net, defsolver);
trainer.set_learning_rate(0.1);
trainer.set_min_learning_rate(0.01);
trainer.set_mini_batch_size(50);
trainer.set_max_num_epochs(170);
trainer.train(x, y);
const ::std::vector<matrix<float>> predictions = net(x);
int num_correct = 0;
for ( int ii = 0; ii < num_samples; ++ii ) {
const matrix<float>& prediction = predictions[ii];
DLIB_TEST(prediction.nr() == output_height);
DLIB_TEST(prediction.nc() == output_width);
for ( int jj = 0; jj < output_height; ++jj )
for ( int kk = 0; kk < output_width; ++kk )
if ( (prediction(jj, kk) > 0.f) == ground_truth(x[ii], jj, kk) )
++num_correct;
}
// First some sanity checks.
const int num_correct_max = num_total_elements;
DLIB_TEST(num_correct_max == ::std::accumulate(truth_histogram.begin(), truth_histogram.end(), 0));
DLIB_TEST_MSG(num_correct <= num_correct_max,
"Number of correctly classified elements = " << num_correct << ", max = " << num_correct_max);
// This is the real test, verifying that we have actually learned something.
const int num_correct_required = static_cast<int>(::std::ceil(0.9 * num_correct_max));
DLIB_TEST_MSG(num_correct >= num_correct_required,
"Number of correctly classified elements = " << num_correct << ", required = " << num_correct_required);
}
// ----------------------------------------------------------------------------------------
void test_loss_multiclass_per_pixel_learned_params_on_trivial_single_pixel_task()
@ -3429,6 +3625,9 @@ namespace
test_multioutput_linear_regression();
test_simple_autoencoder();
test_loss_mean_squared_per_channel_and_pixel();
test_loss_binary_log_per_pixel_learned_params_on_trivial_two_pixel_task();
test_loss_binary_log_per_pixel_outputs_on_trivial_task();
test_loss_binary_log_per_pixel_with_noise_and_pixels_to_ignore();
test_loss_multiclass_per_pixel_learned_params_on_trivial_single_pixel_task();
test_loss_multiclass_per_pixel_activations_on_trivial_single_pixel_task();
test_loss_multiclass_per_pixel_outputs_on_trivial_task();

@ -16,7 +16,7 @@
./dnn_instance_segmentation_ex /path/to/VOC2012-or-other-images
An alternative to steps 2-4 above is to download a pre-trained network
from here: http://dlib.net/files/instance_segmentation_voc2012net.dnn
from here: http://dlib.net/files/instance_segmentation_voc2012net_v2.dnn
It would be a good idea to become familiar with dlib's DNN tooling before reading this
example. So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp
@ -72,25 +72,20 @@ int main(int argc, char** argv) try
// Load the input image.
load_image(input_image, file.full_name());
// Draw largest objects last
const auto sort_instances = [](const std::vector<mmod_rect>& input) {
auto output = input;
const auto compare_area = [](const mmod_rect& lhs, const mmod_rect& rhs) {
return lhs.rect.area() < rhs.rect.area();
};
std::sort(output.begin(), output.end(), compare_area);
return output;
};
// Find instances in the input image
const auto instances = sort_instances(det_net(input_image));
const auto instances = det_net(input_image);
matrix<rgb_pixel> rgb_label_image;
matrix<float> label_image_confidence;
matrix<rgb_pixel> input_chip;
rgb_label_image.set_size(input_image.nr(), input_image.nc());
rgb_label_image = rgb_pixel(0, 0, 0);
label_image_confidence.set_size(input_image.nr(), input_image.nc());
label_image_confidence = 0.0;
bool found_something = false;
for (const auto& instance : instances)
@ -131,7 +126,7 @@ int main(int argc, char** argv) try
rnd.get_random_8bit_number()
);
dlib::matrix<uint16_t> resized_mask(
dlib::matrix<float> resized_mask(
static_cast<int>(chip_details.rect.height()),
static_cast<int>(chip_details.rect.width())
);
@ -142,12 +137,29 @@ int main(int argc, char** argv) try
{
for (int c = 0; c < resized_mask.nc(); ++c)
{
if (resized_mask(r, c))
const auto new_confidence = resized_mask(r, c);
if (new_confidence > 0)
{
const auto y = chip_details.rect.top() + r;
const auto x = chip_details.rect.left() + c;
if (y >= 0 && y < rgb_label_image.nr() && x >= 0 && x < rgb_label_image.nc())
rgb_label_image(y, x) = random_color;
{
auto& current_confidence = label_image_confidence(y, x);
if (new_confidence > current_confidence)
{
auto rgb_label = random_color;
const auto baseline_confidence = 5;
if (new_confidence < baseline_confidence)
{
// Scale label intensity if confidence isn't high
rgb_label.red *= new_confidence / baseline_confidence;
rgb_label.green *= new_confidence / baseline_confidence;
rgb_label.blue *= new_confidence / baseline_confidence;
}
rgb_label_image(y, x) = rgb_label;
current_confidence = new_confidence;
}
}
}
}
}

@ -23,7 +23,7 @@
./dnn_instance_segmentation_ex /path/to/VOC2012-or-other-images
An alternative to steps 2-4 above is to download a pre-trained network
from here: http://dlib.net/files/instance_segmentation_voc2012net.dnn
from here: http://dlib.net/files/instance_segmentation_voc2012net_v2.dnn
It would be a good idea to become familiar with dlib's DNN tooling before reading this
example. So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp
@ -159,13 +159,13 @@ template <typename SUBNET> using concat_utag4 = resize_and_concat<utag4,utag4_,S
// ----------------------------------------------------------------------------------------
static const char* instance_segmentation_net_filename = "instance_segmentation_voc2012net.dnn";
static const char* instance_segmentation_net_filename = "instance_segmentation_voc2012net_v2.dnn";
// ----------------------------------------------------------------------------------------
// training network type
using seg_bnet_type = dlib::loss_multiclass_log_per_pixel<
dlib::cont<2,1,1,1,1,
using seg_bnet_type = dlib::loss_binary_log_per_pixel<
dlib::cont<1,1,1,1,1,
dlib::relu<dlib::bn_con<dlib::cont<16,7,7,2,2,
concat_utag1<level1t<
concat_utag2<level2t<
@ -180,8 +180,8 @@ using seg_bnet_type = dlib::loss_multiclass_log_per_pixel<
>>>>>>>>>>>>>>>>>>>>>>>>>;
// testing network type (replaced batch normalization with fixed affine transforms)
using seg_anet_type = dlib::loss_multiclass_log_per_pixel<
dlib::cont<2,1,1,1,1,
using seg_anet_type = dlib::loss_binary_log_per_pixel<
dlib::cont<1,1,1,1,1,
dlib::relu<dlib::affine<dlib::cont<16,7,7,2,2,
concat_utag1<alevel1t<
concat_utag2<alevel2t<

@ -49,7 +49,7 @@ struct det_training_sample
struct seg_training_sample
{
matrix<rgb_pixel> input_image;
matrix<uint16_t> label_image; // The ground-truth label of each pixel.
matrix<float> label_image; // The ground-truth label of each pixel. (+1 or -1)
};
// ----------------------------------------------------------------------------------------
@ -321,12 +321,12 @@ det_bnet_type train_detection_network(
// ----------------------------------------------------------------------------------------
matrix<uint16_t> keep_only_current_instance(const matrix<rgb_pixel>& rgb_label_image, const rgb_pixel rgb_label)
matrix<float> keep_only_current_instance(const matrix<rgb_pixel>& rgb_label_image, const rgb_pixel rgb_label)
{
const auto nr = rgb_label_image.nr();
const auto nc = rgb_label_image.nc();
matrix<uint16_t> result(nr, nc);
matrix<float> result(nr, nc);
for (long r = 0; r < nr; ++r)
{
@ -334,11 +334,11 @@ matrix<uint16_t> keep_only_current_instance(const matrix<rgb_pixel>& rgb_label_i
{
const auto& index = rgb_label_image(r, c);
if (index == rgb_label)
result(r, c) = 1;
result(r, c) = +1;
else if (index == dlib::rgb_pixel(224, 224, 192))
result(r, c) = dlib::loss_multiclass_log_per_pixel_::label_to_ignore;
else
result(r, c) = 0;
else
result(r, c) = -1;
}
}
@ -373,7 +373,7 @@ seg_bnet_type train_segmentation_network(
cout << seg_trainer << endl;
std::vector<matrix<rgb_pixel>> samples;
std::vector<matrix<uint16_t>> labels;
std::vector<matrix<float>> labels;
// Start a bunch of threads that read images from disk and pull out random crops. It's
// important to be sure to feed the GPU fast enough to keep it busy. Using multiple

Loading…
Cancel
Save