From f685cb4249b19bac5437558828448a8f6b727b31 Mon Sep 17 00:00:00 2001
From: Juha Reunanen
Date: Sun, 6 Jan 2019 16:11:39 +0200
Subject: [PATCH] Add U-net style skip connections to the semantic-segmentation example (#1600)

* Add concat_prev layer, and U-net example for semantic segmentation
* Allow to supply mini-batch size as command-line parameter
* Decrease default mini-batch size from 30 to 24
* Resize t1, if needed
* Use DenseNet-style blocks instead of residual learning
* Increase default mini-batch size to 50
* Increase default mini-batch size from 50 to 60
* Resize even during the backward step, if needed
* Use resize_bilinear_gradient for the backward step
* Fix function call ambiguity problem
* Clear destination before adding gradient
* Works OK-ish
* Add more U-tags
* Tweak default mini-batch size
* Define a simpler network when using Microsoft Visual C++ compiler; clean up the DenseNet stuff (leaving it for a later PR)
* Decrease default mini-batch size from 24 to 23
* Define separate dnn filename for MSVC++ and not
* Add documentation for the resize_to_prev layer; move the implementation so that it comes after mult_prev
* Fix previous typo
* Minor formatting changes
* Reverse the ordering of levels
* Increase the learning-rate stopping criterion back to 1e-4 (was 1e-8)
* Use more U-tags even on Windows
* Minor formatting
* Latest MSVC 2017 builds fast, so there's no need to limit the depth any longer
* Tweak default mini-batch size again
* Even though latest MSVC can now build the extra layers, it does not mean we should add them!
* Fix naming
---
 dlib/dnn/layers.h                           | 100 +++++++++++++
 dlib/dnn/layers_abstract.h                  |  50 +++++++
 dlib/test/dnn.cpp                           |   4 +-
 examples/dnn_semantic_segmentation_ex.cpp   |   8 +-
 examples/dnn_semantic_segmentation_ex.h     | 137 ++++++++++++------
 .../dnn_semantic_segmentation_train_ex.cpp  |  29 ++--
 6 files changed, 262 insertions(+), 66 deletions(-)

diff --git a/dlib/dnn/layers.h b/dlib/dnn/layers.h
index b41237487..8819dbb9b 100644
--- a/dlib/dnn/layers.h
+++ b/dlib/dnn/layers.h
@@ -2386,6 +2386,106 @@ namespace dlib
     using mult_prev9_ = mult_prev_<tag9>;
     using mult_prev10_ = mult_prev_<tag10>;

+// ----------------------------------------------------------------------------------------
+
+    template <
+        template<typename> class tag
+        >
+    class resize_prev_to_tagged_
+    {
+    public:
+        const static unsigned long id = tag_id<tag>::id;
+
+        resize_prev_to_tagged_()
+        {
+        }
+
+        template <typename SUBNET>
+        void setup (const SUBNET& /*sub*/)
+        {
+        }
+
+        template <typename SUBNET>
+        void forward(const SUBNET& sub, resizable_tensor& output)
+        {
+            auto& prev = sub.get_output();
+            auto& tagged = layer<tag>(sub).get_output();
+
+            DLIB_CASSERT(prev.num_samples() == tagged.num_samples());
+
+            output.set_size(prev.num_samples(),
+                            prev.k(),
+                            tagged.nr(),
+                            tagged.nc());
+
+            if (prev.nr() == tagged.nr() && prev.nc() == tagged.nc())
+            {
+                tt::copy_tensor(false, output, 0, prev, 0, prev.k());
+            }
+            else
+            {
+                tt::resize_bilinear(output, prev);
+            }
+        }
+
+        template <typename SUBNET>
+        void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/)
+        {
+            auto& prev = sub.get_gradient_input();
+
+            DLIB_CASSERT(prev.k() == gradient_input.k());
+            DLIB_CASSERT(prev.num_samples() == gradient_input.num_samples());
+
+            if (prev.nr() == gradient_input.nr() && prev.nc() == gradient_input.nc())
+            {
+                tt::copy_tensor(true, prev, 0, gradient_input, 0, prev.k());
+            }
+            else
+            {
+                tt::resize_bilinear_gradient(prev, gradient_input);
+            }
+        }
+
+        const tensor& get_layer_params() const { return params; }
+        tensor& get_layer_params() { return params; }
+
+        inline dpoint map_input_to_output (const dpoint& p) const { return p; }
+        inline dpoint map_output_to_input (const dpoint& p) const { return p; }
+
+        friend void serialize(const resize_prev_to_tagged_& , std::ostream& out)
+        {
+            serialize("resize_prev_to_tagged_", out);
+        }
+
+        friend void deserialize(resize_prev_to_tagged_& , std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "resize_prev_to_tagged_")
+                throw serialization_error("Unexpected version '"+version+"' found while deserializing dlib::resize_prev_to_tagged_.");
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const resize_prev_to_tagged_& item)
+        {
+            out << "resize_prev_to_tagged"<<id;
+            return out;
+        }
+
+        friend void to_xml(const resize_prev_to_tagged_& item, std::ostream& out)
+        {
+            out << "<resize_prev_to_tagged tag='"<<id<<"'/>\n";
+        }
+
+    private:
+        resizable_tensor params;
+    };
+
+    template <
+        template<typename> class tag,
+        typename SUBNET
+        >
+    using resize_prev_to_tagged = add_layer<resize_prev_to_tagged_<tag>, SUBNET>;
+
 // ----------------------------------------------------------------------------------------

     template <

diff --git a/dlib/dnn/layers_abstract.h b/dlib/dnn/layers_abstract.h
index 9a11724a0..6aad71cec 100644
--- a/dlib/dnn/layers_abstract.h
+++ b/dlib/dnn/layers_abstract.h
@@ -2382,6 +2382,56 @@ namespace dlib
     using mult_prev9_ = mult_prev_<tag9>;
     using mult_prev10_ = mult_prev_<tag10>;

+// ----------------------------------------------------------------------------------------
+
+    template <
+        template<typename> class tag
+        >
+    class resize_prev_to_tagged_
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
+                defined above.  This layer resizes the output channels of the previous layer
+                to have the same number of rows and columns as the output of the tagged layer.
+
+                This layer uses bilinear interpolation.  If the sizes match already, then it
+                simply copies the data.
+
+                Therefore, you supply a tag via resize_prev_to_tagged's template argument that
+                tells it what layer to use for the target size.
+
+                If tensor PREV is resized to the size of tensor TAGGED, then a tensor OUT is
+                produced such that:
+                    - OUT.num_samples() == PREV.num_samples()
+                    - OUT.k() == PREV.k()
+                    - OUT.nr() == TAGGED.nr()
+                    - OUT.nc() == TAGGED.nc()
+        !*/
+
+    public:
+        resize_prev_to_tagged_(
+        );
+
+        template <typename SUBNET> void setup(const SUBNET& sub);
+        template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
+        template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);
+        dpoint map_input_to_output(dpoint p) const;
+        dpoint map_output_to_input(dpoint p) const;
+        const tensor& get_layer_params() const;
+        tensor& get_layer_params();
+        /*!
+            These functions are implemented as described in the
+            EXAMPLE_COMPUTATIONAL_LAYER_ interface.
+        !*/
+    };
+
+
+    template <
+        template<typename> class tag,
+        typename SUBNET
+        >
+    using resize_prev_to_tagged = add_layer<resize_prev_to_tagged_<tag>, SUBNET>;
+
 // ----------------------------------------------------------------------------------------

     template <

diff --git a/dlib/test/dnn.cpp b/dlib/test/dnn.cpp
index a13f44254..4d4d98347 100644
--- a/dlib/test/dnn.cpp
+++ b/dlib/test/dnn.cpp
@@ -1910,7 +1910,7 @@ namespace
     template <typename SUBNET> using pres = prelu<add_prev1<bn_con<con<8,3,3,1,1,prelu<bn_con<con<8,3,3,1,1,tag1<SUBNET>>>>>>>>;

-    void test_visit_funcions()
+    void test_visit_functions()
     {
         using net_type2 = loss_multiclass_log<

diff --git a/examples/dnn_semantic_segmentation_ex.cpp b/examples/dnn_semantic_segmentation_ex.cpp
--- a/examples/dnn_semantic_segmentation_ex.cpp
+++ b/examples/dnn_semantic_segmentation_ex.cpp
-    deserialize("semantic_segmentation_voc2012net.dnn") >> net;
+    deserialize(semantic_segmentation_net_filename) >> net;

     // Show inference results in a window.
     image_window win;

diff --git a/examples/dnn_semantic_segmentation_ex.h b/examples/dnn_semantic_segmentation_ex.h
index 47fc102c9..3665c5920 100644
--- a/examples/dnn_semantic_segmentation_ex.h
+++ b/examples/dnn_semantic_segmentation_ex.h
@@ -23,7 +23,7 @@
     ./dnn_semantic_segmentation_ex /path/to/VOC2012-or-other-images

     An alternative to steps 2-4 above is to download a pre-trained network
-    from here: http://dlib.net/files/semantic_segmentation_voc2012net.dnn
+    from here: http://dlib.net/files/semantic_segmentation_voc2012net_v2.dnn

     It would be a good idea to become familiar with dlib's DNN tooling before reading this
     example.  So you should read dnn_introduction_ex.cpp and dnn_introduction2_ex.cpp
@@ -115,16 +115,16 @@ const Voc2012class& find_voc2012_class(Predicate predicate)

 // ----------------------------------------------------------------------------------------

 // Introduce the building blocks used to define the segmentation network.
-// The network first does residual downsampling (similar to the dnn_imagenet_(train_)ex
-// example program), and then residual upsampling.  The network could be improved e.g.
-// by introducing skip connections from the input image, and/or the first layers, to the
-// last layer(s).  (See Long et al., Fully Convolutional Networks for Semantic Segmentation,
-// https://people.eecs.berkeley.edu/~jonlong/long_shelhamer_fcn.pdf)
+// The network first does residual downsampling (similar to the dnn_imagenet_(train_)ex
+// example program), and then residual upsampling.  In addition, U-Net style skip
+// connections are used, so that not every simple detail needs to be represented at the low
+// levels.  (See Ronneberger et al. (2015), U-Net: Convolutional Networks for Biomedical
+// Image Segmentation, https://arxiv.org/pdf/1505.04597.pdf)

-template <int N, template <typename> class BN, int stride, typename SUBNET>
-using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;
+template <int N, template <typename> class BN, int stride, typename SUBNET>
+using block = BN<con<N,3,3,1,1,relu<BN<con<N,3,3,stride,stride,SUBNET>>>>>;

-template <int N, template <typename> class BN, int stride, typename SUBNET>
+template <int N, template <typename> class BN, int stride, typename SUBNET>
 using blockt = BN<cont<N,3,3,stride,stride,relu<BN<cont<N,3,3,1,1,SUBNET>>>>>;

 template
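
For readers who want to see how the new layer composes with dlib's existing tag and concat layers, here is a minimal sketch of a single U-Net style skip connection. It is illustrative only: the tag ids (1001, 1002), the alias names (enc_tag, dec_tag, skip_connect, mini_unet) and the tiny encoder-decoder are invented for this sketch and are not the network actually defined in dnn_semantic_segmentation_ex.h.

// Sketch: one U-Net style skip connection built from resize_prev_to_tagged.
// enc_tag marks the encoder feature map to reuse; the decoder output is
// bilinearly resized to enc_tag's spatial size, tagged as dec_tag, and the two
// maps are then concatenated along the channel (k) dimension.
#include <dlib/dnn.h>

using namespace dlib;

// Hypothetical tag ids, chosen only for this sketch.
template <typename SUBNET> using enc_tag = add_tag_layer<1001, SUBNET>; // encoder features
template <typename SUBNET> using dec_tag = add_tag_layer<1002, SUBNET>; // resized decoder features

// Resize the current output to enc_tag's rows/columns, tag the result, then
// concatenate it with the encoder features.
template <typename SUBNET>
using skip_connect = concat2<enc_tag, dec_tag,
                     dec_tag<resize_prev_to_tagged<enc_tag, SUBNET>>>;

// A deliberately tiny encoder-decoder (no batch norm, for brevity) using one
// such skip connection.
using mini_unet = loss_multiclass_log_per_pixel<
    con<21,1,1,1,1,              // map the concatenated features to 21 VOC classes
    skip_connect<
    cont<32,3,3,2,2,             // upsample with a transposed convolution
    relu<con<64,3,3,2,2,         // downsample
    enc_tag<relu<con<32,3,3,1,1, // encoder features that will be reused
    input<matrix<rgb_pixel>>
    >>>>>>>>>;

The reason resize_prev_to_tagged is in the chain at all is that the strided convolution and the transposed convolution need not reproduce the tagged layer's rows and columns exactly; whatever size the decoder produces is resized to match the tagged encoder output before concatenation, which is what lets the real example stack several such connections at different scales.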