diff --git a/dlib/dnn/input.h b/dlib/dnn/input.h
index 28167ac2e..213092ad9 100644
--- a/dlib/dnn/input.h
+++ b/dlib/dnn/input.h
@@ -31,6 +31,8 @@ namespace dlib
     template <size_t NR, size_t NC>
     class input_rgb_image_sized;

+    class input_rgb_image_pair;
+
     class input_rgb_image
     {
     public:
@@ -54,7 +56,11 @@ namespace dlib
         template <size_t NR, size_t NC>
         inline input_rgb_image (
             const input_rgb_image_sized<NR,NC>& item
-        );
+        );
+
+        inline input_rgb_image (
+            const input_rgb_image_pair& item
+        );

         float get_avg_red()   const { return avg_red; }
         float get_avg_green() const { return avg_green; }
@@ -87,7 +93,7 @@ namespace dlib
                 );
             }

-            
+
             // initialize data to the right size to contain the stuff in the iterator range.
             data.set_size(std::distance(ibegin,iend), 3, nr, nc);

@@ -127,7 +133,7 @@ namespace dlib
         {
             std::string version;
             deserialize(version, in);
-            if (version != "input_rgb_image" && version != "input_rgb_image_sized")
+            if (version != "input_rgb_image" && version != "input_rgb_image_sized" && version != "input_rgb_image_pair")
                 throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image.");
             deserialize(item.avg_red, in);
             deserialize(item.avg_green, in);
@@ -216,7 +222,7 @@ namespace dlib
                 );
             }

-            
+
             // initialize data to the right size to contain the stuff in the iterator range.
             data.set_size(std::distance(ibegin,iend), 3, NR, NC);

@@ -303,6 +309,164 @@ namespace dlib
         avg_blue(item.get_avg_blue())
     {}

+// ----------------------------------------------------------------------------------------
+
+    class input_rgb_image_pair
+    {
+    public:
+        typedef std::pair<matrix<rgb_pixel>, matrix<rgb_pixel>> input_type;
+
+        input_rgb_image_pair (
+        ) :
+            avg_red(122.782),
+            avg_green(117.001),
+            avg_blue(104.298)
+        {
+        }
+
+        input_rgb_image_pair (
+            float avg_red,
+            float avg_green,
+            float avg_blue
+        ) : avg_red(avg_red), avg_green(avg_green), avg_blue(avg_blue)
+        {}
+
+        inline input_rgb_image_pair (
+            const input_rgb_image& item
+        ) :
+            avg_red(item.get_avg_red()),
+            avg_green(item.get_avg_green()),
+            avg_blue(item.get_avg_blue())
+        {}
+
+        template <size_t NR, size_t NC>
+        inline input_rgb_image_pair (
+            const input_rgb_image_sized<NR, NC>& item
+        ) :
+            avg_red(item.get_avg_red()),
+            avg_green(item.get_avg_green()),
+            avg_blue(item.get_avg_blue())
+        {}
+
+        float get_avg_red()   const { return avg_red; }
+        float get_avg_green() const { return avg_green; }
+        float get_avg_blue()  const { return avg_blue; }
+
+        bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+        drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+        drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+
+        template <typename forward_iterator>
+        void to_tensor (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            resizable_tensor& data
+        ) const
+        {
+            DLIB_CASSERT(std::distance(ibegin, iend) > 0);
+            const auto nr = ibegin->first.nr();
+            const auto nc = ibegin->first.nc();
+
+            // make sure all the input matrices have the same dimensions
+            for (auto i = ibegin; i != iend; ++i)
+            {
+                DLIB_CASSERT(i->first.nr() == nr && i->first.nc() == nc &&
+                             i->second.nr() == nr && i->second.nc() == nc,
+                    "\t input_rgb_image_pair::to_tensor()"
+                    << "\n\t All matrices given to to_tensor() must have the same dimensions."
+                    << "\n\t nr: " << nr
+                    << "\n\t nc: " << nc
+                    << "\n\t i->first.nr(): " << i->first.nr()
+                    << "\n\t i->first.nc(): " << i->first.nc()
+                    << "\n\t i->second.nr(): " << i->second.nr()
+                    << "\n\t i->second.nc(): " << i->second.nc()
+                );
+            }
+
+            // initialize data to the right size to contain the stuff in the iterator range.
+            data.set_size(2 * std::distance(ibegin, iend), 3, nr, nc);
+
+            const size_t offset = nr * nc;
+            const size_t offset2 = data.size() / 2;
+            auto ptr = data.host();
+            for (auto i = ibegin; i != iend; ++i)
+            {
+                for (long r = 0; r < nr; ++r)
+                {
+                    for (long c = 0; c < nc; ++c)
+                    {
+                        rgb_pixel temp_first = i->first(r, c);
+                        rgb_pixel temp_second = i->second(r, c);
+                        auto p = ptr++;
+                        *p = (temp_first.red - avg_red) / 256.0;
+                        *(p + offset2) = (temp_second.red - avg_red) / 256.0;
+                        p += offset;
+                        *p = (temp_first.green - avg_green) / 256.0;
+                        *(p + offset2) = (temp_second.green - avg_green) / 256.0;
+                        p += offset;
+                        *p = (temp_first.blue - avg_blue) / 256.0;
+                        *(p + offset2) = (temp_second.blue - avg_blue) / 256.0;
+                        p += offset;
+                    }
+                }
+                ptr += offset * (data.k() - 1);
+            }
+        }
+
+        friend void serialize(const input_rgb_image_pair& item, std::ostream& out)
+        {
+            serialize("input_rgb_image_pair", out);
+            serialize(item.avg_red, out);
+            serialize(item.avg_green, out);
+            serialize(item.avg_blue, out);
+        }
+
+        friend void deserialize(input_rgb_image_pair& item, std::istream& in)
+        {
+            std::string version;
+            deserialize(version, in);
+            if (version != "input_rgb_image_pair" && version != "input_rgb_image" && version != "input_rgb_image_sized")
+                throw serialization_error("Unexpected version found while deserializing dlib::input_rgb_image_pair.");
+
+            deserialize(item.avg_red, in);
+            deserialize(item.avg_green, in);
+            deserialize(item.avg_blue, in);
+            // read and discard the sizes if this was really a sized input layer.
+            if (version == "input_rgb_image_sized")
+            {
+                size_t nr, nc;
+                deserialize(nr, in);
+                deserialize(nc, in);
+            }
+        }
+
+        friend std::ostream& operator<<(std::ostream& out, const input_rgb_image_pair& item)
+        {
+            out << "input_rgb_image_pair(" << item.avg_red << "," << item.avg_green << "," << item.avg_blue << ")";
+            return out;
+        }
+
+        friend void to_xml(const input_rgb_image_pair& item, std::ostream& out)
+        {
+            out << "<input_rgb_image_pair r='" << item.avg_red << "' g='" << item.avg_green << "' b='" << item.avg_blue << "'/>";
+        }
+
+    private:
+        float avg_red;
+        float avg_green;
+        float avg_blue;
+    };
+
+// ----------------------------------------------------------------------------------------
+
+    input_rgb_image::
+    input_rgb_image (
+        const input_rgb_image_pair& item
+    ) : avg_red(item.get_avg_red()),
+        avg_green(item.get_avg_green()),
+        avg_blue(item.get_avg_blue())
+    {}
+
 // ----------------------------------------------------------------------------------------

     template <typename T, long NR, long NC, typename MM, typename L>
diff --git a/dlib/dnn/input_abstract.h b/dlib/dnn/input_abstract.h
index 24fe6de6e..eb41d548c 100644
--- a/dlib/dnn/input_abstract.h
+++ b/dlib/dnn/input_abstract.h
@@ -271,6 +271,115 @@ namespace dlib

     };

+// ----------------------------------------------------------------------------------------
+
+    class input_rgb_image_pair
+    {
+        /*!
+            WHAT THIS OBJECT REPRESENTS
+                This input layer works with std::pair of RGB images of type matrix<rgb_pixel>.
+                It is useful when you want to input image pairs that are related to each
+                other, for instance, when they are different distorted views of the same
+                original image.  It is mainly intended to be used with unsupervised loss
+                functions such as loss_barlow_twins_.  You can also convert between
+                input_rgb_image and input_rgb_image_pair by copy construction or assignment.
+        !*/
+    public:
+        typedef std::pair<matrix<rgb_pixel>, matrix<rgb_pixel>> input_type;
+
+        input_rgb_image_pair (
+        );
+        /*!
+            ensures
+                - #get_avg_red() == 122.782
+                - #get_avg_green() == 117.001
+                - #get_avg_blue() == 104.298
+        !*/
+
+        input_rgb_image_pair (
+            float avg_red,
+            float avg_green,
+            float avg_blue
+        );
+        /*!
+            ensures
+                - #get_avg_red() == avg_red
+                - #get_avg_green() == avg_green
+                - #get_avg_blue() == avg_blue
+        !*/
+
+        inline input_rgb_image_pair (
+            const input_rgb_image& item
+        );
+        /*!
+            ensures
+                - #get_avg_red() == item.get_avg_red()
+                - #get_avg_green() == item.get_avg_green()
+                - #get_avg_blue() == item.get_avg_blue()
+        !*/
+
+        template <size_t NR, size_t NC>
+        inline input_rgb_image_pair (
+            const input_rgb_image_sized<NR, NC>& item
+        );
+        /*!
+            ensures
+                - #get_avg_red() == item.get_avg_red()
+                - #get_avg_green() == item.get_avg_green()
+                - #get_avg_blue() == item.get_avg_blue()
+        !*/
+
+        float get_avg_red(
+        ) const;
+        /*!
+            ensures
+                - returns the value subtracted from the red color channel.
+        !*/
+
+        float get_avg_green(
+        ) const;
+        /*!
+            ensures
+                - returns the value subtracted from the green color channel.
+        !*/
+
+        float get_avg_blue(
+        ) const;
+        /*!
+            ensures
+                - returns the value subtracted from the blue color channel.
+        !*/
+
+        template <typename forward_iterator>
+        void to_tensor (
+            forward_iterator ibegin,
+            forward_iterator iend,
+            resizable_tensor& data
+        ) const;
+        /*!
+            requires
+                - [ibegin, iend) is an iterator range over input_type objects.
+                - std::distance(ibegin,iend) > 0
+                - All the images in the input range have the same dimensions.
+            ensures
+                - Converts the iterator range into a tensor and stores it into #data.  In
+                  particular, if the input images have R rows and C columns, then we will
+                  have:
+                    - #data.num_samples() == 2 * std::distance(ibegin,iend)
+                    - #data.nr() == R
+                    - #data.nc() == C
+                    - #data.k() == 3
+                  Moreover, each color channel is normalized by having its average value
+                  subtracted (according to get_avg_red(), get_avg_green(), or
+                  get_avg_blue()) and then is divided by 256.0.
+                  Additionally, the first element of each pair is placed in the first half
+                  of the batch and the second element in the second half.
+        !*/
+
+        // Provided for compatibility with input_rgb_image_pyramid's interface
+        bool image_contained_point ( const tensor& data, const point& p) const { return get_rect(data).contains(p); }
+        drectangle tensor_space_to_image_space ( const tensor& /*data*/, drectangle r) const { return r; }
+        drectangle image_space_to_tensor_space ( const tensor& /*data*/, double /*scale*/, drectangle r ) const { return r; }
+    };
+
 // ----------------------------------------------------------------------------------------

     template <
diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h
index af7a17ff1..a553506fa 100644
--- a/dlib/dnn/loss.h
+++ b/dlib/dnn/loss.h
@@ -3970,6 +3970,155 @@ namespace dlib
     template
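For reference, here is a minimal usage sketch of the new input layer. It is not part of the patch above; it assumes dlib is built with the DNN module, and the image size and pixel values are made up for illustration. It shows the batch layout described in to_tensor(): with N pairs you get 2*N samples, the first views in samples [0, N) and the second views in samples [N, 2*N).

#include <dlib/dnn.h>
#include <utility>
#include <vector>

int main()
{
    using namespace dlib;

    // Two 32x32 "views" of the same image; solid colors keep the example short.
    matrix<rgb_pixel> view_a(32, 32), view_b(32, 32);
    for (long r = 0; r < view_a.nr(); ++r)
    {
        for (long c = 0; c < view_a.nc(); ++c)
        {
            view_a(r, c) = rgb_pixel(200, 100, 50);
            view_b(r, c) = rgb_pixel(50, 100, 200);
        }
    }

    // Build a small batch of image pairs.
    std::vector<std::pair<matrix<rgb_pixel>, matrix<rgb_pixel>>> pairs;
    pairs.emplace_back(view_a, view_b);
    pairs.emplace_back(view_b, view_a);

    // Convert the pairs into one tensor.  With N pairs we get 2*N samples: the
    // first elements of the pairs occupy samples [0, N) and the second elements
    // occupy samples [N, 2*N), each normalized by the average color values.
    input_rgb_image_pair input_layer;
    resizable_tensor data;
    input_layer.to_tensor(pairs.begin(), pairs.end(), data);

    DLIB_CASSERT(data.num_samples() == 2 * (long)pairs.size());
    DLIB_CASSERT(data.k() == 3 && data.nr() == 32 && data.nc() == 32);
    return 0;
}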