diff --git a/dlib/dnn/core.h b/dlib/dnn/core.h
index 6d806f49b..493ecb41b 100644
--- a/dlib/dnn/core.h
+++ b/dlib/dnn/core.h
@@ -2120,12 +2120,12 @@ namespace dlib
     {
     private:
         // We don't want anyone making these no_label_type objects. They are here only to
-        // allow add_loss_layer::label_type and dnn_trainer::label_type to exist which avoids
-        // needing to overload add_loss_layer and dnn_trainer for supervised an unsupervised
-        // losses. It also can be a type to use in template metaprogramming to indicate
-        // "no label". So here we make the constructor private with the exception that
-        // add_loss_layer objects can make it (again, just to simplify add_loss_layer's
-        // implementation).
+        // allow add_loss_layer::training_label_type and dnn_trainer::training_label_type
+        // to exist which avoids needing to overload add_loss_layer and dnn_trainer for
+        // supervised and unsupervised losses. It also can be a type to use in template
+        // metaprogramming to indicate "no label". So here we make the constructor private
+        // with the exception that add_loss_layer objects can make it (again, just to
+        // simplify add_loss_layer's implementation).
         no_label_type(){};
         template <typename LOSS_DETAILS, typename SUBNET> friend class add_loss_layer;
         template < typename net_type, typename solver_type > friend class dnn_trainer;
@@ -2137,14 +2137,25 @@ namespace dlib
     class add_loss_layer
     {
         template <typename T, typename enabled=void>
-        struct get_loss_layer_label_type
+        struct get_loss_layer_training_label_type
         {
             typedef no_label_type type;
         };
         template <typename T>
-        struct get_loss_layer_label_type<T,typename std::enable_if<sizeof(typename T::label_type)!=0>::type>
+        struct get_loss_layer_training_label_type<T,typename std::enable_if<sizeof(typename T::training_label_type)!=0>::type>
         {
-            typedef typename T::label_type type;
+            typedef typename T::training_label_type type;
+        };
+
+        template <typename T, typename enabled=void>
+        struct get_loss_layer_output_label_type
+        {
+            typedef no_label_type type;
+        };
+        template <typename T>
+        struct get_loss_layer_output_label_type<T,typename std::enable_if<sizeof(typename T::output_label_type)!=0>::type>
+        {
+            typedef typename T::output_label_type type;
         };

     public:
@@ -2154,7 +2165,8 @@ namespace dlib
         const static size_t num_layers = subnet_type::num_layers + 1;
         // Note that the loss layer doesn't count as an additional computational layer.
         const static size_t num_computational_layers = subnet_type::num_computational_layers;
-        typedef typename get_loss_layer_label_type<LOSS_DETAILS>::type label_type;
+        typedef typename get_loss_layer_training_label_type<LOSS_DETAILS>::type training_label_type;
+        typedef typename get_loss_layer_output_label_type<LOSS_DETAILS>::type output_label_type;

         static_assert(is_nonloss_layer_type<SUBNET>::value,
             "SUBNET must be of type add_layer, add_skip_layer, or add_tag_layer.");
@@ -2250,19 +2262,19 @@ namespace dlib
             (*this)(temp_tensor, obegin);
         }

-        const label_type& operator() (const input_type& x)
+        const output_label_type& operator() (const input_type& x)
         {
             (*this)(&x, &x+1, &temp_label);
             return temp_label;
         }

         template <typename iterable_type>
-        std::vector<label_type> operator() (
+        std::vector<output_label_type> operator() (
            const iterable_type& data,
            size_t batch_size = 128
        )
        {
-            std::vector<label_type> results(std::distance(data.begin(), data.end()));
+            std::vector<output_label_type> results(std::distance(data.begin(), data.end()));
            auto o = results.begin();
            auto i = data.begin();
            auto num_remaining = results.size();
@@ -2426,7 +2438,7 @@ namespace dlib

        // These two objects don't logically contribute to the state of this object. They
        // are here to prevent them from being reallocated over and over.
-        label_type temp_label;
+        output_label_type temp_label;
         resizable_tensor temp_tensor;

     };

diff --git a/dlib/dnn/core_abstract.h b/dlib/dnn/core_abstract.h
index 9ad37bbe0..7d256634e 100644
--- a/dlib/dnn/core_abstract.h
+++ b/dlib/dnn/core_abstract.h
@@ -619,9 +619,12 @@ namespace dlib
         typedef typename subnet_type::input_type input_type;
         const static size_t num_computational_layers = subnet_type::num_computational_layers;
         const static size_t num_layers = subnet_type::num_layers + 1;
-        // If LOSS_DETAILS is an unsupervised loss then label_type==no_label_type.
+        // If LOSS_DETAILS is an unsupervised loss then training_label_type==no_label_type.
         // Otherwise it is defined as follows:
-        typedef typename LOSS_DETAILS::label_type label_type;
+        typedef typename LOSS_DETAILS::training_label_type training_label_type;
+        // Similarly, if LOSS_DETAILS doesn't provide any output conversion then
+        // output_label_type==no_label_type.
+        typedef typename LOSS_DETAILS::output_label_type output_label_type;

@@ -768,7 +771,7 @@ namespace dlib
                - x.num_samples()%sample_expansion_factor() == 0
                - x.num_samples() > 0
                - obegin == iterator pointing to the start of a range of
-                  x.num_samples()/sample_expansion_factor() label_type elements.
+                  x.num_samples()/sample_expansion_factor() output_label_type elements.
            ensures
                - runs x through the network and writes the output to the range at obegin.
                - loss_details().to_label() is used to write the network output into
@@ -786,7 +789,7 @@ namespace dlib
                - [ibegin, iend) is an iterator range over input_type objects.
                - std::distance(ibegin,iend) > 0
                - obegin == iterator pointing to the start of a range of
-                  std::distance(ibegin,iend) label_type elements.
+                  std::distance(ibegin,iend) output_label_type elements.
            ensures
                - runs [ibegin,iend) through the network and writes the output to the
                  range at obegin.
@@ -796,18 +799,18 @@ namespace dlib

        // -------------

-        const label_type& operator() (
+        const output_label_type& operator() (
            const input_type& x
        );
        /*!
            ensures
                - runs a single object, x, through the network and returns the output.
                - loss_details().to_label() is used to convert the network output into a
-                  label_type.
+                  output_label_type.
        !*/

        template <typename iterable_type>
-        std::vector<label_type> operator() (
+        std::vector<output_label_type> operator() (
            const iterable_type& data,
            size_t batch_size = 128
        );
@@ -826,7 +829,7 @@ namespace dlib
                  items. Using a batch_size > 1 can be faster because it better exploits
                  the available hardware parallelism.
                - loss_details().to_label() is used to convert the network output into a
-                  label_type.
+                  output_label_type.
        !*/

        // -------------
@@ -844,7 +847,7 @@ namespace dlib
                - x.num_samples()%sample_expansion_factor() == 0
                - x.num_samples() > 0
                - lbegin == iterator pointing to the start of a range of
-                  x.num_samples()/sample_expansion_factor() label_type elements.
+                  x.num_samples()/sample_expansion_factor() training_label_type elements.
            ensures
                - runs x through the network, compares the output to the expected output
                  pointed to by lbegin, and returns the resulting loss.
@@ -864,7 +867,7 @@ namespace dlib
                - [ibegin, iend) is an iterator range over input_type objects.
                - std::distance(ibegin,iend) > 0
                - lbegin == iterator pointing to the start of a range of
-                  std::distance(ibegin,iend) label_type elements.
+                  std::distance(ibegin,iend) training_label_type elements.
            ensures
                - runs [ibegin,iend) through the network, compares the output to the
                  expected output pointed to by lbegin, and returns the resulting loss.
@@ -880,7 +883,7 @@ namespace dlib
        );
        /*!
            requires
-                - LOSS_DETAILS is an unsupervised loss. i.e. label_type==no_label_type.
+                - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
                - sample_expansion_factor() != 0
                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
                  to something non-zero.)
@@ -898,7 +901,7 @@ namespace dlib
        );
        /*!
            requires
-                - LOSS_DETAILS is an unsupervised loss. i.e. label_type==no_label_type.
+                - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
                - [ibegin, iend) is an iterator range over input_type objects.
                - std::distance(ibegin,iend) > 0
            ensures
@@ -921,7 +924,7 @@ namespace dlib
                - x.num_samples()%sample_expansion_factor() == 0
                - x.num_samples() > 0
                - lbegin == iterator pointing to the start of a range of
-                  x.num_samples()/sample_expansion_factor() label_type elements.
+                  x.num_samples()/sample_expansion_factor() training_label_type elements.
            ensures
                - runs x through the network, compares the output to the expected output
                  pointed to by lbegin, and computes parameter and data gradients with
@@ -944,7 +947,7 @@ namespace dlib
                - [ibegin, iend) is an iterator range over input_type objects.
                - std::distance(ibegin,iend) > 0
                - lbegin == iterator pointing to the start of a range of
-                  std::distance(ibegin,iend) label_type elements.
+                  std::distance(ibegin,iend) training_label_type elements.
            ensures
                - runs [ibegin,iend) through the network, compares the output to the
                  expected output pointed to by lbegin, and computes parameter and data
@@ -961,7 +964,7 @@ namespace dlib
        );
        /*!
            requires
-                - LOSS_DETAILS is an unsupervised loss. i.e. label_type==no_label_type.
+                - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
                - sample_expansion_factor() != 0
                  (i.e. to_tensor() must have been called to set sample_expansion_factor()
                  to something non-zero.)
@@ -982,7 +985,7 @@ namespace dlib
        );
        /*!
            requires
-                - LOSS_DETAILS is an unsupervised loss. i.e. label_type==no_label_type.
+                - LOSS_DETAILS is an unsupervised loss. i.e. training_label_type==no_label_type.
                - [ibegin, iend) is an iterator range over input_type objects.
                - std::distance(ibegin,iend) > 0
            ensures
diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h
index cf75adba7..93bf04425 100644
--- a/dlib/dnn/loss.h
+++ b/dlib/dnn/loss.h
@@ -21,7 +21,8 @@ namespace dlib
    {
    public:

-        typedef float label_type;
+        typedef float training_label_type;
+        typedef float output_label_type;

        template <
            typename SUB_TYPE,
@@ -128,7 +129,8 @@ namespace dlib
    {
    public:

-        typedef float label_type;
+        typedef float training_label_type;
+        typedef float output_label_type;

        template <
            typename SUB_TYPE,
@@ -244,7 +246,8 @@ namespace dlib
    {
    public:

-        typedef unsigned long label_type;
+        typedef unsigned long training_label_type;
+        typedef unsigned long output_label_type;

        template <
            typename SUB_TYPE,
@@ -468,7 +471,8 @@ namespace dlib

    public:

-        typedef std::vector<mmod_rect> label_type;
+        typedef std::vector<mmod_rect> training_label_type;
+        typedef std::vector<mmod_rect> output_label_type;

        loss_mmod_() {}

@@ -494,7 +498,7 @@ namespace dlib
            DLIB_CASSERT(sub.sample_expansion_factor() == 1, sub.sample_expansion_factor());

            std::vector<intermediate_detection> dets_accum;
-            label_type final_dets;
+            output_label_type final_dets;
            for (long i = 0; i < output_tensor.num_samples(); ++i)
            {
                tensor_to_dets(input_tensor, output_tensor, i, dets_accum, adjust_threshold, sub);
@@ -865,7 +869,7 @@ namespace dlib
    {
    public:

-        typedef unsigned long label_type;
+        typedef unsigned long training_label_type;

        template <
diff --git a/dlib/dnn/loss_abstract.h b/dlib/dnn/loss_abstract.h
index 6193e90e0..e8227c5f3 100644
--- a/dlib/dnn/loss_abstract.h
+++ b/dlib/dnn/loss_abstract.h
@@ -33,14 +33,16 @@ namespace dlib
                Finally, note that there are two broad flavors of loss layer, supervised
                and unsupervised. The EXAMPLE_LOSS_LAYER_ as shown here is a supervised
-                layer. To make an unsupervised loss you simply leave out the label_type
-                typedef, to_label(), and the truth iterator argument to
+                layer. To make an unsupervised loss you simply leave out the
+                training_label_type typedef and the truth iterator argument to
                compute_loss_value_and_gradient().
        !*/

    public:

-        typedef whatever_type_you_use_for_labels label_type;
+        // In most cases training_label_type and output_label_type will be the same type.
+        typedef whatever_type_you_use_for_training_labels training_label_type;
+        typedef whatever_type_you_use_for_output_labels output_label_type;

        EXAMPLE_LOSS_LAYER_ (
        );
@@ -77,9 +79,9 @@ namespace dlib
                - input_tensor.num_samples()%sub.sample_expansion_factor() == 0.
                - iter == an iterator pointing to the beginning of a range of
                  input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
-                  they must be label_type elements.
+                  they must be output_label_type elements.
            ensures
-                - Converts the output of the provided network to label_type objects and
+                - Converts the output of the provided network to output_label_type objects and
                  stores the results into the range indicated by iter. In particular, for
                  all valid i, it will be the case that:
                    *(iter+i/sub.sample_expansion_factor()) is populated based on the output of
@@ -108,7 +110,7 @@ namespace dlib
                  layer(sub).get_output().
                - truth == an iterator pointing to the beginning of a range of
                  input_tensor.num_samples()/sub.sample_expansion_factor() elements. Moreover,
-                  they must be label_type elements.
+                  they must be training_label_type elements.
                - for all valid i:
                    - *(truth+i/sub.sample_expansion_factor()) is the label of the ith sample in
                      input_tensor.
@@ -167,7 +169,8 @@ namespace dlib
        !*/

    public:
-        typedef float label_type;
+        typedef float training_label_type;
+        typedef float output_label_type;

        template <
            typename SUB_TYPE,
@@ -234,7 +237,8 @@ namespace dlib
        !*/

    public:
-        typedef float label_type;
+        typedef float training_label_type;
+        typedef float output_label_type;

        template <
            typename SUB_TYPE,
@@ -306,7 +310,8 @@ namespace dlib

    public:

-        typedef unsigned long label_type;
+        typedef unsigned long training_label_type;
+        typedef unsigned long output_label_type;

        template <
            typename SUB_TYPE,
@@ -443,7 +448,8 @@ namespace dlib

    public:

-        typedef std::vector<mmod_rect> label_type;
+        typedef std::vector<mmod_rect> training_label_type;
+        typedef std::vector<mmod_rect> output_label_type;

        loss_mmod_(
        );
diff --git a/dlib/dnn/trainer.h b/dlib/dnn/trainer.h
index 47617f65d..95410b2ea 100644
--- a/dlib/dnn/trainer.h
+++ b/dlib/dnn/trainer.h
@@ -30,20 +30,20 @@ namespace dlib
    namespace impl
    {
-        template <typename label_type>
+        template <typename training_label_type>
        struct dnn_job_t
        {
            dnn_job_t() = default;
            dnn_job_t(const dnn_job_t&) = delete;
            dnn_job_t& operator=(const dnn_job_t&) = delete;

-            std::vector<std::vector<label_type>> labels;
+            std::vector<std::vector<training_label_type>> labels;
            std::vector<resizable_tensor> t;
            std::vector<bool> have_data;  // have_data[i] is true if there is data in labels[i] and t[i].
        };

-        template <typename label_type>
-        void swap(dnn_job_t<label_type>& a, dnn_job_t<label_type>& b)
+        template <typename training_label_type>
+        void swap(dnn_job_t<training_label_type>& a, dnn_job_t<training_label_type>& b)
        {
            a.labels.swap(b.labels);
            a.t.swap(b.t);
@@ -63,12 +63,12 @@ namespace dlib
        static_assert(is_loss_layer_type<net_type>::value,
            "The last layer in a network must be a loss layer.");

-        typedef typename net_type::label_type label_type;
+        typedef typename net_type::training_label_type training_label_type;
        typedef typename net_type::input_type input_type;
        const static size_t num_computational_layers = net_type::num_computational_layers;
        const static size_t num_layers = net_type::num_layers;
    private:
-        typedef impl::dnn_job_t<label_type> job_t;
+        typedef impl::dnn_job_t<training_label_type> job_t;
    public:

        dnn_trainer() = delete;
@@ -184,7 +184,7 @@ namespace dlib

        void train_one_step (
            const std::vector<input_type>& data,
-            const std::vector<label_type>& labels
+            const std::vector<training_label_type>& labels
        )
        {
            DLIB_CASSERT(data.size() == labels.size());
@@ -261,7 +261,7 @@ namespace dlib

        void train (
            const std::vector<input_type>& data,
-            const std::vector<label_type>& labels
+            const std::vector<training_label_type>& labels
        )
        {
            DLIB_CASSERT(data.size() == labels.size() && data.size() > 0);
@@ -322,7 +322,7 @@ namespace dlib
        {
            DLIB_CASSERT(data.size() > 0);

-            const bool has_unsupervised_loss = std::is_same<no_label_type, label_type>::value;
+            const bool has_unsupervised_loss = std::is_same<no_label_type, training_label_type>::value;
            static_assert(has_unsupervised_loss,
                "You can only call this version of train() when using an unsupervised loss.");
@@ -562,7 +562,7 @@ namespace dlib

        void thread() try
        {
-            label_type pick_which_run_update;
+            training_label_type pick_which_run_update;
            job_t next_job;

            std::vector<dlib::future<double>> losses(devices.size());
@@ -591,7 +591,7 @@ namespace dlib
                ++main_iteration_counter;
                // Call compute_parameter_gradients() and update_parameters() but pick the
                // right version for unsupervised or supervised training based on the type
-                // of label_type.
+                // of training_label_type.
                for (size_t i = 0; i < devices.size(); ++i)
                    tp[i]->add_task_by_value([&,i](double& loss){ loss = compute_parameter_gradients(i, next_job, pick_which_run_update); }, losses[i]);

                // aggregate loss values from all the network computations.
@@ -988,7 +988,7 @@ namespace dlib
            data_iterator dend
        )
        {
-            typename std::vector<label_type>::iterator nothing;
+            typename std::vector<training_label_type>::iterator nothing;
            send_job(dbegin, dend, nothing);
        }

diff --git a/dlib/dnn/trainer_abstract.h b/dlib/dnn/trainer_abstract.h
index 5258a2339..a8920f6f6 100644
--- a/dlib/dnn/trainer_abstract.h
+++ b/dlib/dnn/trainer_abstract.h
@@ -47,7 +47,7 @@ namespace dlib

    public:

-        typedef typename net_type::label_type label_type;
+        typedef typename net_type::training_label_type training_label_type;
        typedef typename net_type::input_type input_type;

        const static size_t num_computational_layers = net_type::num_computational_layers;
@@ -341,14 +341,14 @@ namespace dlib

        void train (
            const std::vector<input_type>& data,
-            const std::vector<label_type>& labels
+            const std::vector<training_label_type>& labels
        );
        /*!
            requires
                - data.size() == labels.size()
                - data.size() > 0
                - net_type uses a supervised loss.
-                  i.e. net_type::label_type != no_label_type.
+                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Trains a supervised neural network based on the given training data.
                  The goal of training is to find the network parameters that minimize
@@ -374,7 +374,7 @@ namespace dlib
            requires
                - data.size() > 0
                - net_type uses an unsupervised loss.
-                  i.e. net_type::label_type == no_label_type.
+                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Trains an unsupervised neural network based on the given training data.
                  The goal of training is to find the network parameters that minimize
@@ -395,14 +395,14 @@ namespace dlib

        void train_one_step (
            const std::vector<input_type>& data,
-            const std::vector<label_type>& labels
+            const std::vector<training_label_type>& labels
        );
        /*!
            requires
                - data.size() == labels.size()
                - data.size() > 0
                - net_type uses a supervised loss.
-                  i.e. net_type::label_type != no_label_type.
+                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch of
                  data and labels supplied to this function. In particular, calling
@@ -433,7 +433,7 @@ namespace dlib
                - std::advance(lbegin, std::distance(dbegin, dend) - 1) is dereferencable
                - std::distance(dbegin, dend) > 0
                - net_type uses a supervised loss.
-                  i.e. net_type::label_type != no_label_type.
+                  i.e. net_type::training_label_type != no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch of
                  data and labels supplied to this function. In particular, calling
@@ -457,7 +457,7 @@ namespace dlib
            requires
                - data.size() > 0
                - net_type uses an unsupervised loss.
-                  i.e. net_type::label_type == no_label_type.
+                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch of
                  data supplied to this function. In particular, calling train_one_step()
@@ -485,7 +485,7 @@ namespace dlib
            requires
                - std::distance(dbegin, dend) > 0
                - net_type uses an unsupervised loss.
-                  i.e. net_type::label_type == no_label_type.
+                  i.e. net_type::training_label_type == no_label_type.
            ensures
                - Performs one stochastic gradient update step based on the mini-batch of
                  data supplied to this function. In particular, calling train_one_step()
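
Outside the patch itself, here is a minimal usage sketch of the split the change introduces: dnn_trainer::train() consumes net_type::training_label_type elements, while add_loss_layer::operator() returns net_type::output_label_type elements. The network layout, learning rate, and data handling below are illustrative assumptions only, not code taken from dlib's examples.

// Usage sketch (not part of the patch).  For loss_multiclass_log both
// training_label_type and output_label_type happen to be unsigned long.
#include <dlib/dnn.h>
#include <vector>
#include <iostream>

using namespace dlib;

// A small, made-up classifier network for illustration.
using net_type = loss_multiclass_log<fc<10, relu<fc<32, input<matrix<float>>>>>>;

int main()
{
    std::vector<matrix<float>> samples;   // assumed to be filled in elsewhere
    std::vector<unsigned long> labels;    // net_type::training_label_type elements

    net_type net;
    dnn_trainer<net_type> trainer(net);
    trainer.set_learning_rate(0.01);

    if (!samples.empty())
    {
        // train() takes training_label_type elements.
        trainer.train(samples, labels);

        // operator() returns a std::vector of output_label_type elements.
        std::vector<unsigned long> predictions = net(samples);
        std::cout << "first prediction: " << predictions[0] << "\n";
    }
}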