Make dnn_trainer use a robust statistic to determine if the loss is exploding and if it should backtrack.

Previously we used only the non-robust version, and so would mistakenly
fail to catch sequences of loss increases that begin with an extremely large
value and then settle down to still large but less extreme values.
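
To make the failure mode concrete, here is a minimal sketch (not part of the commit) of the kind of check the trainer now performs. It assumes dlib is available and uses probability_values_are_increasing() and probability_values_are_increasing_robust() from <dlib/statistics/running_gradient.h>, the same functions that appear in the diff below; the loss values and the 0.99 threshold are made up for illustration and are not necessarily the trainer's defaults.

// Sketch: why the robust estimate catches this case (all values illustrative).
#include <algorithm>
#include <deque>
#include <iostream>
#include <dlib/statistics/running_gradient.h>

int main()
{
    // A loss trace that starts with one extreme spike and then settles to values
    // that are still much larger than a healthy loss.  The huge first value drags
    // the ordinary least-squares trend downward, so the non-robust estimate can
    // report that the loss is not increasing even though training has blown up.
    const std::deque<double> losses = {1e9, 50, 60, 55, 70, 65, 80, 75, 90, 85};

    const double prob1 = dlib::probability_values_are_increasing(losses);
    const double prob2 = dlib::probability_values_are_increasing_robust(losses);

    std::cout << "non-robust P(increasing) = " << prob1 << "\n";
    std::cout << "robust     P(increasing) = " << prob2 << "\n";

    // Trainer-style decision: backtrack if either estimate crosses the threshold.
    const double prob_loss_increasing_thresh = 0.99;  // illustrative threshold
    if (std::max(prob1, prob2) > prob_loss_increasing_thresh)
        std::cout << "loss appears to be exploding -> reload the last synced state\n";
}

With a trace like this, the non-robust estimate is pulled toward "decreasing" by the initial spike, while the robust one ignores the outlier and flags the still-rising tail, which is exactly the situation described above; taking the max of both keeps the old behaviour while adding the robust check.
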
Davis King 2020-09-02 21:48:30 -04:00
parent 0bb6ce36d8
commit ed22f0400a


@@ -1108,29 +1108,21 @@ namespace dlib
 while (previous_loss_values_to_keep_until_disk_sync.size() > 2 * gradient_updates_since_last_sync)
     previous_loss_values_to_keep_until_disk_sync.pop_front();

-running_gradient g;
+// Always retry if there are any nan values
 for (auto x : previous_loss_values_to_keep_until_disk_sync)
 {
     // If we get a NaN value of loss assume things have gone horribly wrong and
     // we should reload the state of the trainer.
     if (std::isnan(x))
         return true;
-    g.add(x);
 }

 // if we haven't seen much data yet then just say false.
 if (gradient_updates_since_last_sync < 30)
     return false;

-// if learning rate was changed from outside during training, for example
-if (g.current_n() <= 2)
-    return false;
-
 // if the loss is very likely to be increasing then return true
-const double prob = g.probability_gradient_greater_than(0);
-if (prob > prob_loss_increasing_thresh)
+const double prob1 = probability_values_are_increasing(previous_loss_values_to_keep_until_disk_sync);
+const double prob2 = probability_values_are_increasing_robust(previous_loss_values_to_keep_until_disk_sync);
+if (std::max(prob1, prob2) > prob_loss_increasing_thresh)
 {
     // Exponentially decay the threshold towards 1 so that if we keep finding
     // the loss to be increasing over and over we will make the test