diff --git a/dlib/dnn/loss.h b/dlib/dnn/loss.h
index bb017d1dc..4e97f9609 100644
--- a/dlib/dnn/loss.h
+++ b/dlib/dnn/loss.h
@@ -3518,12 +3518,14 @@ namespace dlib
         double lambda_obj = 1.0;
         double lambda_box = 1.0;
         double lambda_cls = 1.0;
+        double gamma_obj = 0.0;
+        double gamma_cls = 0.0;
     };
 
     inline void serialize(const yolo_options& item, std::ostream& out)
     {
-        int version = 1;
+        int version = 2;
         serialize(version, out);
         serialize(item.anchors, out);
         serialize(item.labels, out);
@@ -3534,13 +3536,15 @@ namespace dlib
         serialize(item.lambda_obj, out);
         serialize(item.lambda_box, out);
         serialize(item.lambda_cls, out);
+        serialize(item.gamma_obj, out);
+        serialize(item.gamma_cls, out);
     }
 
     inline void deserialize(yolo_options& item, std::istream& in)
     {
         int version = 0;
         deserialize(version, in);
-        if (version != 1)
+        if (!(1 <= version && version <= 2))
             throw serialization_error("Unexpected version found while deserializing dlib::yolo_options.");
         deserialize(item.anchors, in);
         deserialize(item.labels, in);
@@ -3551,6 +3555,11 @@ namespace dlib
         deserialize(item.lambda_obj, in);
         deserialize(item.lambda_box, in);
         deserialize(item.lambda_cls, in);
+        if (version >= 2)
+        {
+            deserialize(item.gamma_obj, in);
+            deserialize(item.gamma_cls, in);
+        }
     }
 
     inline std::ostream& operator<<(std::ostream& out, const std::map<int, std::vector<yolo_options::anchor_box_details>>& anchors)
@@ -3655,19 +3664,21 @@ namespace dlib
             {
                 for (long c = 0; c < output_tensor.nc(); ++c)
                 {
-                    const float obj = out_data[tensor_index(output_tensor, n, k + 4, r, c)];
+                    const double obj = out_data[tensor_index(output_tensor, n, k + 4, r, c)];
                     if (obj > adjust_threshold)
                     {
-                        const auto x = out_data[tensor_index(output_tensor, n, k + 0, r, c)] * 2.0 - 0.5;
-                        const auto y = out_data[tensor_index(output_tensor, n, k + 1, r, c)] * 2.0 - 0.5;
-                        const auto w = out_data[tensor_index(output_tensor, n, k + 2, r, c)];
-                        const auto h = out_data[tensor_index(output_tensor, n, k + 3, r, c)];
+                        // Scaling and shifting the x and y coordinates avoids the grid sensitivity
+                        // effect by allowing the network to output centers along the grid boundaries.
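+                        // With a sigmoid output s in (0, 1), s * 2.0 - 0.5 spans (-0.5, 1.5), so predicted
+                        // centers can reach the borders of the neighboring cells. Likewise, w / (1 - w)
+                        // below maps a sigmoid output w = sigmoid(z) back to exp(z), an unbounded scale
+                        // factor for the anchor dimensions.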
+                        const double x = out_data[tensor_index(output_tensor, n, k + 0, r, c)] * 2.0 - 0.5;
+                        const double y = out_data[tensor_index(output_tensor, n, k + 1, r, c)] * 2.0 - 0.5;
+                        const double w = out_data[tensor_index(output_tensor, n, k + 2, r, c)];
+                        const double h = out_data[tensor_index(output_tensor, n, k + 3, r, c)];
                         yolo_rect det(centered_drect(dpoint((x + c) * stride_x, (y + r) * stride_y),
                                                      w / (1 - w) * anchors[a].width,
                                                      h / (1 - h) * anchors[a].height));
                         for (long i = 0; i < num_classes; ++i)
                         {
-                            const float conf = obj * out_data[tensor_index(output_tensor, n, k + 5 + i, r, c)];
+                            const double conf = obj * out_data[tensor_index(output_tensor, n, k + 5 + i, r, c)];
                             if (conf > adjust_threshold)
                                 det.labels.emplace_back(conf, options.labels[i]);
                         }
@@ -3719,10 +3730,10 @@ namespace dlib
                     for (size_t a = 0; a < anchors.size(); ++a)
                     {
                         const long k = a * num_feats;
-                        const auto x = out_data[tensor_index(output_tensor, n, k + 0, r, c)] * 2.0 - 0.5;
-                        const auto y = out_data[tensor_index(output_tensor, n, k + 1, r, c)] * 2.0 - 0.5;
-                        const auto w = out_data[tensor_index(output_tensor, n, k + 2, r, c)];
-                        const auto h = out_data[tensor_index(output_tensor, n, k + 3, r, c)];
+                        const double x = out_data[tensor_index(output_tensor, n, k + 0, r, c)] * 2.0 - 0.5;
+                        const double y = out_data[tensor_index(output_tensor, n, k + 1, r, c)] * 2.0 - 0.5;
+                        const double w = out_data[tensor_index(output_tensor, n, k + 2, r, c)];
+                        const double h = out_data[tensor_index(output_tensor, n, k + 3, r, c)];
 
                         // The prediction at r, c for anchor a
                         const yolo_rect pred(centered_drect(dpoint((x + c) * stride_x, (y + r) * stride_y),
@@ -3739,9 +3750,14 @@ namespace dlib
                         }
 
                         // Incur loss for the boxes that are below a certain IoU threshold with any truth box
-                        const auto o_idx = tensor_index(output_tensor, n, k + 4, r, c);
                         if (best_iou < options.iou_ignore_threshold)
-                            g[o_idx] = options.lambda_obj * out_data[o_idx];
+                        {
+                            const auto o_idx = tensor_index(output_tensor, n, k + 4, r, c);
+                            const double p = out_data[o_idx];
+                            const double focus = std::pow(p, options.gamma_obj);
+                            const double g_obj = focus * (p - options.gamma_obj * (1 - p) * safe_log(1 - p));
+                            g[o_idx] = options.lambda_obj * g_obj;
+                        }
                     }
                 }
             }
@@ -3755,6 +3771,7 @@ namespace dlib
                     double best_iou = 0;
                     size_t best_a = 0;
                     size_t best_tag_id = 0;
+                    running_stats<double> ious;
                     for (const auto& item : options.anchors)
                     {
                         const auto tag_id = item.first;
@@ -3768,22 +3785,27 @@ namespace dlib
                                 best_iou = iou;
                                 best_a = a;
                                 best_tag_id = tag_id;
                             }
+                            ious.add(iou);
                         }
                     }
 
+                    // ATSS: Adaptive Training Sample Selection, computed over the IoUs of all anchors
+                    double iou_anchor_threshold = options.iou_anchor_threshold;
+                    if (iou_anchor_threshold == 0)
+                        iou_anchor_threshold = ious.mean() + ious.stddev();
+
                     for (size_t a = 0; a < anchors.size(); ++a)
                     {
                         // Update best anchor if it's from the current stride, and optionally other anchors
-                        if ((best_tag_id == tag_id::id && best_a == a) || options.iou_anchor_threshold < 1)
+                        if ((best_tag_id == tag_id::id && best_a == a) || iou_anchor_threshold < 1)
                         {
                             // do not update other anchors if they have low IoU
                             if (!(best_tag_id == tag_id::id && best_a == a))
                             {
                                 const yolo_rect anchor(centered_drect(t_center, anchors[a].width, anchors[a].height));
-                                const double iou = box_intersection_over_union(truth_box.rect, anchor.rect);
-                                if (iou < options.iou_anchor_threshold)
+                                if (box_intersection_over_union(truth_box.rect, anchor.rect) < iou_anchor_threshold)
                                     continue;
                             }
@@ -3800,34 +3822,48 @@ namespace dlib
 
                             // Scale regression error according to the truth size
                             const double scale_box =
                                 options.lambda_box * (2 - truth_box.rect.area() / input_rect.area());
 
-                            // Compute the gradient for the box coordinates
+                            // Compute the smoothed L1 gradient for the box coordinates
                             const auto x_idx = tensor_index(output_tensor, n, k + 0, r, c);
                             const auto y_idx = tensor_index(output_tensor, n, k + 1, r, c);
                             const auto w_idx = tensor_index(output_tensor, n, k + 2, r, c);
                             const auto h_idx = tensor_index(output_tensor, n, k + 3, r, c);
-                            g[x_idx] = scale_box * (out_data[x_idx] * 2.0 - 0.5 - tx);
-                            g[y_idx] = scale_box * (out_data[y_idx] * 2.0 - 0.5 - ty);
-                            g[w_idx] = scale_box * (out_data[w_idx] - tw);
-                            g[h_idx] = scale_box * (out_data[h_idx] - th);
+                            g[x_idx] = scale_box * put_in_range(-1.0, 1.0, out_data[x_idx] * 2.0 - 0.5 - tx);
+                            g[y_idx] = scale_box * put_in_range(-1.0, 1.0, out_data[y_idx] * 2.0 - 0.5 - ty);
+                            g[w_idx] = scale_box * put_in_range(-1.0, 1.0, out_data[w_idx] - tw);
+                            g[h_idx] = scale_box * put_in_range(-1.0, 1.0, out_data[h_idx] - th);
 
                             // This grid cell should detect an object
                             const auto o_idx = tensor_index(output_tensor, n, k + 4, r, c);
-                            g[o_idx] = options.lambda_obj * (out_data[o_idx] - 1);
+                            {
+                                const double p = out_data[o_idx];
+                                const double focus = std::pow(1 - p, options.gamma_obj);
+                                const double g_obj = focus * (options.gamma_obj * p * safe_log(p) + p - 1);
+                                g[o_idx] = options.lambda_obj * g_obj;
+                            }
 
-                            // Compute the classification error
+                            // Compute the classification error using the truth weights and the focal loss
                             for (long i = 0; i < num_classes; ++i)
                             {
                                 const auto c_idx = tensor_index(output_tensor, n, k + 5 + i, r, c);
+                                const double p = out_data[c_idx];
                                 if (truth_box.label == options.labels[i])
-                                    g[c_idx] = options.lambda_cls * (out_data[c_idx] - 1);
+                                {
+                                    const double focus = std::pow(1 - p, options.gamma_cls);
+                                    const double g_cls = focus * (options.gamma_cls * p * safe_log(p) + p - 1);
+                                    g[c_idx] = truth_box.detection_confidence * options.lambda_cls * g_cls;
+                                }
                                 else
-                                    g[c_idx] = options.lambda_cls * out_data[c_idx];
+                                {
+                                    const double focus = std::pow(p, options.gamma_cls);
+                                    const double g_cls = focus * (p - options.gamma_cls * (1 - p) * safe_log(1 - p));
+                                    g[c_idx] = options.lambda_cls * g_cls;
+                                }
                             }
                         }
                     }
                 }
 
-                // Compute the L2 loss
+                // The loss is the squared norm of the gradient
                 loss += length_squared(rowm(mat(grad), n));
             }
         };
@@ -3956,6 +3992,8 @@ namespace dlib
         out << ", lambda_obj:" << opts.lambda_obj;
         out << ", lambda_box:" << opts.lambda_box;
         out << ", lambda_cls:" << opts.lambda_cls;
+        out << ", gamma_obj:" << opts.gamma_obj;
+        out << ", gamma_cls:" << opts.gamma_cls;
         out << ", overlaps_nms:(" << opts.overlaps_nms.get_iou_thresh() << "," << opts.overlaps_nms.get_percent_covered_thresh() << ")";
         out << ", classwise_nms:" << std::boolalpha << opts.classwise_nms;
         out << ")";
diff --git a/dlib/dnn/loss_abstract.h b/dlib/dnn/loss_abstract.h
index 9717f5d8b..cb3a2f145 100644
--- a/dlib/dnn/loss_abstract.h
+++ b/dlib/dnn/loss_abstract.h
@@ -1928,21 +1928,35 @@ namespace dlib
         // When computing the YOLO loss (objectness + bounding box regression + classification),
         // the best match between a truth and an anchor is always used, regardless of the IoU.
         // However, if other anchors have an IoU with a truth box above iou_anchor_threshold, they
-        // will also experience loss against that truth box as well. Setting iou_anchor_threshold to 1 will
-        // make the model use only the best anchor for each ground truth, so other anchors can be
-        // used for other ground truth boxes in the same cell (useful for detecting objects in crowds).
-        // This setting is meant to be used with "high capacity" models, not small ones.
+        // will also experience loss against that truth box. Setting iou_anchor_threshold
+        // to 1 will make the model use only the best anchor for each ground truth, so other
+        // anchors can be used for other ground truth boxes in the same cell (useful for
+        // detecting objects in crowds). This setting is meant to be used with "high capacity"
+        // models, not small ones. Additionally, when this value is set to 0, the IoU
+        // threshold is computed adaptively from the statistics (mean plus standard deviation)
+        // of the IoUs between all anchors and the current truth box. In particular, it
+        // follows the adaptive training sample selection (ATSS) scheme from the paper:
+        // "Bridging the Gap Between Anchor-based and Anchor-free Detection via Adaptive
+        // Training Sample Selection" by Shifeng Zhang, et al. (https://arxiv.org/abs/1912.02424)
         double iou_anchor_threshold = 1.0;
         // When doing non-max suppression, we use overlaps_nms to decide if a box overlaps
         // an already output detection and should therefore be thrown out.
         test_box_overlap overlaps_nms = test_box_overlap(0.45, 1.0);
         // When set to true, NMS will only be applied between objects with the same class label.
         bool classwise_nms = true;
-        // These parameters control how we penalize different kinds of mistakes: notably the objectness loss,
-        // the box (bounding box regression) loss, and the classification loss.
+        // These parameters control how we penalize different kinds of mistakes: notably the
+        // objectness loss, the box (bounding box regression) loss, and the classification loss.
         double lambda_obj = 1.0;
         double lambda_box = 1.0;
         double lambda_cls = 1.0;
+        // These parameters make the YOLO loss behave like the Focal loss, presented in the
+        // paper: "Focal Loss for Dense Object Detection" by Tsung-Yi Lin, et al.
+        // (https://arxiv.org/abs/1708.02002)
+        // gamma_obj and gamma_cls act as modulating factors in the cross-entropy terms for
+        // objectness and classification, reducing the relative loss for well-classified
+        // examples and focusing on the difficult ones.
+        double gamma_obj = 0.0;
+        double gamma_cls = 0.0;
     };
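
Note: the focal objectness and classification gradients above can be sanity-checked against the
derivative given in the appendix of "Focal Loss for Dense Object Detection". Below is a minimal
standalone sketch of those formulas, not dlib code: it uses std::log where the patch uses dlib's
safe_log, takes p as the sigmoid output, and differentiates with respect to the pre-sigmoid logit.

    #include <cmath>
    #include <cstdio>

    // d/dz of -(1 - p)^gamma * log(p) with p = sigmoid(z): the target = 1 case.
    double focal_grad_pos(double p, double gamma)
    {
        return std::pow(1 - p, gamma) * (gamma * p * std::log(p) + p - 1);
    }

    // d/dz of -p^gamma * log(1 - p) with p = sigmoid(z): the target = 0 case.
    double focal_grad_neg(double p, double gamma)
    {
        return std::pow(p, gamma) * (p - gamma * (1 - p) * std::log(1 - p));
    }

    int main()
    {
        // gamma = 0 reduces to the plain gradients used before this patch: p - 1 and p.
        std::printf("%.3f %.3f\n", focal_grad_pos(0.3, 0), focal_grad_neg(0.3, 0));  // -0.700 0.300
        // gamma = 2 down-weights easy examples: a well-classified positive (p = 0.9)
        // contributes a much smaller gradient than a hard one (p = 0.1).
        std::printf("%.3f %.3f\n", focal_grad_pos(0.9, 2), focal_grad_pos(0.1, 2));  // -0.003 -1.102
    }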
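
The put_in_range(-1.0, 1.0, ...) clipping on the box residuals corresponds to the gradient of a
smooth L1 (Huber, delta = 1) loss, which is what the updated comment refers to. A small sketch,
with std::clamp standing in for dlib's put_in_range:

    #include <algorithm>
    #include <cstdio>

    // Gradient of the smooth L1 / Huber loss with delta = 1: the residual clipped to [-1, 1].
    double smooth_l1_grad(double residual)
    {
        return std::clamp(residual, -1.0, 1.0);
    }

    int main()
    {
        // Small residuals keep an L2-style linear gradient, large ones an L1-style constant one.
        std::printf("%.1f %.1f %.1f\n", smooth_l1_grad(0.3), smooth_l1_grad(2.5), smooth_l1_grad(-4.0));  // 0.3 1.0 -1.0
    }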
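
Finally, a sketch of the adaptive threshold rule that kicks in when iou_anchor_threshold is 0:
following ATSS, the threshold is the mean plus one standard deviation of the IoUs between the
truth box and every candidate anchor, mirroring the ious.mean() + ious.stddev() computation in
the patch. The IoU values here are made up for illustration.

    #include <dlib/statistics.h>
    #include <iostream>

    int main()
    {
        dlib::running_stats<double> ious;
        // IoUs between one truth box and all anchors across strides (illustrative values).
        for (const double iou : {0.05, 0.10, 0.15, 0.55, 0.60, 0.70})
            ious.add(iou);
        // Anchors at or above mean + stddev are treated as positive samples; the
        // best-matching anchor is always kept regardless.
        std::cout << "adaptive IoU threshold: " << ious.mean() + ious.stddev() << '\n';
    }

With the new defaults (gamma_obj = gamma_cls = 0 and iou_anchor_threshold = 1), the objectness and
classification gradients match the previous behavior; only the box gradients change, since they
are now always clipped.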