diff --git a/examples/optimization_ex.cpp b/examples/optimization_ex.cpp
index e55c359bd..fdcc163a8 100644
--- a/examples/optimization_ex.cpp
+++ b/examples/optimization_ex.cpp
@@ -113,213 +113,208 @@ public:
 // ----------------------------------------------------------------------------------------
 
-int main()
+int main() try
 {
-    try
+    // Set the starting point to (4,8).  This is the point the optimization algorithm
+    // will start out from and it will move it closer and closer to the function's
+    // minimum point.  So generally you want to try to compute a good guess that is
+    // somewhat near the actual optimum value.
+    column_vector starting_point = {4, 8};
+
+    // The first example below finds the minimum of the rosen() function and uses the
+    // analytical derivative computed by rosen_derivative().  Since it is very easy to
+    // make a mistake while coding a function like rosen_derivative() it is a good idea
+    // to compare your derivative function against a numerical approximation and see if
+    // the results are similar.  If they are very different then you probably made a
+    // mistake.  So the first thing we do is compare the results at a test point:
+    cout << "Difference between analytic derivative and numerical approximation of derivative: "
+         << length(derivative(rosen)(starting_point) - rosen_derivative(starting_point)) << endl;
+
+
+    cout << "Find the minimum of the rosen() function" << endl;
+    // Now we use the find_min() function to find the minimum point.  The first argument
+    // to this routine is the search strategy we want to use.  The second argument is the
+    // stopping strategy.  Below I'm using the objective_delta_stop_strategy, which just
+    // says that the search should stop when the change in the function being optimized
+    // is small enough.
+
+    // The other arguments to find_min() are the function to be minimized, its derivative,
+    // then the starting point, and the last is an acceptable minimum value of the rosen()
+    // function.  That is, if the algorithm finds any input to rosen() that gives an output
+    // value <= -1 then it will stop immediately.  Usually you supply a number smaller than
+    // the actual global minimum.  So since the smallest output of the rosen function is 0
+    // we just put -1 here, which effectively causes this last argument to be disregarded.
+
+    find_min(bfgs_search_strategy(),  // Use BFGS search algorithm
+             objective_delta_stop_strategy(1e-7), // Stop when the change in rosen() is less than 1e-7
+             rosen, rosen_derivative, starting_point, -1);
+    // Once the function ends the starting_point vector will contain the optimum point
+    // of (1,1).
+    cout << "rosen solution:\n" << starting_point << endl;
+
+
+    // Now let's try doing it again with a different starting point and the version
+    // of find_min() that doesn't require you to supply a derivative function.
+    // This version will compute a numerical approximation of the derivative since
+    // we didn't supply one to it.
+    starting_point = {-94, 5.2};
+    find_min_using_approximate_derivatives(bfgs_search_strategy(),
+                                           objective_delta_stop_strategy(1e-7),
+                                           rosen, starting_point, -1);
+    // Again the correct minimum point is found and stored in starting_point.
+    cout << "rosen solution:\n" << starting_point << endl;
+
+
+    // Here we repeat the same thing as above but this time using the L-BFGS
+    // algorithm.  L-BFGS is very similar to the BFGS algorithm; however, BFGS
+    // uses O(N^2) memory where N is the size of the starting_point vector.
+    // The L-BFGS algorithm, however, uses only O(N) memory.  So if you have a
+    // function of a huge number of variables the L-BFGS algorithm is probably
+    // a better choice.
+    starting_point = {0.8, 1.3};
+    find_min(lbfgs_search_strategy(10),  // The 10 here is basically a measure of how much memory L-BFGS will use.
+             objective_delta_stop_strategy(1e-7).be_verbose(),  // Adding be_verbose() causes a message to be
+                                                                // printed for each iteration of optimization.
+             rosen, rosen_derivative, starting_point, -1);
+
+    cout << endl << "rosen solution: \n" << starting_point << endl;
+
+    starting_point = {-94, 5.2};
+    find_min_using_approximate_derivatives(lbfgs_search_strategy(10),
+                                           objective_delta_stop_strategy(1e-7),
+                                           rosen, starting_point, -1);
+    cout << "rosen solution: \n" << starting_point << endl;
+
+
+
+    // dlib also supports solving functions subject to bounds constraints on
+    // the variables.  So for example, if you wanted to find the minimizer
+    // of the rosen function where both input variables were in the range
+    // 0.1 to 0.8 you would do it like this:
+    starting_point = {0.1, 0.1}; // Start with a valid point inside the constraint box.
+    find_min_box_constrained(lbfgs_search_strategy(10),
+                             objective_delta_stop_strategy(1e-9),
+                             rosen, rosen_derivative, starting_point, 0.1, 0.8);
+    // Here we put the same [0.1 0.8] range constraint on each variable; however, you
+    // can put different bounds on each variable by passing in column vectors of
+    // constraints for the last two arguments rather than scalars.
+
+    cout << endl << "constrained rosen solution: \n" << starting_point << endl;
+
+    // You can also use an approximate derivative like so:
+    starting_point = {0.1, 0.1};
+    find_min_box_constrained(bfgs_search_strategy(),
+                             objective_delta_stop_strategy(1e-9),
+                             rosen, derivative(rosen), starting_point, 0.1, 0.8);
+    cout << endl << "constrained rosen solution: \n" << starting_point << endl;
+
+
+
+    // In many cases, it is useful if we also provide second derivative information
+    // to the optimizers.  Two examples of how we can do that are shown below.
+    starting_point = {0.8, 1.3};
+    find_min(newton_search_strategy(rosen_hessian),
+             objective_delta_stop_strategy(1e-7),
+             rosen,
+             rosen_derivative,
+             starting_point,
+             -1);
+    cout << "rosen solution: \n" << starting_point << endl;
+
+    // We can also use find_min_trust_region(), which is another method that uses
+    // second derivatives.  For some kinds of non-convex functions it may be more
+    // reliable than using a newton_search_strategy with find_min().
+    starting_point = {0.8, 1.3};
+    find_min_trust_region(objective_delta_stop_strategy(1e-7),
+                          rosen_model(),
+                          starting_point,
+                          10 // initial trust region radius
+                          );
+    cout << "rosen solution: \n" << starting_point << endl;
+
+
+
+
+    // Next, let's try the BOBYQA algorithm.  This is a technique specially
+    // designed to minimize a function in the absence of derivative information.
+    // Generally speaking, it is the method of choice if derivatives are not available
+    // and the function you are optimizing is smooth and has only one local optimum.
As + // an example, consider the be_like_target function defined below: + column_vector target = {3, 5, 1, 7}; + auto be_like_target = [&](const column_vector& x) { + return mean(squared(x-target)); + }; + starting_point = {-4,5,99,3}; + find_min_bobyqa(be_like_target, + starting_point, + 9, // number of interpolation points + uniform_matrix(4,1, -1e100), // lower bound constraint + uniform_matrix(4,1, 1e100), // upper bound constraint + 10, // initial trust region radius + 1e-6, // stopping trust region radius + 100 // max number of objective function evaluations + ); + cout << "be_like_target solution:\n" << starting_point << endl; + + + + + + // Finally, let's try the find_max_global() routine. Like + // find_max_bobyqa(), this is a technique specially designed to maximize + // a function in the absence of derivative information. However, it is + // also designed to handle functions with many local optima. Where + // BOBYQA would get stuck at the nearest local optima, find_max_global() + // won't. find_max_global() uses a global optimization method based on a + // combination of non-parametric global function modeling and BOBYQA + // style quadratic trust region modeling to efficiently find a global + // maximizer. It usually does a good job with a relatively small number + // of calls to the function being optimized. + // + // You also don't have to give it a starting point or set any parameters, + // other than defining the bounds constraints. This makes it the method + // of choice for derivative free optimization in the presence of local + // optima. Its API also allows you to define functions that take a + // column_vector as shown above or to explicitly use named doubles as + // arguments, which we do here. + auto complex_holder_table = [](double x0, double x1) { + // This function is a version of the well known Holder table test + // function, which is a function containing a bunch of local optima. + // Here we make it even more difficult by adding more local optima + // and also a bunch of discontinuities. - - // Set the starting point to (4,8). This is the point the optimization algorithm - // will start out from and it will move it closer and closer to the function's - // minimum point. So generally you want to try and compute a good guess that is - // somewhat near the actual optimum value. - column_vector starting_point = {4, 8}; - - // The first example below finds the minimum of the rosen() function and uses the - // analytical derivative computed by rosen_derivative(). Since it is very easy to - // make a mistake while coding a function like rosen_derivative() it is a good idea - // to compare your derivative function against a numerical approximation and see if - // the results are similar. If they are very different then you probably made a - // mistake. So the first thing we do is compare the results at a test point: - cout << "Difference between analytic derivative and numerical approximation of derivative: " - << length(derivative(rosen)(starting_point) - rosen_derivative(starting_point)) << endl; - - - cout << "Find the minimum of the rosen function()" << endl; - // Now we use the find_min() function to find the minimum point. The first argument - // to this routine is the search strategy we want to use. The second argument is the - // stopping strategy. Below I'm using the objective_delta_stop_strategy which just - // says that the search should stop when the change in the function being optimized - // is small enough. 
- - // The other arguments to find_min() are the function to be minimized, its derivative, - // then the starting point, and the last is an acceptable minimum value of the rosen() - // function. That is, if the algorithm finds any inputs to rosen() that gives an output - // value <= -1 then it will stop immediately. Usually you supply a number smaller than - // the actual global minimum. So since the smallest output of the rosen function is 0 - // we just put -1 here which effectively causes this last argument to be disregarded. - - find_min(bfgs_search_strategy(), // Use BFGS search algorithm - objective_delta_stop_strategy(1e-7), // Stop when the change in rosen() is less than 1e-7 - rosen, rosen_derivative, starting_point, -1); - // Once the function ends the starting_point vector will contain the optimum point - // of (1,1). - cout << "rosen solution:\n" << starting_point << endl; - - - // Now let's try doing it again with a different starting point and the version - // of find_min() that doesn't require you to supply a derivative function. - // This version will compute a numerical approximation of the derivative since - // we didn't supply one to it. - starting_point = {-94, 5.2}; - find_min_using_approximate_derivatives(bfgs_search_strategy(), - objective_delta_stop_strategy(1e-7), - rosen, starting_point, -1); - // Again the correct minimum point is found and stored in starting_point - cout << "rosen solution:\n" << starting_point << endl; - - - // Here we repeat the same thing as above but this time using the L-BFGS - // algorithm. L-BFGS is very similar to the BFGS algorithm, however, BFGS - // uses O(N^2) memory where N is the size of the starting_point vector. - // The L-BFGS algorithm however uses only O(N) memory. So if you have a - // function of a huge number of variables the L-BFGS algorithm is probably - // a better choice. - starting_point = {0.8, 1.3}; - find_min(lbfgs_search_strategy(10), // The 10 here is basically a measure of how much memory L-BFGS will use. - objective_delta_stop_strategy(1e-7).be_verbose(), // Adding be_verbose() causes a message to be - // printed for each iteration of optimization. - rosen, rosen_derivative, starting_point, -1); - - cout << endl << "rosen solution: \n" << starting_point << endl; - - starting_point = {-94, 5.2}; - find_min_using_approximate_derivatives(lbfgs_search_strategy(10), - objective_delta_stop_strategy(1e-7), - rosen, starting_point, -1); - cout << "rosen solution: \n"<< starting_point << endl; - - - - - // dlib also supports solving functions subject to bounds constraints on - // the variables. So for example, if you wanted to find the minimizer - // of the rosen function where both input variables were in the range - // 0.1 to 0.8 you would do it like this: - starting_point = {0.1, 0.1}; // Start with a valid point inside the constraint box. - find_min_box_constrained(lbfgs_search_strategy(10), - objective_delta_stop_strategy(1e-9), - rosen, rosen_derivative, starting_point, 0.1, 0.8); - // Here we put the same [0.1 0.8] range constraint on each variable, however, you - // can put different bounds on each variable by passing in column vectors of - // constraints for the last two arguments rather than scalars. 
- - cout << endl << "constrained rosen solution: \n" << starting_point << endl; - - // You can also use an approximate derivative like so: - starting_point = {0.1, 0.1}; - find_min_box_constrained(bfgs_search_strategy(), - objective_delta_stop_strategy(1e-9), - rosen, derivative(rosen), starting_point, 0.1, 0.8); - cout << endl << "constrained rosen solution: \n" << starting_point << endl; - - - - - // In many cases, it is useful if we also provide second derivative information - // to the optimizers. Two examples of how we can do that are shown below. - starting_point = {0.8, 1.3}; - find_min(newton_search_strategy(rosen_hessian), - objective_delta_stop_strategy(1e-7), - rosen, - rosen_derivative, - starting_point, - -1); - cout << "rosen solution: \n"<< starting_point << endl; - - // We can also use find_min_trust_region(), which is also a method which uses - // second derivatives. For some kinds of non-convex function it may be more - // reliable than using a newton_search_strategy with find_min(). - starting_point = {0.8, 1.3}; - find_min_trust_region(objective_delta_stop_strategy(1e-7), - rosen_model(), - starting_point, - 10 // initial trust region radius - ); - cout << "rosen solution: \n"<< starting_point << endl; - - - - - - // Next, let's try the BOBYQA algorithm. This is a technique specially - // designed to minimize a function in the absence of derivative information. - // Generally speaking, it is the method of choice if derivatives are not available - // and the function you are optimizing is smooth and has only one local optima. As - // an example, consider the be_like_target function defined below: - column_vector target = {3, 5, 1, 7}; - auto be_like_target = [&](const column_vector& x) { - return mean(squared(x-target)); - }; - starting_point = {-4,5,99,3}; - find_min_bobyqa(be_like_target, - starting_point, - 9, // number of interpolation points - uniform_matrix(4,1, -1e100), // lower bound constraint - uniform_matrix(4,1, 1e100), // upper bound constraint - 10, // initial trust region radius - 1e-6, // stopping trust region radius - 100 // max number of objective function evaluations - ); - cout << "be_like_target solution:\n" << starting_point << endl; - - - - - - // Finally, let's try the find_max_global() routine. Like - // find_max_bobyqa(), this is a technique specially designed to maximize - // a function in the absence of derivative information. However, it is - // also designed to handle functions with many local optima. Where - // BOBYQA would get stuck at the nearest local optima, find_max_global() - // won't. find_max_global() uses a global optimization method based on a - // combination of non-parametric global function modeling and BOBYQA - // style quadratic trust region modeling to efficiently find a global - // maximizer. It usually does a good job with a relatively small number - // of calls to the function being optimized. - // - // You also don't have to give it a starting point or set any parameters, - // other than defining the bounds constraints. This makes it the method - // of choice for derivative free optimization in the presence of local - // optima. Its API also allows you to define functions that take a - // column_vector as shown above or to explicitly use named doubles as - // arguments, which we do here. 
- auto complex_holder_table = [](double x0, double x1) + // add discontinuities + double sign = 1; + for (double j = -4; j < 9; j += 0.5) { - // This function is a version of the well known Holder table test - // function, which is a function containing a bunch of local optima. - // Here we make it even more difficult by adding more local optima - // and also a bunch of discontinuities. + if (j < x0 && x0 < j+0.5) + x0 += sign*0.25; + sign *= -1; + } + // Holder table function tilted towards 10,10 and with additional + // high frequency terms to add more local optima. + return std::abs(sin(x0)*cos(x1)*exp(std::abs(1-std::sqrt(x0*x0+x1*x1)/pi))) -(x0+x1)/10 - sin(x0*10)*cos(x1*10); + }; - // add discontinuities - double sign = 1; - for (double j = -4; j < 9; j += 0.5) - { - if (j < x0 && x0 < j+0.5) - x0 += sign*0.25; - sign *= -1; - } - // Holder table function tilted towards 10,10 and with additional - // high frequency terms to add more local optima. - return std::abs(sin(x0)*cos(x1)*exp(std::abs(1-std::sqrt(x0*x0+x1*x1)/pi))) -(x0+x1)/10 - sin(x0*10)*cos(x1*10); - }; + // To optimize this difficult function all we need to do is call + // find_max_global() + auto result = find_max_global(complex_holder_table, + {-10,-10}, // lower bounds + {10,10}, // upper bounds + max_function_calls(300)); - // To optimize this difficult function all we need to do is call - // find_max_global() - auto result = find_max_global(complex_holder_table, - {-10,-10}, // lower bounds - {10,10}, // upper bounds - max_function_calls(300)); - - cout.precision(9); - // These cout statements will show that find_max_global() found the - // globally optimal solution to 9 digits of precision: - cout << "complex holder table function solution y (should be 21.9210397): " << result.y << endl; - cout << "complex holder table function solution x:\n" << result.x << endl; - } - catch (std::exception& e) - { - cout << e.what() << endl; - } + cout.precision(9); + // These cout statements will show that find_max_global() found the + // globally optimal solution to 9 digits of precision: + cout << "complex holder table function solution y (should be 21.9210397): " << result.y << endl; + cout << "complex holder table function solution x:\n" << result.x << endl; +} +catch (std::exception& e) +{ + cout << e.what() << endl; }
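
For readers who want to try these routines without the rest of the example file, below is a minimal, self-contained sketch using the same dlib calls on a simple convex quadratic. The header names, the dlib:: qualifications, the column_vector typedef, and the use of find_min_global() are assumptions based on standard dlib usage; they are not shown in this hunk, and the real example file defines its own column_vector and using-declarations elsewhere.

// Standalone sketch (not part of the patch) exercising the optimization API
// used in the example above.  Assumes dlib is installed and linked.
#include <dlib/optimization.h>
#include <dlib/global_optimization.h>
#include <iostream>

// Same typedef style as the dlib examples: a dynamically sized column vector.
typedef dlib::matrix<double,0,1> column_vector;

int main()
{
    // f(x) = (x0-2)^2 + (x1+3)^2, minimized at (2,-3).
    auto f = [](const column_vector& x) {
        return (x(0)-2)*(x(0)-2) + (x(1)+3)*(x(1)+3);
    };

    column_vector x = {0, 0};
    // No analytic derivative is supplied, so dlib approximates it numerically,
    // just like the find_min_using_approximate_derivatives() calls in the diff.
    dlib::find_min_using_approximate_derivatives(
        dlib::bfgs_search_strategy(),
        dlib::objective_delta_stop_strategy(1e-7),
        f, x, -1);
    std::cout << "minimum found at:\n" << x << std::endl;

    // The derivative-free global optimizer can take named doubles directly,
    // mirroring the find_max_global() usage at the end of the patch.
    auto result = dlib::find_min_global(
        [](double x0, double x1) { return (x0-2)*(x0-2) + (x1+3)*(x1+3); },
        {-10, -10},   // lower bounds
        { 10,  10},   // upper bounds
        dlib::max_function_calls(100));
    std::cout << "global minimum value: " << result.y << std::endl;
    std::cout << "global minimizer:\n" << result.x << std::endl;
    return 0;
}

Both calls should report a point close to (2,-3); the global optimizer needs no starting point or derivative, only the bounds and a budget of function evaluations.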