add support for cudnn 8.0

2024-11-01 10:14:53 +08:00 · 2020-06-20 09:43:17 -04:00 · 2020-06-20 09:43:17 -04:00 · fe803b566f
commit fe803b566f
parent f8887d8cbb
1 changed files with 85 additions and 0 deletions
--- a/dlib/cuda/cudnn_dlibapi.cpp
+++ b/dlib/cuda/cudnn_dlibapi.cpp
@ -751,6 +751,31 @@ namespace dlib
            backward_filters_workspace.reset();
        }

+        // Given a non-empty array of cudnn algorithm performance results, like
+        // cudnnConvolutionFwdAlgoPerf_t, pick the best one to use.
+        template <typename T>
+        decltype(std::declval<T>().algo) pick_best_algorithm(const std::vector<T> &perf_results)
+        {
+            DLIB_CASSERT(!perf_results.empty());
+            CHECK_CUDNN(perf_results[0].status);  // the first entry must itself be usable
+            if (dnn_prefer_fastest_algorithms())
+                return perf_results[0].algo;  // NOTE(review): assumes entry 0 is the fastest -- confirm
+
+            // Otherwise we find the algorithm that has a good status and uses the least amount
+            // of memory.  Seeding with entry 0 means best_alg is never returned uninitialized.
+            auto best_alg = perf_results[0].algo;
+            size_t best_memory = perf_results[0].memory;
+            for (const auto& perf : perf_results)
+            {
+                if (perf.status == CUDNN_STATUS_SUCCESS && perf.memory < best_memory)
+                {
+                    best_memory = perf.memory;
+                    best_alg = perf.algo;
+                }
+            }
+            return best_alg;
+        }
+
        void tensor_conv::
        setup(
            const tensor& data,
@ -841,6 +866,25 @@ namespace dlib
                // Pick which forward algorithm we will use and allocate the necessary
                // workspace buffer.
                cudnnConvolutionFwdAlgo_t forward_best_algo;
+#if CUDNN_MAJOR >= 8
+                {   // cuDNN >= 8: gather forward algorithm candidates, then choose among them.
+                int max_candidates = 0;
+                CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithmMaxCount(context(), &max_candidates));
+                std::vector<cudnnConvolutionFwdAlgoPerf_t> candidates(max_candidates);
+                int num_returned = 0;
+                CHECK_CUDNN(cudnnFindConvolutionForwardAlgorithm(
+                        context(), 
+                        descriptor(data),
+                        (const cudnnFilterDescriptor_t)filter_handle,
+                        (const cudnnConvolutionDescriptor_t)conv_handle,
+                        descriptor(dest_desc),
+                        max_candidates,    // capacity of the candidates buffer
+                        &num_returned,     // out: count reported back by the call
+                        candidates.data()));
+                candidates.resize(num_returned);  // trim to the reported count
+                forward_best_algo = pick_best_algorithm(candidates);
+                }
+#else
                CHECK_CUDNN(cudnnGetConvolutionForwardAlgorithm(
                        context(), 
                        descriptor(data),
@ -850,6 +894,7 @@ namespace dlib
                        dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_FWD_PREFER_FASTEST:CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
                        std::numeric_limits<size_t>::max(),
                        &forward_best_algo));
+#endif
                forward_algo = forward_best_algo;
                CHECK_CUDNN(cudnnGetConvolutionForwardWorkspaceSize( 
                        context(),
@ -863,6 +908,25 @@ namespace dlib
                // Pick which backward data algorithm we will use and allocate the
                // necessary workspace buffer.
                cudnnConvolutionBwdDataAlgo_t backward_data_best_algo;
+#if CUDNN_MAJOR >= 8
+                {   // cuDNN >= 8: gather backward data algorithm candidates, then choose among them.
+                int num_possible_algorithms = 0;
+                CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithmMaxCount(context(), &num_possible_algorithms));  // size by the backward *data* count, matching the result type below
+                std::vector<cudnnConvolutionBwdDataAlgoPerf_t> perf_results(num_possible_algorithms);
+                int num_algorithms = 0;
+                CHECK_CUDNN(cudnnFindConvolutionBackwardDataAlgorithm(
+                        context(),
+                        (const cudnnFilterDescriptor_t)filter_handle,
+                        descriptor(dest_desc),
+                        (const cudnnConvolutionDescriptor_t)conv_handle,
+                        descriptor(data),
+                        num_possible_algorithms,   // capacity of perf_results
+                        &num_algorithms,           // out: count reported back by the call
+                        perf_results.data()));
+                perf_results.resize(num_algorithms);  // trim to the reported count
+                backward_data_best_algo = pick_best_algorithm(perf_results);
+                }
+#else
                CHECK_CUDNN(cudnnGetConvolutionBackwardDataAlgorithm(
                        context(),
                        (const cudnnFilterDescriptor_t)filter_handle,
@ -872,6 +936,7 @@ namespace dlib
                        dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE,
                        std::numeric_limits<size_t>::max(),
                        &backward_data_best_algo));
+#endif
                backward_data_algo = backward_data_best_algo;

                CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(
@ -886,6 +951,25 @@ namespace dlib
                // Pick which backward filters algorithm we will use and allocate the
                // necessary workspace buffer.
                cudnnConvolutionBwdFilterAlgo_t backward_filters_best_algo;
+#if CUDNN_MAJOR >= 8
+                {   // cuDNN >= 8: gather backward filter algorithm candidates, then choose among them.
+                int max_candidates = 0;
+                CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(context(), &max_candidates));
+                std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> candidates(max_candidates);
+                int num_returned = 0;
+                CHECK_CUDNN(cudnnFindConvolutionBackwardFilterAlgorithm(
+                        context(),
+                        descriptor(data),
+                        descriptor(dest_desc),
+                        (const cudnnConvolutionDescriptor_t)conv_handle,
+                        (const cudnnFilterDescriptor_t)filter_handle,
+                        max_candidates,    // capacity of the candidates buffer
+                        &num_returned,     // out: count reported back by the call
+                        candidates.data()));
+                candidates.resize(num_returned);  // trim to the reported count
+                backward_filters_best_algo = pick_best_algorithm(candidates);
+                }
+#else
                CHECK_CUDNN(cudnnGetConvolutionBackwardFilterAlgorithm(
                        context(),
                        descriptor(data),
@ -895,6 +979,7 @@ namespace dlib
                        dnn_prefer_fastest_algorithms()?CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST:CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE,
                        std::numeric_limits<size_t>::max(),
                        &backward_filters_best_algo));
+#endif
                // cuDNN 5.1 has a bug that causes
                // cudnnGetConvolutionBackwardFilterAlgorithm() to pick the winograd
                // algorithm even for cases where cuDNN doesn't support it, leading to