CUDA代码
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Evaluates a CUDA runtime call and terminates the program if it fails.
//   call: an expression yielding a cudaError_t (evaluated exactly once).
//   msg:  a C string printed in front of the CUDA error description.
// When the status differs from cudaSuccess, "<msg>: <error string>" is written
// to stderr and the process exits with status 1.
// The temporary uses a macro-private name so that a caller passing its own
// variable called `cudaStatus` does not end up self-initializing the temporary,
// and both arguments are parenthesized so any expression can be passed safely.
// The do { ... } while (0) wrapper lets the macro be used as a single statement.
#define CHECK_CUDA(call, msg) \
    do { \
        cudaError_t check_cuda_status_ = (call); \
        if (check_cuda_status_ != cudaSuccess) { \
            fprintf(stderr, "%s: %s\n", (msg), cudaGetErrorString(check_cuda_status_)); \
            exit(1); \
        } \
    } while (0)
// One row of a peak-throughput table: the label printed in front of
// " Peak Performance", the per-SM per-clock multiplier used for that row,
// and the unit suffix ("GFLOPS" or "GOPS").
struct PeakRow {
    const char *label;
    int per_sm_per_clock;
    const char *unit;
};

// Prints a section banner followed by one line per table row.
// Each printed value is sm_count_times_ghz * per_sm_per_clock * 2.
// NOTE(review): the trailing factor of 2 presumably counts a fused
// multiply-add as two operations — confirm against the architecture docs.
static void print_peak_section(const char *banner, const PeakRow *rows, int row_count,
                               double sm_count_times_ghz) {
    printf("%s\n", banner);
    for (int i = 0; i < row_count; ++i) {
        printf("%s Peak Performance = %.3f %s.\n", rows[i].label,
               sm_count_times_ghz * rows[i].per_sm_per_clock * 2, rows[i].unit);
    }
}

// Queries GPU 0 and prints its name, compute capability, SM count and clock
// rates. For a compute capability 8.0 device it additionally prints three
// tables of theoretical peak throughput (CUDA core, dense Tensor Core, sparse
// Tensor Core). For any other device it reports on stderr that no table is
// available instead of silently printing nothing.
// Returns 0; a failing device query terminates the process through CHECK_CUDA.
int main() {
    const int gpu_index = 0;  // GPU index; 0 selects the first GPU

    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, gpu_index), "cudaGetDeviceProperties error");

    // Both clock fields are divided by 1e6 and reported as GHz.
    // NOTE(review): newer CUDA toolkits reportedly drop these clock fields from
    // cudaDeviceProp in favor of cudaDeviceGetAttribute — confirm for the
    // toolkit version in use.
    const double sm_clock_ghz = prop.clockRate / 1e6;
    const double mem_clock_ghz = prop.memoryClockRate / 1e6;

    printf("GPU Name = %s\n", prop.name);
    printf("Compute Capability = %d.%d\n", prop.major, prop.minor);
    printf("GPU SMs = %d\n", prop.multiProcessorCount);
    printf("GPU SM clock rate = %.3f GHz\n", sm_clock_ghz);
    printf("GPU Mem clock rate = %.3f GHz\n", mem_clock_ghz);

    if (prop.major != 8 || prop.minor != 0) {
        // The multipliers below are only defined here for compute capability 8.0.
        fprintf(stderr,
                "Peak performance tables are only available for compute capability 8.0 "
                "(device reports %d.%d).\n",
                prop.major, prop.minor);
        return 0;
    }

    static const PeakRow cuda_core_rows[] = {
        {"FP32", 64, "GFLOPS"},
        {"FP64", 32, "GFLOPS"},
        {"FP16", 256, "GFLOPS"},
        {"BF16", 128, "GFLOPS"},
        {"INT8", 256, "GOPS"},
    };
    static const PeakRow tensor_dense_rows[] = {
        {"TF32", 512, "GFLOPS"},
        {"FP64", 64, "GFLOPS"},
        {"FP16", 1024, "GFLOPS"},
        {"BF16", 1024, "GFLOPS"},
        {"INT8", 2048, "GOPS"},
        {"INT4", 4096, "GOPS"},
        {"INT1", 16384, "GOPS"},
    };
    static const PeakRow tensor_sparse_rows[] = {
        {"TF32", 1024, "GFLOPS"},
        {"FP16", 2048, "GFLOPS"},
        {"BF16", 2048, "GFLOPS"},
        {"INT8", 4096, "GOPS"},
        {"INT4", 8192, "GOPS"},
    };

    // Shared factor of every row: number of SMs times the SM clock in GHz.
    const double sm_count_times_ghz = prop.multiProcessorCount * sm_clock_ghz;

    print_peak_section("-----------CUDA Core Performance------------", cuda_core_rows,
                       (int)(sizeof(cuda_core_rows) / sizeof(cuda_core_rows[0])),
                       sm_count_times_ghz);
    print_peak_section("-----------Tensor Core Dense Performance------------", tensor_dense_rows,
                       (int)(sizeof(tensor_dense_rows) / sizeof(tensor_dense_rows[0])),
                       sm_count_times_ghz);
    print_peak_section("-----------Tensor Core Sparse Performance------------", tensor_sparse_rows,
                       (int)(sizeof(tensor_sparse_rows) / sizeof(tensor_sparse_rows[0])),
                       sm_count_times_ghz);
    return 0;
}
将以上代码保存为 a100_get_peak.cu 文件。在已安装 NVIDIA CUDA 编译环境的前提下，运行以下命令进行编译：
nvcc -o a100_get_peak a100_get_peak.cu
编译成功后会生成 a100_get_peak 可执行程序，运行该程序后输出结果如下：
GPU Name = NVIDIA A100 80GB PCIe
Compute Capability = 8.0
GPU SMs = 108
GPU SM clock rate = 1.410 GHz
GPU Mem clock rate = 1.512 GHz
-----------CUDA Core Performance------------
FP32 Peak Performance = 19491.840 GFLOPS.
FP64 Peak Performance = 9745.920 GFLOPS.
FP16 Peak Performance = 77967.360 GFLOPS.
BF16 Peak Performance = 38983.680 GFLOPS.
INT8 Peak Performance = 77967.360 GOPS.
-----------Tensor Core Dense Performance------------
TF32 Peak Performance = 155934.720 GFLOPS.
FP64 Peak Performance = 19491.840 GFLOPS.
FP16 Peak Performance = 311869.440 GFLOPS.
BF16 Peak Performance = 311869.440 GFLOPS.
INT8 Peak Performance = 623738.880 GOPS.
INT4 Peak Performance = 1247477.760 GOPS.
INT1 Peak Performance = 4989911.040 GOPS.
-----------Tensor Core Sparse Performance------------
TF32 Peak Performance = 311869.440 GFLOPS.
FP16 Peak Performance = 623738.880 GFLOPS.
BF16 Peak Performance = 623738.880 GFLOPS.
INT8 Peak Performance = 1247477.760 GOPS.
INT4 Peak Performance = 2494955.520 GOPS.