NVIDIA A100 峰值计算性能

CUDA代码

#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

/*
 * CHECK_CUDA(call, msg)
 *
 * Evaluates `call` exactly once. If the resulting cudaError_t is not
 * cudaSuccess, prints "<msg>: <CUDA error string>" to stderr and terminates
 * the process with exit status 1.
 *
 * call - any expression yielding a cudaError_t (typically a runtime API call)
 * msg  - C string printed in front of the CUDA error description
 *
 * Both arguments are parenthesised on expansion, and the temporary uses a
 * macro-private name so it cannot shadow a variable that the caller mentions
 * inside `call`. The do/while(0) wrapper makes the macro behave as a single
 * statement (safe inside an unbraced if/else).
 */
#define CHECK_CUDA(call, msg) \
    do { \
        cudaError_t check_cuda_status_ = (call); \
        if (check_cuda_status_ != cudaSuccess) { \
            fprintf(stderr, "%s: %s\n", (msg), cudaGetErrorString(check_cuda_status_)); \
            exit(1); \
        } \
    } while (0)

/* One line of a peak-throughput table. */
struct PeakRow {
    const char *dtype;   /* data-type label printed at the start of the line */
    int per_clk_per_sm;  /* per-SM, per-clock rate fed into the formula below */
    const char *unit;    /* unit label printed after the value */
};

/*
 * Prints `header` followed by one line per table row.
 *
 * Each value is sm_ghz * per_clk_per_sm * 2, where sm_ghz is
 * (number of SMs) * (SM clock in GHz). The factor 2 presumably counts one
 * multiply-add as two operations.
 */
static void print_peak_rows(const char *header, const PeakRow *rows, int count, double sm_ghz)
{
    printf("%s\n", header);
    for (int i = 0; i < count; ++i) {
        printf("%s Peak Performance = %.3f %s.\n",
               rows[i].dtype, sm_ghz * rows[i].per_clk_per_sm * 2, rows[i].unit);
    }
}

/*
 * Queries GPU 0, prints its name, compute capability, SM count and clock
 * rates, and - for compute capability 8.0 devices only - prints theoretical
 * peak throughput tables for CUDA cores and dense/sparse Tensor Cores.
 *
 * Returns 0 on success; any CUDA API failure terminates via CHECK_CUDA.
 */
int main() {
    int gpu_index = 0; // GPU index; 0 selects the first GPU

    cudaDeviceProp prop;
    CHECK_CUDA(cudaGetDeviceProperties(&prop, gpu_index), "cudaGetDeviceProperties error");

    // Clock rates are read through device attributes rather than the
    // cudaDeviceProp::clockRate / memoryClockRate fields, which newer CUDA
    // toolkits no longer provide. Both attributes are reported in kHz.
    int sm_clock_khz = 0;
    int mem_clock_khz = 0;
    CHECK_CUDA(cudaDeviceGetAttribute(&sm_clock_khz, cudaDevAttrClockRate, gpu_index),
               "cudaDeviceGetAttribute(cudaDevAttrClockRate) error");
    CHECK_CUDA(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, gpu_index),
               "cudaDeviceGetAttribute(cudaDevAttrMemoryClockRate) error");

    printf("GPU Name = %s\n", prop.name);
    printf("Compute Capability = %d.%d\n", prop.major, prop.minor);
    printf("GPU SMs = %d\n", prop.multiProcessorCount);
    printf("GPU SM clock rate = %.3f GHz\n", sm_clock_khz / 1e6);
    printf("GPU Mem clock rate = %.3f GHz\n", mem_clock_khz / 1e6);

    if (prop.major == 8 && prop.minor == 0) {
        static const PeakRow cuda_core[] = {
            {"FP32", 64,  "GFLOPS"},
            {"FP64", 32,  "GFLOPS"},
            {"FP16", 256, "GFLOPS"},
            {"BF16", 128, "GFLOPS"},
            {"INT8", 256, "GOPS"},
        };
        static const PeakRow tensor_dense[] = {
            {"TF32", 512,   "GFLOPS"},
            {"FP64", 64,    "GFLOPS"},
            {"FP16", 1024,  "GFLOPS"},
            {"BF16", 1024,  "GFLOPS"},
            {"INT8", 2048,  "GOPS"},
            {"INT4", 4096,  "GOPS"},
            {"INT1", 16384, "GOPS"},
        };
        static const PeakRow tensor_sparse[] = {
            {"TF32", 1024, "GFLOPS"},
            {"FP16", 2048, "GFLOPS"},
            {"BF16", 2048, "GFLOPS"},
            {"INT8", 4096, "GOPS"},
            {"INT4", 8192, "GOPS"},
        };

        // Common factor of every row: SM count times SM clock in GHz.
        const double sm_ghz = prop.multiProcessorCount * (sm_clock_khz / 1e6);

        print_peak_rows("-----------CUDA Core Performance------------",
                        cuda_core, (int)(sizeof(cuda_core) / sizeof(cuda_core[0])), sm_ghz);
        print_peak_rows("-----------Tensor Core Dense Performance------------",
                        tensor_dense, (int)(sizeof(tensor_dense) / sizeof(tensor_dense[0])), sm_ghz);
        print_peak_rows("-----------Tensor Core Sparse Performance------------",
                        tensor_sparse, (int)(sizeof(tensor_sparse) / sizeof(tensor_sparse[0])), sm_ghz);
    } else {
        // The per-SM rates above are only tabulated for compute capability 8.0;
        // say so instead of silently printing nothing (stdout is left untouched).
        fprintf(stderr,
                "Peak performance tables are only defined for compute capability 8.0; "
                "device reports %d.%d.\n", prop.major, prop.minor);
    }

    return 0;
}

将以上代码保存为 a100_get_peak.cu 文件。假设已安装好 NVIDIA CUDA 编译环境，运行以下命令进行编译：

nvcc -o a100_get_peak a100_get_peak.cu

编译成功后会生成可执行程序 a100_get_peak，运行后输出结果如下：

GPU Name = NVIDIA A100 80GB PCIe
Compute Capability = 8.0
GPU SMs = 108
GPU SM clock rate = 1.410 GHz
GPU Mem clock rate = 1.512 GHz
-----------CUDA Core Performance------------
FP32 Peak Performance = 19491.840 GFLOPS.
FP64 Peak Performance = 9745.920 GFLOPS.
FP16 Peak Performance = 77967.360 GFLOPS.
BF16 Peak Performance = 38983.680 GFLOPS.
INT8 Peak Performance = 77967.360 GOPS.
-----------Tensor Core Dense Performance------------
TF32 Peak Performance = 155934.720 GFLOPS.
FP64 Peak Performance = 19491.840 GFLOPS.
FP16 Peak Performance = 311869.440 GFLOPS.
BF16 Peak Performance = 311869.440 GFLOPS.
INT8 Peak Performance = 623738.880 GOPS.
INT4 Peak Performance = 1247477.760 GOPS.
INT1 Peak Performance = 4989911.040 GOPS.
-----------Tensor Core Sparse Performance------------
TF32 Peak Performance = 311869.440 GFLOPS.
FP16 Peak Performance = 623738.880 GFLOPS.
BF16 Peak Performance = 623738.880 GFLOPS.
INT8 Peak Performance = 1247477.760 GOPS.
INT4 Peak Performance = 2494955.520 GOPS.
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容