A distributed training technique that sharply reduces memory usage by partitioning optimizer states, gradients, and parameters across GPUs.
ZeRO-1 (optimizer state partitioning):

```python
# Memory saving: ~4x (for Adam)
# Each GPU stores only 1/N of the optimizer states
# Communication overhead: none beyond standard data parallelism
# ~8 bytes/param of Adam state (fp32 momentum + variance), split across GPUs
optimizer_memory = model_size * 8 / gpu_count
```
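As a concrete reference point, PyTorch ships a stage-1-style optimizer, `ZeroRedundancyOptimizer`, which shards optimizer states across data-parallel ranks. The sketch below is illustrative rather than DeepSpeed's implementation; the toy model and hyperparameters are assumptions.

```python
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer

# Assumes a process group is already initialized (e.g. launched via torchrun).
model = torch.nn.Linear(4096, 4096).cuda()
model = torch.nn.parallel.DistributedDataParallel(model)

# Stage-1 style sharding: each rank keeps Adam's momentum/variance buffers
# only for its own shard of the parameters; parameters and gradients are
# still fully replicated, exactly as in ZeRO-1.
optimizer = ZeroRedundancyOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.Adam,
    lr=1e-4,
)
```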
ZeRO-2 (gradient partitioning):

```python
# Memory saving: ~8x
# Each GPU keeps only the gradient partition it needs for its own updates
# Communication overhead: same as standard data parallelism
# ~2 bytes/param of fp16 gradients, split across GPUs
gradient_memory = model_size * 2 / gpu_count
```
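The communication pattern behind stage 2 is reduce-scatter: instead of all-reducing the full gradient, each rank ends up holding only the averaged shard it owns, which is why the total communication volume stays the same as plain data parallelism. A minimal `torch.distributed` sketch follows; the tensor sizes and the names `flat_grads` / `grad_shard` are illustrative assumptions.

```python
import torch
import torch.distributed as dist

# Assumes an initialized process group and one CUDA device per rank.
world_size = dist.get_world_size()

# flat_grads stands for all local gradients flattened into one buffer,
# already padded so its length is divisible by world_size.
flat_grads = torch.randn(1024 * world_size, device="cuda")

# Each rank receives only the summed shard it will use to update its own
# parameter/optimizer-state partition; the other shards are never stored locally.
grad_shard = torch.empty(flat_grads.numel() // world_size, device="cuda")
dist.reduce_scatter_tensor(grad_shard, flat_grads, op=dist.ReduceOp.SUM)
grad_shard /= world_size  # average over data-parallel ranks
```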
ZeRO-3 (parameter partitioning):

```python
# Memory saving: grows with the number of GPUs
# Parameters are partitioned as well and fetched via All-Gather when needed
# Communication overhead: ~50% higher than standard data parallelism
# ~2 bytes/param of fp16 weights, split across GPUs
parameter_memory = model_size * 2 / gpu_count
```
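The sketch below illustrates the gather-on-demand pattern described above (the shapes and the helper `gather_full_weight` are illustrative, not DeepSpeed internals): each rank persistently stores only its shard, all-gathers the full tensor right before the layer runs, and frees it again afterwards. Re-gathering parameters for both forward and backward is what pushes the communication volume to roughly 1.5x that of plain data parallelism.

```python
import torch
import torch.distributed as dist

# Assumes an initialized process group; the 4096x4096 layer is illustrative.
world_size = dist.get_world_size()
full_numel = 4096 * 4096

# Persistently stored state on this rank: just a 1/world_size shard of the weight.
weight_shard = torch.randn(full_numel // world_size, device="cuda")

def gather_full_weight(shard):
    """All-gather the full weight just in time for this layer's computation."""
    full = torch.empty(full_numel, device="cuda")
    dist.all_gather_into_tensor(full, shard)
    return full.view(4096, 4096)

x = torch.randn(8, 4096, device="cuda")
weight = gather_full_weight(weight_shard)   # materialize the full 4096x4096 weight
y = x @ weight.t()                          # use it for this layer only
del weight                                  # release it so peak memory stays ~1/world_size
```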
Per-GPU memory for model states ≈ number of parameters × coefficient (bytes per parameter). With mixed-precision Adam on N GPUs, the coefficient is approximately:
- Standard data parallelism: 16 (2 bytes fp16 params + 2 bytes fp16 grads + 12 bytes optimizer states, fully replicated)
- ZeRO-1: 4 + 12/N (optimizer states sharded)
- ZeRO-2: 2 + 14/N (gradients sharded as well)
- ZeRO-3: 16/N (everything sharded)
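As a quick sanity check of these coefficients, the sketch below plugs in a hypothetical 7B-parameter model on 64 GPUs; activations, buffers, and fragmentation are ignored, so the figures only cover model states.

```python
# Worked example of the per-GPU memory formula above (mixed-precision Adam).
params = 7e9      # illustrative 7B-parameter model
n_gpus = 64

bytes_per_param = {
    "data parallel": 16,
    "ZeRO-1":        4 + 12 / n_gpus,
    "ZeRO-2":        2 + 14 / n_gpus,
    "ZeRO-3":        16 / n_gpus,
}

for name, coeff in bytes_per_param.items():
    print(f"{name:>13}: {params * coeff / 1e9:8.1f} GB per GPU")
# data parallel:    112.0 GB per GPU
#        ZeRO-1:     29.3 GB per GPU
#        ZeRO-2:     15.5 GB per GPU
#        ZeRO-3:      1.8 GB per GPU
```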
```python
# CPU offload configuration:
# these keys belong to the "zero_optimization" section of a DeepSpeed config.
zero_config = {
    "stage": 3,
    "offload_optimizer": {
        "device": "cpu",      # keep optimizer states (and the optimizer step) on the CPU
        "pin_memory": True    # pinned host memory for faster GPU<->CPU transfers
    },
    "offload_param": {
        "device": "cpu",      # park partitioned parameters in CPU memory when not in use
        "pin_memory": True
    }
}
```
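A hedged sketch of how this snippet would be embedded in a full DeepSpeed config; the batch size and fp16 settings are assumptions about the surrounding training setup, not part of the offload feature itself. For even larger models, ZeRO-Infinity also allows `"device": "nvme"` together with an `nvme_path` pointing at fast local storage.

```python
ds_config_with_offload = {
    "train_micro_batch_size_per_gpu": 1,   # assumed; set to match the training loop
    "fp16": {"enabled": True},             # assumed mixed-precision setting
    "zero_optimization": zero_config,      # the offload dict defined above
}
```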
```python
import deepspeed

# ZeRO-3 configuration (batch size, optimizer, and precision sections omitted here)
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": True,                  # overlap communication with backward computation
        "contiguous_gradients": True,          # copy gradients into a contiguous buffer to limit fragmentation
        "sub_group_size": 1e9,
        "reduce_bucket_size": 1e6,             # bucket size for gradient reduction
        "stage3_prefetch_bucket_size": 1e6,    # how much parameter data to prefetch ahead of use
        "stage3_param_persistence_threshold": 1e5,  # small params below this stay unpartitioned on every GPU
        "stage3_max_live_parameters": 1e9,     # cap on fully materialized parameters at any time
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": True  # gather full 16-bit weights when saving checkpoints
    }
}

# Initialize the model (returns engine, optimizer, dataloader, lr_scheduler)
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    config=ds_config
)
```
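Once the engine is created, training follows the usual DeepSpeed pattern. Note that for this call to actually set up a ZeRO optimizer, the config would also need batch-size settings and an `optimizer` section (or a client optimizer passed to `deepspeed.initialize`); the `dataloader` and `loss_fn` below are placeholders for whatever the script already defines. The script is typically launched with the `deepspeed` launcher (e.g. `deepspeed train.py`), which sets up the distributed environment.

```python
for inputs, labels in dataloader:
    inputs = inputs.to(model_engine.device)
    labels = labels.to(model_engine.device)

    outputs = model_engine(inputs)   # forward pass (params gathered on demand under stage 3)
    loss = loss_fn(outputs, labels)

    model_engine.backward(loss)      # DeepSpeed handles loss scaling and gradient partitioning
    model_engine.step()              # optimizer step + gradient clearing
```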