Microsoft's open-source deep learning optimization library, providing a complete solution for distributed training and inference.
```python
# DeepSpeed 3D-parallelism configuration (overview; in practice the pipeline
# dimension is set when wrapping the model in PipelineModule, see the sketch below)
ds_config = {
    # Data parallelism: ZeRO optimization
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"}
    },
    # Pipeline parallelism
    "pipeline": {
        "stages": 4,
        "micro_batches": 16
    },
    # Tensor parallelism (via the Megatron integration)
    "tensor_parallel": {
        "tp_size": 8,
        "mpu": "megatron"
    }
}
```
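As a sketch of how the pipeline dimension of this config is realized, the model is wrapped in DeepSpeed's PipelineModule before calling deepspeed.initialize; the Linear layers below are placeholders for real transformer blocks, not part of any actual model.

```python
# Minimal sketch, assuming the ds_config above; Linear layers stand in for
# real transformer blocks
import torch.nn as nn
import deepspeed
from deepspeed.pipe import PipelineModule

layers = [nn.Linear(1024, 1024) for _ in range(32)]   # placeholder blocks
model = PipelineModule(layers=layers, num_stages=4)   # matches "stages": 4
engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config
)
```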
```python
# NVMe offload (async I/O) tuning for TB-scale model training; this block
# sits at the top level of the DeepSpeed config
"aio": {
    "block_size": 1048576,
    "queue_depth": 8,
    "thread_count": 1,
    "single_submit": False,
    "overlap_events": True
}
```
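The NVMe offload itself is selected per tensor group under zero_optimization (ZeRO-Infinity); a minimal sketch of where the aio block fits, assuming a placeholder /local_nvme mount point:

```python
# ZeRO-Infinity sketch: ZeRO-3 with parameters and optimizer state offloaded
# to NVMe; /local_nvme is a placeholder path
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "nvme", "nvme_path": "/local_nvme"},
        "offload_optimizer": {"device": "nvme", "nvme_path": "/local_nvme"}
    },
    "aio": {
        "block_size": 1048576,
        "queue_depth": 8,
        "thread_count": 1,
        "single_submit": False,
        "overlap_events": True
    }
}
```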
```python
# Compiler optimization: DeepSpeed integrates with torch.compile. The module
# can be compiled directly (recent DeepSpeed engines also expose a compile()
# hook that wraps torch.compile)
import torch

model = torch.compile(model, backend="inductor", mode="max-autotune")
```
```python
# Automatic tensor parallelism (AutoTP): DeepSpeed shards supported layers of
# a HuggingFace model across GPUs without requiring an injection policy
import torch
import deepspeed

model = deepspeed.init_inference(
    model,
    tensor_parallel={"tp_size": 8},  # 8-way tensor parallelism
    dtype=torch.float16
)
```
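An end-to-end sketch of the same AutoTP path, assuming an illustrative model name and prompt:

```python
# Illustrative: load a HF model, shard it 8-way with AutoTP, then generate
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "facebook/opt-1.3b"  # placeholder model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.float16)
model = deepspeed.init_inference(model, tensor_parallel={"tp_size": 8})

inputs = tokenizer("DeepSpeed is", return_tensors="pt").to("cuda")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```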
```python
import deepspeed

# Training a 175B-parameter model
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config={
        "train_batch_size": 4096,
        "gradient_accumulation_steps": 64,
        "fp16": {"enabled": True},
        "zero_optimization": {"stage": 3},
        "checkpoint": {
            "use_node_local_storage": True,
            "parallel_write": {"pipeline_stage": True}
        }
    }
)
```
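The returned engine is then driven with DeepSpeed's forward/backward/step pattern; a minimal loop sketch, where train_loader and the loss-returning forward are placeholders for the real data pipeline and model:

```python
# Minimal training loop; train_loader and the loss computation are
# placeholders
for inputs, labels in train_loader:      # hypothetical loader
    loss = model_engine(inputs, labels)  # forward pass returns the loss here
    model_engine.backward(loss)          # ZeRO-aware backward
    model_engine.step()                  # optimizer step + gradient clearing
```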
```python
# Sequence parallelism for training on million-token sequences; these key
# names are illustrative: in Megatron-DeepSpeed, Ulysses is enabled via a
# sequence-parallel size argument rather than this config block
"sequence_parallel": {
    "enabled": True,
    "sequence_chunk_size": 4096,
    "use_flash_attention": True
}
```
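On the implementation side, DeepSpeed-Ulysses exposes sequence parallelism as a wrapper around any local attention module; a sketch assuming an 8-rank sequence-parallel group, with nn.MultiheadAttention standing in for the model's own attention:

```python
# Sketch of DeepSpeed-Ulysses: DistributedAttention all-to-alls activations
# so each rank attends over the full sequence with a slice of the heads
import torch.nn as nn
import torch.distributed as dist
from deepspeed.sequence.layer import DistributedAttention

local_attn = nn.MultiheadAttention(embed_dim=1024, num_heads=16)  # stand-in
sp_group = dist.new_group(ranks=list(range(8)))  # sequence-parallel ranks
dist_attn = DistributedAttention(local_attn, sp_group)
# dist_attn(q, k, v) now accepts sequence-sharded q/k/v tensors
```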