From d29eb8798070719706e3548d61990200f8b18dc3 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sun, 28 Jul 2024 12:07:40 +0000 Subject: [PATCH 1/3] add support converting checkpoint from hf to mds --- .../finetune_hf_llama/ds_config.json | 8 +- .../finetune_hf_llama/finetune_llama.sh | 23 +- megatron/global_vars.py | 1 + tools/hf2megads_weight_converter.py | 314 +++++++++++++++++- 4 files changed, 315 insertions(+), 31 deletions(-) diff --git a/examples_deepspeed/finetune_hf_llama/ds_config.json b/examples_deepspeed/finetune_hf_llama/ds_config.json index 9c0b332473..85f439ce47 100755 --- a/examples_deepspeed/finetune_hf_llama/ds_config.json +++ b/examples_deepspeed/finetune_hf_llama/ds_config.json @@ -1,11 +1,5 @@ { "train_batch_size" : 256, "train_micro_batch_size_per_gpu": 16, - "steps_per_print": 100, - "zero_optimization": { - "stage": 0 - }, - "bf16": { - "enabled": true - } + "steps_per_print": 1 } diff --git a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh index c48ea11b93..e9d1b8c09a 100644 --- a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh +++ b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh @@ -1,8 +1,8 @@ DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json -DATASET_PATH=./alpaca_data.json +DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json # dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json -HF_LLAMA_PATH=/data/llama-7b/ +HF_LLAMA_PATH=/yahao/llama-2-7b-hf/ # weights link: https://huggingface.co/huggyllama/llama-7b MICRO_BATCH_SIZE=16 @@ -44,11 +44,20 @@ cat < $DS_CONFIG EOT -covert_args="deepspeed tools/hf2megads_weight_converter.py \ +covert_hf2mds_args="deepspeed tools/hf2megads_weight_converter.py \ --hf-ckpt-num-shards 2 \ ---origin-hf-ckpt-dir $HF_LLAMA_PATH \ +--hf-ckpt-dir $HF_LLAMA_PATH \ +--load-mode auto \ --save $MEGA_DS_LLAMA_PATH" +covert_mds2hf_args="deepspeed tools/hf2megads_weight_converter.py \ +--hf-ckpt-num-shards 2 \ +--hf-ckpt-dir $HF_LLAMA_PATH \ +--load-mode auto \ +--to-hf-ckpt \ +--load $MEGA_DS_LLAMA_PATH \ +--save $HF_LLAMA_PATH'-hf-out' " + finetune_args="deepspeed finetune_llama.py \ --load $MEGA_DS_LLAMA_PATH" @@ -98,8 +107,10 @@ comm_args="--tensor-model-parallel-size $TP \ --no-gradient-accumulation-fusion \ --repeated-dataloader" -if [ "$1" = "convert" ]; then - task_args="$covert_args" +if [ "$1" = "convert_hf2mds" ]; then + task_args="$covert_hf2mds_args" +elif [ "$1" = "convert_mds2hf" ]; then + task_args="$covert_mds2hf_args" else task_args="$finetune_args" fi diff --git a/megatron/global_vars.py b/megatron/global_vars.py index 3f9d6fd66b..ccd0a4e21e 100644 --- a/megatron/global_vars.py +++ b/megatron/global_vars.py @@ -175,6 +175,7 @@ def _set_wandb_writer(args): 'project or experiment name provided, ' 'therefore WANDB logs will be written ' 'according to random generated project or experiment name.', flush=True) + return try: import wandb diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py index bfbde1fd05..b91a584807 100755 --- a/tools/hf2megads_weight_converter.py +++ b/tools/hf2megads_weight_converter.py @@ -6,6 +6,7 @@ from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import print_rank_0, get_tokenizer, get_args from megatron.core import mpu +from megatron.core import tensor_parallel from megatron.core.utils import divide from megatron.model import GPTModelPipe, Float16Module from megatron.utils import unwrap_model @@ -13,7 +14,7 @@ from megatron.arguments import core_transformer_config_from_args from megatron.initialize import initialize_megatron from megatron.optimizer import get_megatron_optimizer -from megatron.checkpointing import save_checkpoint +from megatron.checkpointing import save_checkpoint, load_checkpoint from megatron.training import get_optimizer_param_scheduler from deepspeed.runtime.utils import see_memory_usage import deepspeed @@ -22,11 +23,18 @@ def add_extra_args(parser): """Text generation arguments.""" group = parser.add_argument_group(title='hf2mega') - group.add_argument("--hf-ckpt-num-shards", type=int, help='num of llama ckpt.') - group.add_argument("--origin-hf-ckpt-dir", + group.add_argument("--hf-ckpt-dir", type=str, default="", - help="the original path of the llama-hf ckpt") + help="the llama-hf ckpt") + group.add_argument("--hf-ckpt-num-shards", type=int, default=-1, help='num of llama ckpt.') + group.add_argument("--load-mode", type=str, + default=None, + choices=['torchbin', 'safetensor', 'auto'], + help="load ckpt format: pytorch.bin or model.safetensor or auto.") + group.add_argument("--to-hf-ckpt", action="store_true", + help="by default convert from hf to megads" + "if set, convert reversely from megads to hf ckpt.") return parser @@ -55,6 +63,49 @@ def load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards): return loaded +def load_and_print_hf_weight_from_safetensor(hf_ckpt_dir, hf_ckpt_num_of_shards): + from safetensors import safe_open + # Optimization point: We can selectively load specific 'shared' data to reduce CPU memory usage. + hf_model = {} + print_rank_0( + f"----------------------------hf weight list----------------------------") + + for wid in range(1, hf_ckpt_num_of_shards + 1): + if hf_ckpt_num_of_shards == 1: + ckpt_path = f"{hf_ckpt_dir}/model.safetensors" + else: + ckpt_path = f"{hf_ckpt_dir}/model-{wid:05d}-of-{hf_ckpt_num_of_shards:05d}.safetensors" + + with safe_open(ckpt_path, framework="pt", device="cpu") as f: + for k in f.keys(): + print_rank_0(f"name: {k}, shape: {f.get_tensor(k).shape}") + assert k not in hf_model + hf_model[k] = f.get_tensor(k).clone() + + return hf_model + + +def load_and_print_hf_weight_auto(hf_ckpt_dir, no_init=True): + from transformers import AutoConfig, AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights + + if no_init: + hf_config = AutoConfig.from_pretrained(hf_ckpt_dir, trust_remote_code=True) + with no_init_weights(): + hf_model = AutoModelForCausalLM.from_config(hf_config, trust_remote_code=True, torch_dtype=torch.bfloat16) + else: + hf_model = {} + hf_auto_model = AutoModelForCausalLM.from_pretrained(hf_ckpt_dir, trust_remote_code=True, torch_dtype=torch.bfloat16) + print_rank_0( + f"----------------------------hf weight list----------------------------") + + for name, param in hf_auto_model.named_parameters(): + hf_model[name] = param.clone() + print_rank_0(name) + + return hf_model + + def print_distinct_weights(model): print_rank_0( f"----------------------------mega-ds weight list----------------------------") @@ -70,16 +121,18 @@ def print_distinct_weights(model): class refactor: - def __init__(self, model, loaded, args, config): + def __init__(self, ds_model, hf_model, args, config): tokenizer = get_tokenizer() # align layer number - self.model = model - self.loaded = loaded + self.ds_model = ds_model + self.hf_model = hf_model self.config = config self.offset_num = 2 self.mega_emb_wnum = 1 self.mega_norm_wnum = args.num_layers + 2 + self.num_attention_heads = args.num_attention_heads + self.num_key_value_heads = args.num_key_value_heads self.mega_lm_head_wnum = self.mega_norm_wnum + 1 self.token_vocab = tokenizer.vocab_size self.padded_vocab_size = args.padded_vocab_size @@ -95,7 +148,7 @@ def _embedding_refactor(self, pname, p): hf_name = "lm_head.weight" elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight": hf_name = "model.embed_tokens.weight" - hf_w = self.loaded[hf_name] + hf_w = self.hf_model[hf_name] assert hf_w.shape[0] == self.token_vocab per_partition_vocab_size, start_index, end_index = compute_partition_range( self.padded_vocab_size, self.tp_rank, self.tp_size) @@ -112,13 +165,110 @@ def _embedding_refactor(self, pname, p): ) return new_w + def _embedding_refactor_to_hf(self, pname, p): + + if pname == f"{self.mega_lm_head_wnum}.lm_head.weight": + hf_w = self.hf_model.lm_head.weight + elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight": + hf_w = self.hf_model.model.embed_tokens.weight + + ds_w = p + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + hf_w.data.copy_(ds_w_all_rank[:hf_w.shape[0], :]) + + def _direct_refactor_to_hf(self, pname, p, hf_layer=None, subname=None): + ds_w = p + if pname in [f"{self.mega_norm_wnum}.weight"]: + hf_w = self.hf_model.model.norm.weight + elif subname in ["input_layernorm.weight"]: + hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight + elif subname in ["post_attention_layernorm.weight"]: + hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight + + hf_w.data.copy_(ds_w) + + def _attn_dense_refactor_to_hf(self, pname, p, hf_layer, subname): + ds_w = p + + if subname == "self_attention.dense.weight": + hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight + elif subname == "mlp.dense_4h_to_h.weight": + hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w) + + hf_w.data.copy_(ds_w_all_rank) + + def _mlphto4h_dense_refactor_to_hf(self, pname, p, hf_layer): + ds_w = p + hf_g = self.hf_model.model.layers[hf_layer].mlp.gate_proj.weight + hf_u = self.hf_model.model.layers[hf_layer].mlp.up_proj.weight + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + if torch.distributed.get_rank() == 0: + print(f"yahao-dbg: hf_g: {hf_g.shape}, ds_w_all_rank: {ds_w_all_rank.shape}") + + ds_w_shape = ds_w_all_rank.shape + + ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1]) + + hf_g.data.copy_(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1])) + hf_u.data.copy_(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1])) + + + def _qkv_refactor_to_hf(self, pname, p, hf_layer): + + ds_w = p + + #TODO(yahao): Need to check function if we are using only 1 GPU. + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + # [np/8, 3, 128, 6144] + + # [np, 3, 128, 6144] + + # hf_w = self.hf_model.model.layers[hf_layer].attention.query_key_value.weight + hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight + hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight + hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight + + oldshape = hf_q.shape + + hidden_size = oldshape[-1] + + hidden_size_per_attention_head = divide(hidden_size, + self.config.num_attention_heads) + + num_attention_heads_per_partition = divide(self.config.num_attention_heads, + self.tp_size) + + newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size) + + + if torch.distributed.get_rank() == 0: + print(f"yahao-dbg: qkv refactor : oldshape: {oldshape}, newshape: {newshape}") + + ds_w_out = ds_w_all_rank.reshape(*newshape) + + hf_q.data.copy_(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1])) + hf_k.data.copy_(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1])) + hf_v.data.copy_(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1])) + + def _direct_refactor(self, pname, p, hf_layer=None, subname=None): if pname == f"{self.mega_norm_wnum}.weight": hf_name = "model.norm.weight" elif subname in ["input_layernorm.weight", "post_attention_layernorm.weight"]: hf_name = f"model.layers.{hf_layer}.{subname}" - new_w = hf_w = self.loaded[hf_name] + new_w = hf_w = self.hf_model[hf_name] self.record_mapping_info( f"mega-ds:{pname,p.data.shape}<--hf{hf_name,} {hf_w.shape}") return new_w @@ -127,9 +277,9 @@ def _qkv_refactor(self, pname, p, hf_layer): hf_wq_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight" hf_wk_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight" hf_wv_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight" - wq = self.loaded[hf_wq_name] - wk = self.loaded[hf_wk_name] - wv = self.loaded[hf_wv_name] + wq = self.hf_model[hf_wq_name] + wk = self.hf_model[hf_wk_name] + wv = self.hf_model[hf_wv_name] hidden_size = wq.shape[0] per_partition_size, start_index, end_index = compute_partition_range( @@ -159,8 +309,8 @@ def _qkv_refactor(self, pname, p, hf_layer): def _mlphto4h_dense_refactor(self, pname, p, hf_layer): hf_w_gate_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight" hf_w_up_name = f"model.layers.{hf_layer}.mlp.up_proj.weight" - w_gate = self.loaded[hf_w_gate_name] - w_up = self.loaded[hf_w_up_name] + w_gate = self.hf_model[hf_w_gate_name] + w_up = self.hf_model[hf_w_up_name] hidden_size = w_gate.shape[0] per_partition_size, start_index, end_index = compute_partition_range( @@ -184,7 +334,7 @@ def _attn_dense_refactor(self, pname, p, hf_layer, subname): else: hf_name = f"model.layers.{hf_layer}.mlp.down_proj.weight" - hf_w = self.loaded[hf_name] + hf_w = self.hf_model[hf_name] hidden_size = hf_w.shape[1] per_partition_size, start_index, end_index = compute_partition_range( hidden_size, self.tp_rank, self.tp_size) @@ -200,7 +350,7 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname): hf_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight" else: hf_name = f"model.layers.{hf_layer}.mlp.up_proj.weight" - hf_w = self.loaded[hf_name] + hf_w = self.hf_model[hf_name] hidden_size = hf_w.shape[0] per_partition_size, start_index, end_index = compute_partition_range( hidden_size, self.tp_rank, self.tp_size) @@ -215,7 +365,10 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname): def refactor(self): assert self.is_refactored == False new_w = None - for pname, p in self.model.named_parameters(): + for pname, p in self.ds_model.named_parameters(): + if torch.distributed.get_rank() == 0: + print(f"yahao-dbg: pname: {pname}, p.shape: {p.shape}") + if pname in [ f"{self.mega_emb_wnum}.word_embeddings.weight", f"{self.mega_lm_head_wnum}.lm_head.weight" @@ -253,6 +406,54 @@ def refactor(self): new_w = None self.is_refactored = True + def transform_from_megads_to_hf(self): + use_gqa = True if self.num_attention_heads != self.num_key_value_heads else False + + for pname, p in self.ds_model.named_parameters(): + + if torch.distributed.get_rank() == 0: + print(f"yahao-dbg: ds_model pname: {pname} , p.shape: {p.shape}") + + if pname in [ + f"{self.mega_emb_wnum}.word_embeddings.weight", + f"{self.mega_lm_head_wnum}.lm_head.weight", + ]: + self._embedding_refactor_to_hf(pname, p) + elif pname in [ + f"{self.mega_norm_wnum}.weight", + ]: + self._direct_refactor_to_hf(pname, p) + else: + mobj = self.decoder_pat.match(pname) + layer_num = int(mobj.group(1)) + subname = mobj.group(2) + hf_layer = layer_num - self.offset_num + if subname in ["self_attention.query_key_value.weight"]: + if not use_gqa: + self._qkv_refactor_to_hf(pname, p, hf_layer) + else: + #TODO(yahao): Not impl yet ... + assert False + elif subname in ["mlp.dense_h_to_4h.weight"]: + self._mlphto4h_dense_refactor_to_hf(pname, p, hf_layer) + elif subname in [ + "self_attention.dense.weight", + "mlp.dense_4h_to_h.weight" + ]: + self._attn_dense_refactor_to_hf(pname, p, hf_layer, subname) + elif subname in [ + "input_layernorm.weight", + "post_attention_layernorm.weight", + ]: + self._direct_refactor_to_hf(pname, p, hf_layer, subname) + else: + print(f"Unrecognized weight type: {pname}") + raise ValueError(f"Unrecognized weight type: {pname}") + + # if torch.distributed.get_rank() == 0: + # print("yahao debug save last #2 stage: " + str(self.hf_model.state_dict()['lm.embed_in.weight'])) + self.is_refactored = True + def record_mapping_info(self, record_msg): self.refactor_weight_list.append(record_msg) @@ -327,8 +528,85 @@ def convert_hf_to_mega_ds(): save_checkpoint(0, [ds_engine], optimizer, opt_param_scheduler) print_rank_0(f"save checkpoint completed") +def load_hf_weights(args, no_init): + if args.load_mode == 'torchbin': + assert no_init == False, "only work with init" + return load_and_print_hf_weight(args.hf_ckpt_dir, args.hf_ckpt_num_shards) + elif args.load_mode == 'safetensor': + assert no_init == False, "only work with init" + return load_and_print_hf_weight_from_safetensor(args.hf_ckpt_dir, args.hf_ckpt_num_shards) + elif args.load_mode == 'auto': + return load_and_print_hf_weight_auto(args.hf_ckpt_dir, no_init) + +def convert_ckpt(): + """Build the model.""" + args = get_args() + print_rank_0(f'building model ...') + see_memory_usage(f"Before Building Model", force=True) + + config = core_transformer_config_from_args(args) + with deepspeed.zero.Init( + data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and not args.no_pipeline_parallel: + ds_model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True) + else: + raise NotImplementedError("Not implemented") + + see_memory_usage(f"After Building Model", force=True) + if torch.distributed.get_rank() < 2: + print(f"{torch.distributed.get_rank()} {ds_model}") + + # load and initialize HF weight dict + # print hf weights list & mega-ds weights list + + + # 'torchbin', 'safetensor', 'auto' + + hf_model = load_hf_weights(args, no_init=args.to_hf_ckpt) + + # print_distinct_weights(hf_model) + + #init model and save + print_rank_0(f"before deepspeed init") + ds_engine, _, _, _ = deepspeed.initialize( + model=ds_model, + optimizer=None, + args=args, + lr_scheduler=None, + mpu=mpu if args.no_pipeline_parallel else None) + print_rank_0(f"after deepspeed init") + + if args.to_hf_ckpt: + load_checkpoint([ds_engine], None, None, load_only_weights=True) + print(f"Load deepspeed actual checkpoint completed") + + # refactor weight from hf to mega-ds and vice versa + + cur_refactor = refactor(ds_model, hf_model, args, config) + if args.to_hf_ckpt: + cur_refactor.transform_from_megads_to_hf() + else: + cur_refactor.refactor() + # cur_refactor.inorder_show_record() + + if args.to_hf_ckpt: + if mpu.is_pipeline_last_stage(): + # hf_model.tie_weights() + + # mega-ds checkpoint will be saved in args.save + print_rank_0(f"mega-ds checkpoint will be saved in /yahao/Megatron-DeepSpeed/test-llama-7b-hf ") + hf_model.save_pretrained("/yahao/Megatron-DeepSpeed/test-llama-7b-hf", safe_serialization=True) + else: + print_rank_0(f"mega-ds checkpoint will be saved in {args.save}") + save_checkpoint(0, [ds_engine], None, None) + + print_rank_0(f"save checkpoint completed") if __name__ == "__main__": initialize_megatron(extra_args_provider=add_extra_args) - convert_hf_to_mega_ds() + convert_ckpt() From 738e3e173976fe5084f30931f760fb55d429613e Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 7 Aug 2024 10:36:39 +0000 Subject: [PATCH 2/3] Fix PP issue --- .../finetune_hf_llama/finetune_llama.sh | 2 +- tools/hf2megads_weight_converter.py | 277 +++++++----------- 2 files changed, 109 insertions(+), 170 deletions(-) diff --git a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh index e9d1b8c09a..ab8bfdf419 100644 --- a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh +++ b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh @@ -2,7 +2,7 @@ DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json # dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json -HF_LLAMA_PATH=/yahao/llama-2-7b-hf/ +HF_LLAMA_PATH=/data/llama-2-7b-hf/ # weights link: https://huggingface.co/huggyllama/llama-7b MICRO_BATCH_SIZE=16 diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py index b91a584807..58f8e2d9ab 100755 --- a/tools/hf2megads_weight_converter.py +++ b/tools/hf2megads_weight_converter.py @@ -3,6 +3,7 @@ import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import torch.distributed from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import print_rank_0, get_tokenizer, get_args from megatron.core import mpu @@ -18,6 +19,9 @@ from megatron.training import get_optimizer_param_scheduler from deepspeed.runtime.utils import see_memory_usage import deepspeed +import copy +from pathlib import Path + def add_extra_args(parser): @@ -126,6 +130,7 @@ def __init__(self, ds_model, hf_model, args, config): # align layer number self.ds_model = ds_model self.hf_model = hf_model + self.hf_dict = {} # for handling pp case when converting mds => hf self.config = config self.offset_num = 2 @@ -165,101 +170,7 @@ def _embedding_refactor(self, pname, p): ) return new_w - def _embedding_refactor_to_hf(self, pname, p): - - if pname == f"{self.mega_lm_head_wnum}.lm_head.weight": - hf_w = self.hf_model.lm_head.weight - elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight": - hf_w = self.hf_model.model.embed_tokens.weight - - ds_w = p - with torch.no_grad(): - ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) - - hf_w.data.copy_(ds_w_all_rank[:hf_w.shape[0], :]) - - def _direct_refactor_to_hf(self, pname, p, hf_layer=None, subname=None): - ds_w = p - if pname in [f"{self.mega_norm_wnum}.weight"]: - hf_w = self.hf_model.model.norm.weight - elif subname in ["input_layernorm.weight"]: - hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight - elif subname in ["post_attention_layernorm.weight"]: - hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight - - hf_w.data.copy_(ds_w) - - def _attn_dense_refactor_to_hf(self, pname, p, hf_layer, subname): - ds_w = p - - if subname == "self_attention.dense.weight": - hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight - elif subname == "mlp.dense_4h_to_h.weight": - hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight - - with torch.no_grad(): - ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w) - - hf_w.data.copy_(ds_w_all_rank) - - def _mlphto4h_dense_refactor_to_hf(self, pname, p, hf_layer): - ds_w = p - hf_g = self.hf_model.model.layers[hf_layer].mlp.gate_proj.weight - hf_u = self.hf_model.model.layers[hf_layer].mlp.up_proj.weight - - with torch.no_grad(): - ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) - - if torch.distributed.get_rank() == 0: - print(f"yahao-dbg: hf_g: {hf_g.shape}, ds_w_all_rank: {ds_w_all_rank.shape}") - - ds_w_shape = ds_w_all_rank.shape - - ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1]) - - hf_g.data.copy_(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1])) - hf_u.data.copy_(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1])) - - def _qkv_refactor_to_hf(self, pname, p, hf_layer): - - ds_w = p - - #TODO(yahao): Need to check function if we are using only 1 GPU. - - with torch.no_grad(): - ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) - - # [np/8, 3, 128, 6144] - - # [np, 3, 128, 6144] - - # hf_w = self.hf_model.model.layers[hf_layer].attention.query_key_value.weight - hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight - hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight - hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight - - oldshape = hf_q.shape - - hidden_size = oldshape[-1] - - hidden_size_per_attention_head = divide(hidden_size, - self.config.num_attention_heads) - - num_attention_heads_per_partition = divide(self.config.num_attention_heads, - self.tp_size) - - newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size) - - - if torch.distributed.get_rank() == 0: - print(f"yahao-dbg: qkv refactor : oldshape: {oldshape}, newshape: {newshape}") - - ds_w_out = ds_w_all_rank.reshape(*newshape) - - hf_q.data.copy_(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1])) - hf_k.data.copy_(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1])) - hf_v.data.copy_(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1])) def _direct_refactor(self, pname, p, hf_layer=None, subname=None): @@ -273,6 +184,7 @@ def _direct_refactor(self, pname, p, hf_layer=None, subname=None): f"mega-ds:{pname,p.data.shape}<--hf{hf_name,} {hf_w.shape}") return new_w + def _qkv_refactor(self, pname, p, hf_layer): hf_wq_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight" hf_wk_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight" @@ -362,12 +274,10 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname): ) return new_w - def refactor(self): + def transform_from_hf_to_megds(self): assert self.is_refactored == False new_w = None for pname, p in self.ds_model.named_parameters(): - if torch.distributed.get_rank() == 0: - print(f"yahao-dbg: pname: {pname}, p.shape: {p.shape}") if pname in [ f"{self.mega_emb_wnum}.word_embeddings.weight", @@ -406,14 +316,86 @@ def refactor(self): new_w = None self.is_refactored = True + + def _embedding_refactor_to_hf(self, pname, ds_w): + if pname == f"{self.mega_lm_head_wnum}.lm_head.weight": + hf_w = self.hf_model.lm_head.weight + hf_w_name = "lm_head.weight" + elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight": + hf_w = self.hf_model.model.embed_tokens.weight + hf_w_name = "model.embed_tokens.weight" + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + self.hf_dict[hf_w_name] = copy.deepcopy(ds_w_all_rank[:hf_w.shape[0], :]) + + def _direct_refactor_to_hf(self, pname, ds_w, hf_layer=None, subname=None): + if pname in [f"{self.mega_norm_wnum}.weight"]: + hf_w = self.hf_model.model.norm.weight + hf_w_name = "model.norm.weight" + elif subname in ["input_layernorm.weight"]: + hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight + hf_w_name = f"model.layers.{hf_layer}.input_layernorm.weight" + elif subname in ["post_attention_layernorm.weight"]: + hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight + hf_w_name = f"model.layers.{hf_layer}.post_attention_layernorm.weight" + + self.hf_dict[hf_w_name] = copy.deepcopy(ds_w) + + def _attn_dense_refactor_to_hf(self, pname, ds_w, hf_layer, subname): + if subname == "self_attention.dense.weight": + hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight + hf_w_name = f"model.layers.{hf_layer}.self_attn.o_proj.weight" + elif subname == "mlp.dense_4h_to_h.weight": + hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight + hf_w_name = f"model.layers.{hf_layer}.mlp.down_proj.weight" + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w) + + self.hf_dict[hf_w_name] = copy.deepcopy(ds_w_all_rank) + + def _mlphto4h_dense_refactor_to_hf(self, pname, ds_w, hf_layer): + hf_g_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight" + hf_u_name = f"model.layers.{hf_layer}.mlp.up_proj.weight" + + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + ds_w_shape = ds_w_all_rank.shape + ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1]) + self.hf_dict[hf_g_name] = copy.deepcopy(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1])) + self.hf_dict[hf_u_name] = copy.deepcopy(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1])) + + + def _qkv_refactor_to_hf(self, pname, ds_w, hf_layer): + with torch.no_grad(): + ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w) + + hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight + hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight + hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight + hf_q_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight" + hf_k_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight" + hf_v_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight" + oldshape = hf_q.shape + hidden_size = oldshape[-1] + hidden_size_per_attention_head = divide(hidden_size, + self.config.num_attention_heads) + num_attention_heads_per_partition = divide(self.config.num_attention_heads, + self.tp_size) + newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size) + ds_w_out = ds_w_all_rank.reshape(*newshape) + self.hf_dict[hf_q_name] = copy.deepcopy(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1])) + self.hf_dict[hf_k_name] = copy.deepcopy(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1])) + self.hf_dict[hf_v_name] = copy.deepcopy(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1])) + + def transform_from_megads_to_hf(self): use_gqa = True if self.num_attention_heads != self.num_key_value_heads else False for pname, p in self.ds_model.named_parameters(): - - if torch.distributed.get_rank() == 0: - print(f"yahao-dbg: ds_model pname: {pname} , p.shape: {p.shape}") - if pname in [ f"{self.mega_emb_wnum}.word_embeddings.weight", f"{self.mega_lm_head_wnum}.lm_head.weight", @@ -432,7 +414,7 @@ def transform_from_megads_to_hf(self): if not use_gqa: self._qkv_refactor_to_hf(pname, p, hf_layer) else: - #TODO(yahao): Not impl yet ... + #TODO(billishyahao): Not impl yet ... assert False elif subname in ["mlp.dense_h_to_4h.weight"]: self._mlphto4h_dense_refactor_to_hf(pname, p, hf_layer) @@ -449,9 +431,6 @@ def transform_from_megads_to_hf(self): else: print(f"Unrecognized weight type: {pname}") raise ValueError(f"Unrecognized weight type: {pname}") - - # if torch.distributed.get_rank() == 0: - # print("yahao debug save last #2 stage: " + str(self.hf_model.state_dict()['lm.embed_in.weight'])) self.is_refactored = True def record_mapping_info(self, record_msg): @@ -473,61 +452,6 @@ def inorder_show_record(self): torch.distributed.barrier() -def convert_hf_to_mega_ds(): - """Build the model.""" - args = get_args() - print_rank_0(f'building model ...') - see_memory_usage(f"Before Building Model", force=True) - - config = core_transformer_config_from_args(args) - with deepspeed.zero.Init( - data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed and not args.no_pipeline_parallel: - model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True) - else: - raise NotImplementedError("Not implemented") - - see_memory_usage(f"After Building Model", force=True) - if torch.distributed.get_rank() < 2: - print(f"{torch.distributed.get_rank()} {model}") - - # load and initialize HF weight dict - # print hf weights list & mega-ds weights list - hf_ckpt_dir = args.origin_hf_ckpt_dir - hf_ckpt_num_of_shards = args.hf_ckpt_num_shards - loaded = load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards) - print_distinct_weights(model) - - # refactor weight from hf to mega-ds - - cur_refactor = refactor(model, loaded, args, config) - cur_refactor.refactor() - cur_refactor.inorder_show_record() - - del loaded - - unwrapped_model = unwrap_model([model], (torchDDP, LocalDDP, Float16Module)) - optimizer = get_megatron_optimizer(unwrapped_model) - opt_param_scheduler = get_optimizer_param_scheduler(optimizer) - - #init model and save - print_rank_0(f"before deepspeed init") - ds_engine, _, _, _ = deepspeed.initialize( - model=model, - optimizer=optimizer, - args=args, - lr_scheduler=opt_param_scheduler, - mpu=mpu if args.no_pipeline_parallel else None) - print_rank_0(f"after deepspeed init") - - print_rank_0(f"mega-ds checkpoint will be saved in {args.save}") - save_checkpoint(0, [ds_engine], optimizer, opt_param_scheduler) - print_rank_0(f"save checkpoint completed") - def load_hf_weights(args, no_init): if args.load_mode == 'torchbin': assert no_init == False, "only work with init" @@ -538,6 +462,7 @@ def load_hf_weights(args, no_init): elif args.load_mode == 'auto': return load_and_print_hf_weight_auto(args.hf_ckpt_dir, no_init) + def convert_ckpt(): """Build the model.""" args = get_args() @@ -560,12 +485,7 @@ def convert_ckpt(): if torch.distributed.get_rank() < 2: print(f"{torch.distributed.get_rank()} {ds_model}") - # load and initialize HF weight dict - # print hf weights list & mega-ds weights list - - # 'torchbin', 'safetensor', 'auto' - hf_model = load_hf_weights(args, no_init=args.to_hf_ckpt) # print_distinct_weights(hf_model) @@ -582,7 +502,7 @@ def convert_ckpt(): if args.to_hf_ckpt: load_checkpoint([ds_engine], None, None, load_only_weights=True) - print(f"Load deepspeed actual checkpoint completed") + print_rank_0(f"completed to load deepspeed actual checkpoint") # refactor weight from hf to mega-ds and vice versa @@ -590,16 +510,35 @@ def convert_ckpt(): if args.to_hf_ckpt: cur_refactor.transform_from_megads_to_hf() else: - cur_refactor.refactor() + cur_refactor.transform_from_hf_to_megds() # cur_refactor.inorder_show_record() if args.to_hf_ckpt: + + save_path = "/yahao/Megatron-DeepSpeed/test-llama-7b-hf/" + if not os.path.exists(save_path): + Path(save_path).mkdir(parents=True, exist_ok=True) + ckpt_per_pp_path = os.path.join(save_path, f"model_pp{mpu.get_pipeline_model_parallel_rank()}.pt") + torch.save(cur_refactor.hf_dict, ckpt_per_pp_path) + + if torch.distributed.is_initialized(): + torch.distributed.barrier() + + print_rank_0(f"hf checkpoint will be saved in {save_path}/release ") if mpu.is_pipeline_last_stage(): + ## doing checkpoint merging and saving... # hf_model.tie_weights() + all_wei = {} + for pprank in range(mpu.get_pipeline_model_parallel_world_size()): + ckpt_per_pp_path = os.path.join(save_path, f"model_pp{pprank}.pt") + partial_wei = torch.load(ckpt_per_pp_path) + all_wei = all_wei | partial_wei + + hf_model.load_state_dict(all_wei) + # mega-ds checkpoint will be saved in args.save - print_rank_0(f"mega-ds checkpoint will be saved in /yahao/Megatron-DeepSpeed/test-llama-7b-hf ") - hf_model.save_pretrained("/yahao/Megatron-DeepSpeed/test-llama-7b-hf", safe_serialization=True) + hf_model.save_pretrained(os.path.join(save_path, "release"), safe_serialization=True) else: print_rank_0(f"mega-ds checkpoint will be saved in {args.save}") save_checkpoint(0, [ds_engine], None, None) From 05af2ab5e683b2126d51045a273f502894f65700 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Wed, 7 Aug 2024 10:46:27 +0000 Subject: [PATCH 3/3] update --- tools/hf2megads_weight_converter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py index 58f8e2d9ab..12468963c5 100755 --- a/tools/hf2megads_weight_converter.py +++ b/tools/hf2megads_weight_converter.py @@ -514,8 +514,7 @@ def convert_ckpt(): # cur_refactor.inorder_show_record() if args.to_hf_ckpt: - - save_path = "/yahao/Megatron-DeepSpeed/test-llama-7b-hf/" + save_path = args.save if not os.path.exists(save_path): Path(save_path).mkdir(parents=True, exist_ok=True) ckpt_per_pp_path = os.path.join(save_path, f"model_pp{mpu.get_pipeline_model_parallel_rank()}.pt")