From d29eb8798070719706e3548d61990200f8b18dc3 Mon Sep 17 00:00:00 2001
From: billishyahao <yahao.he@intel.com>
Date: Sun, 28 Jul 2024 12:07:40 +0000
Subject: [PATCH 1/3] add support converting checkpoint from hf to mds

---
 .../finetune_hf_llama/ds_config.json          |   8 +-
 .../finetune_hf_llama/finetune_llama.sh       |  23 +-
 megatron/global_vars.py                       |   1 +
 tools/hf2megads_weight_converter.py           | 314 +++++++++++++++++-
 4 files changed, 315 insertions(+), 31 deletions(-)
diff --git a/examples_deepspeed/finetune_hf_llama/ds_config.json b/examples_deepspeed/finetune_hf_llama/ds_config.json
index 9c0b332473..85f439ce47 100755
--- a/examples_deepspeed/finetune_hf_llama/ds_config.json
+++ b/examples_deepspeed/finetune_hf_llama/ds_config.json
@@ -1,11 +1,5 @@
 {
   "train_batch_size" : 256,
   "train_micro_batch_size_per_gpu": 16,
-  "steps_per_print": 100,
-  "zero_optimization": {
-    "stage": 0
-  },
-  "bf16": {
-    "enabled": true
-  }
+  "steps_per_print": 1
 }
diff --git a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
index c48ea11b93..e9d1b8c09a 100644
--- a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
+++ b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
@@ -1,8 +1,8 @@
 DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
-DATASET_PATH=./alpaca_data.json
+DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json
 # dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json
 
-HF_LLAMA_PATH=/data/llama-7b/
+HF_LLAMA_PATH=/yahao/llama-2-7b-hf/
 # weights link: https://huggingface.co/huggyllama/llama-7b
 
 MICRO_BATCH_SIZE=16
@@ -44,11 +44,20 @@ cat <<EOT > $DS_CONFIG
 EOT
 
 
-covert_args="deepspeed tools/hf2megads_weight_converter.py \
+covert_hf2mds_args="deepspeed tools/hf2megads_weight_converter.py \
 --hf-ckpt-num-shards 2 \
---origin-hf-ckpt-dir $HF_LLAMA_PATH \
+--hf-ckpt-dir $HF_LLAMA_PATH \
+--load-mode auto \
 --save $MEGA_DS_LLAMA_PATH"
 
+covert_mds2hf_args="deepspeed tools/hf2megads_weight_converter.py \
+--hf-ckpt-num-shards 2 \
+--hf-ckpt-dir $HF_LLAMA_PATH \
+--load-mode auto \
+--to-hf-ckpt \
+--load $MEGA_DS_LLAMA_PATH \
+--save $HF_LLAMA_PATH'-hf-out' "
+
 finetune_args="deepspeed finetune_llama.py \
 --load $MEGA_DS_LLAMA_PATH"
 
@@ -98,8 +107,10 @@ comm_args="--tensor-model-parallel-size $TP \
 --no-gradient-accumulation-fusion \
 --repeated-dataloader"
 
-if [ "$1" = "convert" ]; then
-    task_args="$covert_args"
+if [ "$1" = "convert_hf2mds" ]; then
+    task_args="$covert_hf2mds_args"
+elif [ "$1" = "convert_mds2hf" ]; then
+    task_args="$covert_mds2hf_args"
 else
     task_args="$finetune_args"
 fi
diff --git a/megatron/global_vars.py b/megatron/global_vars.py
index 3f9d6fd66b..ccd0a4e21e 100644
--- a/megatron/global_vars.py
+++ b/megatron/global_vars.py
@@ -175,6 +175,7 @@ def _set_wandb_writer(args):
                   'project or experiment name provided, '
                   'therefore WANDB logs will be written '
                   'according to random generated project or experiment name.', flush=True)
+            return
 
         try:
             import wandb
diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py
index bfbde1fd05..b91a584807 100755
--- a/tools/hf2megads_weight_converter.py
+++ b/tools/hf2megads_weight_converter.py
@@ -6,6 +6,7 @@
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
 from megatron import print_rank_0, get_tokenizer, get_args
 from megatron.core import mpu
+from megatron.core import tensor_parallel
 from megatron.core.utils import divide
 from megatron.model import GPTModelPipe, Float16Module
 from megatron.utils import unwrap_model
@@ -13,7 +14,7 @@
 from megatron.arguments import core_transformer_config_from_args
 from megatron.initialize import initialize_megatron
 from megatron.optimizer import get_megatron_optimizer
-from megatron.checkpointing import save_checkpoint
+from megatron.checkpointing import save_checkpoint, load_checkpoint
 from megatron.training import get_optimizer_param_scheduler
 from deepspeed.runtime.utils import see_memory_usage
 import deepspeed
@@ -22,11 +23,18 @@
 def add_extra_args(parser):
     """Text generation arguments."""
     group = parser.add_argument_group(title='hf2mega')
-    group.add_argument("--hf-ckpt-num-shards", type=int, help='num of llama ckpt.')
-    group.add_argument("--origin-hf-ckpt-dir",
+    group.add_argument("--hf-ckpt-dir",
                        type=str,
                        default="",
-                       help="the original path of the llama-hf ckpt")
+                       help="the llama-hf ckpt")
+    group.add_argument("--hf-ckpt-num-shards", type=int, default=-1, help='num of llama ckpt.')
+    group.add_argument("--load-mode", type=str,
+                       default=None,
+                       choices=['torchbin', 'safetensor', 'auto'],
+                       help="load ckpt format: pytorch.bin or model.safetensor or auto.")
+    group.add_argument("--to-hf-ckpt", action="store_true",
+                       help="by default convert from hf to megads"
+                            "if set, convert reversely from megads to hf ckpt.")
     return parser
 
 
@@ -55,6 +63,49 @@ def load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards):
     return loaded
 
 
+def load_and_print_hf_weight_from_safetensor(hf_ckpt_dir, hf_ckpt_num_of_shards):
+    from safetensors import safe_open
+    # Optimization point: We can selectively load specific 'shared' data to reduce CPU memory usage.
+    hf_model = {}
+    print_rank_0(
+        f"----------------------------hf weight list----------------------------")
+
+    for wid in range(1, hf_ckpt_num_of_shards + 1):
+        if hf_ckpt_num_of_shards == 1:
+            ckpt_path = f"{hf_ckpt_dir}/model.safetensors"
+        else:
+            ckpt_path = f"{hf_ckpt_dir}/model-{wid:05d}-of-{hf_ckpt_num_of_shards:05d}.safetensors"
+
+        with safe_open(ckpt_path, framework="pt", device="cpu") as f:
+            for k in f.keys():
+                print_rank_0(f"name: {k}, shape: {f.get_tensor(k).shape}")
+                assert k not in hf_model
+                hf_model[k] = f.get_tensor(k).clone()
+
+    return hf_model
+
+
+def load_and_print_hf_weight_auto(hf_ckpt_dir, no_init=True):
+    from transformers import AutoConfig, AutoModelForCausalLM
+    from transformers.modeling_utils import no_init_weights
+
+    if no_init:
+        hf_config = AutoConfig.from_pretrained(hf_ckpt_dir, trust_remote_code=True)
+        with no_init_weights():
+            hf_model = AutoModelForCausalLM.from_config(hf_config, trust_remote_code=True, torch_dtype=torch.bfloat16)
+    else:
+        hf_model = {}
+        hf_auto_model = AutoModelForCausalLM.from_pretrained(hf_ckpt_dir, trust_remote_code=True, torch_dtype=torch.bfloat16)
+        print_rank_0(
+            f"----------------------------hf weight list----------------------------")
+
+        for name, param in hf_auto_model.named_parameters():
+            hf_model[name] = param.clone()
+            print_rank_0(name)
+
+    return hf_model
+
+
 def print_distinct_weights(model):
     print_rank_0(
         f"----------------------------mega-ds weight list----------------------------")
@@ -70,16 +121,18 @@ def print_distinct_weights(model):
 
 
 class refactor:
-    def __init__(self, model, loaded, args, config):
+    def __init__(self, ds_model, hf_model, args, config):
         tokenizer = get_tokenizer()
         # align layer number
-        self.model = model
-        self.loaded = loaded
+        self.ds_model = ds_model
+        self.hf_model = hf_model
         self.config = config
 
         self.offset_num = 2
         self.mega_emb_wnum = 1
         self.mega_norm_wnum = args.num_layers + 2
+        self.num_attention_heads = args.num_attention_heads
+        self.num_key_value_heads = args.num_key_value_heads
         self.mega_lm_head_wnum = self.mega_norm_wnum + 1
         self.token_vocab = tokenizer.vocab_size
         self.padded_vocab_size = args.padded_vocab_size
@@ -95,7 +148,7 @@ def _embedding_refactor(self, pname, p):
             hf_name = "lm_head.weight"
         elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight":
             hf_name = "model.embed_tokens.weight"
-        hf_w = self.loaded[hf_name]
+        hf_w = self.hf_model[hf_name]
         assert hf_w.shape[0] == self.token_vocab
         per_partition_vocab_size, start_index, end_index = compute_partition_range(
             self.padded_vocab_size, self.tp_rank, self.tp_size)
@@ -112,13 +165,110 @@ def _embedding_refactor(self, pname, p):
         )
         return new_w
 
+    def _embedding_refactor_to_hf(self, pname, p):
+
+        if pname == f"{self.mega_lm_head_wnum}.lm_head.weight":
+            hf_w = self.hf_model.lm_head.weight
+        elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight":
+            hf_w = self.hf_model.model.embed_tokens.weight
+
+        ds_w = p
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
+        
+        hf_w.data.copy_(ds_w_all_rank[:hf_w.shape[0], :])
+
+    def _direct_refactor_to_hf(self, pname, p, hf_layer=None, subname=None):
+        ds_w = p
+        if pname in [f"{self.mega_norm_wnum}.weight"]:
+            hf_w = self.hf_model.model.norm.weight
+        elif subname in ["input_layernorm.weight"]:
+            hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight
+        elif subname in ["post_attention_layernorm.weight"]:
+            hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight
+
+        hf_w.data.copy_(ds_w)
+
+    def _attn_dense_refactor_to_hf(self, pname, p, hf_layer, subname):
+        ds_w = p
+        
+        if subname == "self_attention.dense.weight":
+            hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight
+        elif subname == "mlp.dense_4h_to_h.weight":
+            hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight
+
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w)
+
+        hf_w.data.copy_(ds_w_all_rank)
+
+    def _mlphto4h_dense_refactor_to_hf(self, pname, p, hf_layer):
+        ds_w = p
+        hf_g = self.hf_model.model.layers[hf_layer].mlp.gate_proj.weight
+        hf_u = self.hf_model.model.layers[hf_layer].mlp.up_proj.weight
+        
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
+
+        if torch.distributed.get_rank() == 0:
+            print(f"yahao-dbg: hf_g: {hf_g.shape}, ds_w_all_rank: {ds_w_all_rank.shape}")
+        
+        ds_w_shape = ds_w_all_rank.shape
+        
+        ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1])
+
+        hf_g.data.copy_(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1]))
+        hf_u.data.copy_(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1]))
+
+    
+    def _qkv_refactor_to_hf(self, pname, p, hf_layer):
+
+        ds_w = p
+
+        #TODO(yahao): Need to check function if we are using only 1 GPU.
+
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
+
+        # [np/8, 3, 128, 6144]
+
+        # [np, 3, 128, 6144]
+
+        # hf_w = self.hf_model.model.layers[hf_layer].attention.query_key_value.weight
+        hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight
+        hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight
+        hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight
+
+        oldshape = hf_q.shape
+
+        hidden_size = oldshape[-1]
+
+        hidden_size_per_attention_head = divide(hidden_size,
+                                                self.config.num_attention_heads)
+        
+        num_attention_heads_per_partition = divide(self.config.num_attention_heads,
+                                                   self.tp_size)
+
+        newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size)
+
+
+        if torch.distributed.get_rank() == 0:
+            print(f"yahao-dbg: qkv refactor : oldshape: {oldshape}, newshape: {newshape}")
+
+        ds_w_out = ds_w_all_rank.reshape(*newshape)
+
+        hf_q.data.copy_(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1]))
+        hf_k.data.copy_(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1]))
+        hf_v.data.copy_(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1]))
+
+
     def _direct_refactor(self, pname, p, hf_layer=None, subname=None):
         if pname == f"{self.mega_norm_wnum}.weight":
             hf_name = "model.norm.weight"
         elif subname in ["input_layernorm.weight", "post_attention_layernorm.weight"]:
             hf_name = f"model.layers.{hf_layer}.{subname}"
 
-        new_w = hf_w = self.loaded[hf_name]
+        new_w = hf_w = self.hf_model[hf_name]
         self.record_mapping_info(
             f"mega-ds:{pname,p.data.shape}<--hf{hf_name,}  {hf_w.shape}")
         return new_w
@@ -127,9 +277,9 @@ def _qkv_refactor(self, pname, p, hf_layer):
         hf_wq_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight"
         hf_wk_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight"
         hf_wv_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight"
-        wq = self.loaded[hf_wq_name]
-        wk = self.loaded[hf_wk_name]
-        wv = self.loaded[hf_wv_name]
+        wq = self.hf_model[hf_wq_name]
+        wk = self.hf_model[hf_wk_name]
+        wv = self.hf_model[hf_wv_name]
 
         hidden_size = wq.shape[0]
         per_partition_size, start_index, end_index = compute_partition_range(
@@ -159,8 +309,8 @@ def _qkv_refactor(self, pname, p, hf_layer):
     def _mlphto4h_dense_refactor(self, pname, p, hf_layer):
         hf_w_gate_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight"
         hf_w_up_name = f"model.layers.{hf_layer}.mlp.up_proj.weight"
-        w_gate = self.loaded[hf_w_gate_name]
-        w_up = self.loaded[hf_w_up_name]
+        w_gate = self.hf_model[hf_w_gate_name]
+        w_up = self.hf_model[hf_w_up_name]
 
         hidden_size = w_gate.shape[0]
         per_partition_size, start_index, end_index = compute_partition_range(
@@ -184,7 +334,7 @@ def _attn_dense_refactor(self, pname, p, hf_layer, subname):
         else:
             hf_name = f"model.layers.{hf_layer}.mlp.down_proj.weight"
 
-        hf_w = self.loaded[hf_name]
+        hf_w = self.hf_model[hf_name]
         hidden_size = hf_w.shape[1]
         per_partition_size, start_index, end_index = compute_partition_range(
             hidden_size, self.tp_rank, self.tp_size)
@@ -200,7 +350,7 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname):
             hf_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight"
         else:
             hf_name = f"model.layers.{hf_layer}.mlp.up_proj.weight"
-        hf_w = self.loaded[hf_name]
+        hf_w = self.hf_model[hf_name]
         hidden_size = hf_w.shape[0]
         per_partition_size, start_index, end_index = compute_partition_range(
             hidden_size, self.tp_rank, self.tp_size)
@@ -215,7 +365,10 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname):
     def refactor(self):
         assert self.is_refactored == False
         new_w = None
-        for pname, p in self.model.named_parameters():
+        for pname, p in self.ds_model.named_parameters():
+            if torch.distributed.get_rank() == 0:
+                print(f"yahao-dbg: pname: {pname}, p.shape: {p.shape}")
+
             if pname in [
                     f"{self.mega_emb_wnum}.word_embeddings.weight",
                     f"{self.mega_lm_head_wnum}.lm_head.weight"
@@ -253,6 +406,54 @@ def refactor(self):
             new_w = None
         self.is_refactored = True
 
+    def transform_from_megads_to_hf(self):
+        use_gqa = True if self.num_attention_heads != self.num_key_value_heads else False
+
+        for pname, p in self.ds_model.named_parameters():
+
+            if torch.distributed.get_rank() == 0:
+                print(f"yahao-dbg: ds_model pname: {pname} , p.shape: {p.shape}")
+
+            if pname in [
+                    f"{self.mega_emb_wnum}.word_embeddings.weight",
+                    f"{self.mega_lm_head_wnum}.lm_head.weight",
+            ]:
+                self._embedding_refactor_to_hf(pname, p)
+            elif pname in [
+                    f"{self.mega_norm_wnum}.weight",
+            ]:
+                self._direct_refactor_to_hf(pname, p)
+            else:
+                mobj = self.decoder_pat.match(pname)
+                layer_num = int(mobj.group(1))
+                subname = mobj.group(2)
+                hf_layer = layer_num - self.offset_num
+                if subname in ["self_attention.query_key_value.weight"]:
+                    if not use_gqa:
+                        self._qkv_refactor_to_hf(pname, p, hf_layer)
+                    else:
+                        #TODO(yahao): Not impl yet ...
+                        assert False
+                elif subname in ["mlp.dense_h_to_4h.weight"]:
+                    self._mlphto4h_dense_refactor_to_hf(pname, p, hf_layer)
+                elif subname in [
+                    "self_attention.dense.weight",
+                    "mlp.dense_4h_to_h.weight"
+                ]:
+                    self._attn_dense_refactor_to_hf(pname, p, hf_layer, subname)
+                elif subname in [
+                    "input_layernorm.weight",
+                    "post_attention_layernorm.weight",
+                ]:
+                    self._direct_refactor_to_hf(pname, p, hf_layer, subname)
+                else:
+                    print(f"Unrecognized weight type: {pname}")
+                    raise ValueError(f"Unrecognized weight type: {pname}")
+
+        # if torch.distributed.get_rank() == 0:
+        #     print("yahao debug save last #2 stage:  " + str(self.hf_model.state_dict()['lm.embed_in.weight']))
+        self.is_refactored = True
+
     def record_mapping_info(self, record_msg):
         self.refactor_weight_list.append(record_msg)
 
@@ -327,8 +528,85 @@ def convert_hf_to_mega_ds():
     save_checkpoint(0, [ds_engine], optimizer, opt_param_scheduler)
     print_rank_0(f"save checkpoint completed")
 
+def load_hf_weights(args, no_init):
+    if args.load_mode == 'torchbin':
+        assert no_init == False, "only work with init"
+        return load_and_print_hf_weight(args.hf_ckpt_dir, args.hf_ckpt_num_shards)
+    elif args.load_mode == 'safetensor':
+        assert no_init == False, "only work with init"
+        return load_and_print_hf_weight_from_safetensor(args.hf_ckpt_dir, args.hf_ckpt_num_shards)
+    elif args.load_mode ==  'auto':
+        return load_and_print_hf_weight_auto(args.hf_ckpt_dir, no_init)
+
+def convert_ckpt():
+    """Build the model."""
+    args = get_args()
+    print_rank_0(f'building model ...')
+    see_memory_usage(f"Before Building Model", force=True)
+
+    config = core_transformer_config_from_args(args)
+    with deepspeed.zero.Init(
+            data_parallel_group=mpu.get_data_parallel_group(),
+            remote_device=None if args.remote_device == 'none' else args.remote_device,
+            config_dict_or_path=args.deepspeed_config,
+            enabled=args.zero_stage == 3,
+            mpu=mpu):
+        if args.deepspeed and not args.no_pipeline_parallel:
+            ds_model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True)
+        else:
+            raise NotImplementedError("Not implemented")
+
+    see_memory_usage(f"After Building Model", force=True)
+    if torch.distributed.get_rank() < 2:
+        print(f"{torch.distributed.get_rank()} {ds_model}")
+
+    # load and initialize HF weight dict
+    # print hf weights list & mega-ds weights list
+
+
+    # 'torchbin', 'safetensor', 'auto'
+
+    hf_model = load_hf_weights(args, no_init=args.to_hf_ckpt)
+
+    # print_distinct_weights(hf_model)
+
+    #init model and save
+    print_rank_0(f"before deepspeed init")
+    ds_engine, _, _, _ = deepspeed.initialize(
+        model=ds_model,
+        optimizer=None,
+        args=args,
+        lr_scheduler=None,
+        mpu=mpu if args.no_pipeline_parallel else None)
+    print_rank_0(f"after deepspeed init")
+
+    if args.to_hf_ckpt:
+        load_checkpoint([ds_engine], None, None, load_only_weights=True)
+        print(f"Load deepspeed actual checkpoint completed")
+
+    # refactor weight from hf to mega-ds and vice versa
+
+    cur_refactor = refactor(ds_model, hf_model, args, config)
+    if args.to_hf_ckpt:
+        cur_refactor.transform_from_megads_to_hf()
+    else:
+        cur_refactor.refactor()
+    # cur_refactor.inorder_show_record()
+
+    if args.to_hf_ckpt:
+        if mpu.is_pipeline_last_stage():
+            # hf_model.tie_weights()
+
+            # mega-ds checkpoint will be saved in  args.save
+            print_rank_0(f"mega-ds checkpoint will be saved in /yahao/Megatron-DeepSpeed/test-llama-7b-hf ")
+            hf_model.save_pretrained("/yahao/Megatron-DeepSpeed/test-llama-7b-hf", safe_serialization=True)
+    else:
+        print_rank_0(f"mega-ds checkpoint will be saved in {args.save}")
+        save_checkpoint(0, [ds_engine], None, None)
+    
+    print_rank_0(f"save checkpoint completed")
 
 if __name__ == "__main__":
 
     initialize_megatron(extra_args_provider=add_extra_args)
-    convert_hf_to_mega_ds()
+    convert_ckpt()

From 738e3e173976fe5084f30931f760fb55d429613e Mon Sep 17 00:00:00 2001
From: billishyahao <yahao.he@intel.com>
Date: Wed, 7 Aug 2024 10:36:39 +0000
Subject: [PATCH 2/3] Fix PP issue

---
 .../finetune_hf_llama/finetune_llama.sh       |   2 +-
 tools/hf2megads_weight_converter.py           | 277 +++++++-----------
 2 files changed, 109 insertions(+), 170 deletions(-)

diff --git a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
index e9d1b8c09a..ab8bfdf419 100644
--- a/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
+++ b/examples_deepspeed/finetune_hf_llama/finetune_llama.sh
@@ -2,7 +2,7 @@ DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
 DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json
 # dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json
 
-HF_LLAMA_PATH=/yahao/llama-2-7b-hf/
+HF_LLAMA_PATH=/data/llama-2-7b-hf/
 # weights link: https://huggingface.co/huggyllama/llama-7b
 
 MICRO_BATCH_SIZE=16
diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py
index b91a584807..58f8e2d9ab 100755
--- a/tools/hf2megads_weight_converter.py
+++ b/tools/hf2megads_weight_converter.py
@@ -3,6 +3,7 @@
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import torch.distributed
 from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
 from megatron import print_rank_0, get_tokenizer, get_args
 from megatron.core import mpu
@@ -18,6 +19,9 @@
 from megatron.training import get_optimizer_param_scheduler
 from deepspeed.runtime.utils import see_memory_usage
 import deepspeed
+import copy
+from pathlib import Path
+
 
 
 def add_extra_args(parser):
@@ -126,6 +130,7 @@ def __init__(self, ds_model, hf_model, args, config):
         # align layer number
         self.ds_model = ds_model
         self.hf_model = hf_model
+        self.hf_dict = {} # for handling pp case when converting mds => hf
         self.config = config
 
         self.offset_num = 2
@@ -165,101 +170,7 @@ def _embedding_refactor(self, pname, p):
         )
         return new_w
 
-    def _embedding_refactor_to_hf(self, pname, p):
-
-        if pname == f"{self.mega_lm_head_wnum}.lm_head.weight":
-            hf_w = self.hf_model.lm_head.weight
-        elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight":
-            hf_w = self.hf_model.model.embed_tokens.weight
-
-        ds_w = p
-        with torch.no_grad():
-            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
-        
-        hf_w.data.copy_(ds_w_all_rank[:hf_w.shape[0], :])
-
-    def _direct_refactor_to_hf(self, pname, p, hf_layer=None, subname=None):
-        ds_w = p
-        if pname in [f"{self.mega_norm_wnum}.weight"]:
-            hf_w = self.hf_model.model.norm.weight
-        elif subname in ["input_layernorm.weight"]:
-            hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight
-        elif subname in ["post_attention_layernorm.weight"]:
-            hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight
-
-        hf_w.data.copy_(ds_w)
-
-    def _attn_dense_refactor_to_hf(self, pname, p, hf_layer, subname):
-        ds_w = p
-        
-        if subname == "self_attention.dense.weight":
-            hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight
-        elif subname == "mlp.dense_4h_to_h.weight":
-            hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight
-
-        with torch.no_grad():
-            ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w)
-
-        hf_w.data.copy_(ds_w_all_rank)
-
-    def _mlphto4h_dense_refactor_to_hf(self, pname, p, hf_layer):
-        ds_w = p
-        hf_g = self.hf_model.model.layers[hf_layer].mlp.gate_proj.weight
-        hf_u = self.hf_model.model.layers[hf_layer].mlp.up_proj.weight
-        
-        with torch.no_grad():
-            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
-
-        if torch.distributed.get_rank() == 0:
-            print(f"yahao-dbg: hf_g: {hf_g.shape}, ds_w_all_rank: {ds_w_all_rank.shape}")
-        
-        ds_w_shape = ds_w_all_rank.shape
-        
-        ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1])
-
-        hf_g.data.copy_(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1]))
-        hf_u.data.copy_(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1]))
-
     
-    def _qkv_refactor_to_hf(self, pname, p, hf_layer):
-
-        ds_w = p
-
-        #TODO(yahao): Need to check function if we are using only 1 GPU.
-
-        with torch.no_grad():
-            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
-
-        # [np/8, 3, 128, 6144]
-
-        # [np, 3, 128, 6144]
-
-        # hf_w = self.hf_model.model.layers[hf_layer].attention.query_key_value.weight
-        hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight
-        hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight
-        hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight
-
-        oldshape = hf_q.shape
-
-        hidden_size = oldshape[-1]
-
-        hidden_size_per_attention_head = divide(hidden_size,
-                                                self.config.num_attention_heads)
-        
-        num_attention_heads_per_partition = divide(self.config.num_attention_heads,
-                                                   self.tp_size)
-
-        newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size)
-
-
-        if torch.distributed.get_rank() == 0:
-            print(f"yahao-dbg: qkv refactor : oldshape: {oldshape}, newshape: {newshape}")
-
-        ds_w_out = ds_w_all_rank.reshape(*newshape)
-
-        hf_q.data.copy_(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1]))
-        hf_k.data.copy_(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1]))
-        hf_v.data.copy_(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1]))
 
 
     def _direct_refactor(self, pname, p, hf_layer=None, subname=None):
@@ -273,6 +184,7 @@ def _direct_refactor(self, pname, p, hf_layer=None, subname=None):
             f"mega-ds:{pname,p.data.shape}<--hf{hf_name,}  {hf_w.shape}")
         return new_w
 
+
     def _qkv_refactor(self, pname, p, hf_layer):
         hf_wq_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight"
         hf_wk_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight"
@@ -362,12 +274,10 @@ def _mlphto4h1_refactor(self, pname, p, hf_layer, subname):
         )
         return new_w
 
-    def refactor(self):
+    def transform_from_hf_to_megds(self):
         assert self.is_refactored == False
         new_w = None
         for pname, p in self.ds_model.named_parameters():
-            if torch.distributed.get_rank() == 0:
-                print(f"yahao-dbg: pname: {pname}, p.shape: {p.shape}")
 
             if pname in [
                     f"{self.mega_emb_wnum}.word_embeddings.weight",
@@ -406,14 +316,86 @@ def refactor(self):
             new_w = None
         self.is_refactored = True
 
+    
+    def _embedding_refactor_to_hf(self, pname, ds_w):
+        if pname == f"{self.mega_lm_head_wnum}.lm_head.weight":
+            hf_w = self.hf_model.lm_head.weight
+            hf_w_name = "lm_head.weight"
+        elif pname == f"{self.mega_emb_wnum}.word_embeddings.weight":
+            hf_w = self.hf_model.model.embed_tokens.weight
+            hf_w_name = "model.embed_tokens.weight"
+
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
+        
+        self.hf_dict[hf_w_name] = copy.deepcopy(ds_w_all_rank[:hf_w.shape[0], :])
+
+    def _direct_refactor_to_hf(self, pname, ds_w, hf_layer=None, subname=None):
+        if pname in [f"{self.mega_norm_wnum}.weight"]:
+            hf_w = self.hf_model.model.norm.weight
+            hf_w_name = "model.norm.weight"
+        elif subname in ["input_layernorm.weight"]:
+            hf_w = self.hf_model.model.layers[hf_layer].input_layernorm.weight
+            hf_w_name = f"model.layers.{hf_layer}.input_layernorm.weight"
+        elif subname in ["post_attention_layernorm.weight"]:
+            hf_w = self.hf_model.model.layers[hf_layer].post_attention_layernorm.weight
+            hf_w_name = f"model.layers.{hf_layer}.post_attention_layernorm.weight"
+
+        self.hf_dict[hf_w_name] = copy.deepcopy(ds_w)
+
+    def _attn_dense_refactor_to_hf(self, pname, ds_w, hf_layer, subname):
+        if subname == "self_attention.dense.weight":
+            hf_w = self.hf_model.model.layers[hf_layer].self_attn.o_proj.weight
+            hf_w_name = f"model.layers.{hf_layer}.self_attn.o_proj.weight"
+        elif subname == "mlp.dense_4h_to_h.weight":
+            hf_w = self.hf_model.model.layers[hf_layer].mlp.down_proj.weight
+            hf_w_name = f"model.layers.{hf_layer}.mlp.down_proj.weight"
+
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_last_dim(ds_w)
+
+        self.hf_dict[hf_w_name] = copy.deepcopy(ds_w_all_rank)
+
+    def _mlphto4h_dense_refactor_to_hf(self, pname, ds_w, hf_layer):
+        hf_g_name = f"model.layers.{hf_layer}.mlp.gate_proj.weight"
+        hf_u_name = f"model.layers.{hf_layer}.mlp.up_proj.weight"
+        
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
+        
+        ds_w_shape = ds_w_all_rank.shape
+        ds_w_all_rank = ds_w_all_rank.reshape(self.tp_size, 2, -1, ds_w_shape[-1])
+        self.hf_dict[hf_g_name] = copy.deepcopy(ds_w_all_rank[:, 0, :, :].reshape(-1, ds_w_shape[-1]))
+        self.hf_dict[hf_u_name] = copy.deepcopy(ds_w_all_rank[:, 1, :, :].reshape(-1, ds_w_shape[-1]))
+
+    
+    def _qkv_refactor_to_hf(self, pname, ds_w, hf_layer):
+        with torch.no_grad():
+            ds_w_all_rank = tensor_parallel.mappings._gather_along_first_dim(ds_w)
+
+        hf_q = self.hf_model.model.layers[hf_layer].self_attn.q_proj.weight
+        hf_k = self.hf_model.model.layers[hf_layer].self_attn.k_proj.weight
+        hf_v = self.hf_model.model.layers[hf_layer].self_attn.v_proj.weight
+        hf_q_name = f"model.layers.{hf_layer}.self_attn.q_proj.weight"
+        hf_k_name = f"model.layers.{hf_layer}.self_attn.k_proj.weight"
+        hf_v_name = f"model.layers.{hf_layer}.self_attn.v_proj.weight"
+        oldshape = hf_q.shape
+        hidden_size = oldshape[-1]
+        hidden_size_per_attention_head = divide(hidden_size,
+                                                self.config.num_attention_heads)
+        num_attention_heads_per_partition = divide(self.config.num_attention_heads,
+                                                   self.tp_size)
+        newshape = (self.tp_size, num_attention_heads_per_partition, 3, hidden_size_per_attention_head, hidden_size)
+        ds_w_out = ds_w_all_rank.reshape(*newshape)
+        self.hf_dict[hf_q_name] = copy.deepcopy(ds_w_out[:, :, 0, :, :].reshape(-1, oldshape[-1]))
+        self.hf_dict[hf_k_name] = copy.deepcopy(ds_w_out[:, :, 1, :, :].reshape(-1, oldshape[-1]))
+        self.hf_dict[hf_v_name] = copy.deepcopy(ds_w_out[:, :, 2, :, :].reshape(-1, oldshape[-1]))
+
+
     def transform_from_megads_to_hf(self):
         use_gqa = True if self.num_attention_heads != self.num_key_value_heads else False
 
         for pname, p in self.ds_model.named_parameters():
-
-            if torch.distributed.get_rank() == 0:
-                print(f"yahao-dbg: ds_model pname: {pname} , p.shape: {p.shape}")
-
             if pname in [
                     f"{self.mega_emb_wnum}.word_embeddings.weight",
                     f"{self.mega_lm_head_wnum}.lm_head.weight",
@@ -432,7 +414,7 @@ def transform_from_megads_to_hf(self):
                     if not use_gqa:
                         self._qkv_refactor_to_hf(pname, p, hf_layer)
                     else:
-                        #TODO(yahao): Not impl yet ...
+                        #TODO(billishyahao): Not impl yet ...
                         assert False
                 elif subname in ["mlp.dense_h_to_4h.weight"]:
                     self._mlphto4h_dense_refactor_to_hf(pname, p, hf_layer)
@@ -449,9 +431,6 @@ def transform_from_megads_to_hf(self):
                 else:
                     print(f"Unrecognized weight type: {pname}")
                     raise ValueError(f"Unrecognized weight type: {pname}")
-
-        # if torch.distributed.get_rank() == 0:
-        #     print("yahao debug save last #2 stage:  " + str(self.hf_model.state_dict()['lm.embed_in.weight']))
         self.is_refactored = True
 
     def record_mapping_info(self, record_msg):
@@ -473,61 +452,6 @@ def inorder_show_record(self):
                 torch.distributed.barrier()
 
 
-def convert_hf_to_mega_ds():
-    """Build the model."""
-    args = get_args()
-    print_rank_0(f'building model ...')
-    see_memory_usage(f"Before Building Model", force=True)
-
-    config = core_transformer_config_from_args(args)
-    with deepspeed.zero.Init(
-            data_parallel_group=mpu.get_data_parallel_group(),
-            remote_device=None if args.remote_device == 'none' else args.remote_device,
-            config_dict_or_path=args.deepspeed_config,
-            enabled=args.zero_stage == 3,
-            mpu=mpu):
-        if args.deepspeed and not args.no_pipeline_parallel:
-            model = GPTModelPipe(config, num_tokentypes=0, parallel_output=True)
-        else:
-            raise NotImplementedError("Not implemented")
-
-    see_memory_usage(f"After Building Model", force=True)
-    if torch.distributed.get_rank() < 2:
-        print(f"{torch.distributed.get_rank()} {model}")
-
-    # load and initialize HF weight dict
-    # print hf weights list & mega-ds weights list
-    hf_ckpt_dir = args.origin_hf_ckpt_dir
-    hf_ckpt_num_of_shards = args.hf_ckpt_num_shards
-    loaded = load_and_print_hf_weight(hf_ckpt_dir, hf_ckpt_num_of_shards)
-    print_distinct_weights(model)
-
-    # refactor weight from hf to mega-ds
-
-    cur_refactor = refactor(model, loaded, args, config)
-    cur_refactor.refactor()
-    cur_refactor.inorder_show_record()
-
-    del loaded
-
-    unwrapped_model = unwrap_model([model], (torchDDP, LocalDDP, Float16Module))
-    optimizer = get_megatron_optimizer(unwrapped_model)
-    opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
-
-    #init model and save
-    print_rank_0(f"before deepspeed init")
-    ds_engine, _, _, _ = deepspeed.initialize(
-        model=model,
-        optimizer=optimizer,
-        args=args,
-        lr_scheduler=opt_param_scheduler,
-        mpu=mpu if args.no_pipeline_parallel else None)
-    print_rank_0(f"after deepspeed init")
-
-    print_rank_0(f"mega-ds checkpoint will be saved in {args.save}")
-    save_checkpoint(0, [ds_engine], optimizer, opt_param_scheduler)
-    print_rank_0(f"save checkpoint completed")
-
 def load_hf_weights(args, no_init):
     if args.load_mode == 'torchbin':
         assert no_init == False, "only work with init"
@@ -538,6 +462,7 @@ def load_hf_weights(args, no_init):
     elif args.load_mode ==  'auto':
         return load_and_print_hf_weight_auto(args.hf_ckpt_dir, no_init)
 
+
 def convert_ckpt():
     """Build the model."""
     args = get_args()
@@ -560,12 +485,7 @@ def convert_ckpt():
     if torch.distributed.get_rank() < 2:
         print(f"{torch.distributed.get_rank()} {ds_model}")
 
-    # load and initialize HF weight dict
-    # print hf weights list & mega-ds weights list
-
-
     # 'torchbin', 'safetensor', 'auto'
-
     hf_model = load_hf_weights(args, no_init=args.to_hf_ckpt)
 
     # print_distinct_weights(hf_model)
@@ -582,7 +502,7 @@ def convert_ckpt():
 
     if args.to_hf_ckpt:
         load_checkpoint([ds_engine], None, None, load_only_weights=True)
-        print(f"Load deepspeed actual checkpoint completed")
+        print_rank_0(f"completed to load deepspeed actual checkpoint")
 
     # refactor weight from hf to mega-ds and vice versa
 
@@ -590,16 +510,35 @@ def convert_ckpt():
     if args.to_hf_ckpt:
         cur_refactor.transform_from_megads_to_hf()
     else:
-        cur_refactor.refactor()
+        cur_refactor.transform_from_hf_to_megds()
     # cur_refactor.inorder_show_record()
 
     if args.to_hf_ckpt:
+        
+        save_path = "/yahao/Megatron-DeepSpeed/test-llama-7b-hf/"
+        if not os.path.exists(save_path):
+            Path(save_path).mkdir(parents=True, exist_ok=True)
+        ckpt_per_pp_path = os.path.join(save_path, f"model_pp{mpu.get_pipeline_model_parallel_rank()}.pt")
+        torch.save(cur_refactor.hf_dict, ckpt_per_pp_path)
+
+        if torch.distributed.is_initialized():
+            torch.distributed.barrier()
+
+        print_rank_0(f"hf checkpoint will be saved in {save_path}/release ")
         if mpu.is_pipeline_last_stage():
+            ## doing checkpoint merging and saving...
             # hf_model.tie_weights()
 
+            all_wei = {}
+            for pprank in range(mpu.get_pipeline_model_parallel_world_size()):
+                ckpt_per_pp_path = os.path.join(save_path, f"model_pp{pprank}.pt")
+                partial_wei = torch.load(ckpt_per_pp_path)
+                all_wei = all_wei | partial_wei
+
+            hf_model.load_state_dict(all_wei)
+
             # mega-ds checkpoint will be saved in  args.save
-            print_rank_0(f"mega-ds checkpoint will be saved in /yahao/Megatron-DeepSpeed/test-llama-7b-hf ")
-            hf_model.save_pretrained("/yahao/Megatron-DeepSpeed/test-llama-7b-hf", safe_serialization=True)
+            hf_model.save_pretrained(os.path.join(save_path, "release"), safe_serialization=True)
     else:
         print_rank_0(f"mega-ds checkpoint will be saved in {args.save}")
         save_checkpoint(0, [ds_engine], None, None)

From 05af2ab5e683b2126d51045a273f502894f65700 Mon Sep 17 00:00:00 2001
From: billishyahao <yahao.he@intel.com>
Date: Wed, 7 Aug 2024 10:46:27 +0000
Subject: [PATCH 3/3] update

---
 tools/hf2megads_weight_converter.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/hf2megads_weight_converter.py b/tools/hf2megads_weight_converter.py
index 58f8e2d9ab..12468963c5 100755
--- a/tools/hf2megads_weight_converter.py
+++ b/tools/hf2megads_weight_converter.py
@@ -514,8 +514,7 @@ def convert_ckpt():
     # cur_refactor.inorder_show_record()
 
     if args.to_hf_ckpt:
-        
-        save_path = "/yahao/Megatron-DeepSpeed/test-llama-7b-hf/"
+        save_path = args.save
         if not os.path.exists(save_path):
             Path(save_path).mkdir(parents=True, exist_ok=True)
         ckpt_per_pp_path = os.path.join(save_path, f"model_pp{mpu.get_pipeline_model_parallel_rank()}.pt")