@@ -0,0 +1,191 @@
+import json
+import os
+from typing import List, Union
+
+import click
+import torch
+from tqdm import tqdm
+from transformers import LlamaForCausalLM  # @manual
+
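+# Number of consolidated checkpoint shards (model-parallel partitions) for each
+# Llama model size; the "f" suffix marks the fine-tuned variants.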
+NUM_SHARDS = {
+    "7B": 1,
+    "7Bf": 1,
+    "13B": 2,
+    "13Bf": 2,
+    "34B": 4,
+    "30B": 4,
+    "65B": 8,
+    "70B": 8,
+    "70Bf": 8,
+}
+
+
+def read_json(path):
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def write_model(model_path, model_size, output_base_path):
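+    """Re-shard a Hugging Face Llama checkpoint into Meta's consolidated format.
+
+    Expects params.json to already exist under output_base_path and writes one
+    consolidated.<shard>.pth file per model-parallel shard.
+    """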
+    dtype = torch.bfloat16
+
+    params_path = os.path.join(output_base_path, "params.json")
+    assert os.path.isfile(params_path), f"{params_path} does not exist"
+    params = read_json(params_path)
+    num_shards = NUM_SHARDS[model_size]
+    n_layers = params["n_layers"]
+    n_heads = params["n_heads"]
+    n_heads_per_shard = n_heads // num_shards
+    dim = params["dim"]
+    dims_per_head = dim // n_heads
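+    # Rotary (RoPE) inverse frequencies with the standard base of 10000; the
+    # same tensor is replicated into every shard as "rope.freqs".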
+    base = 10000.0
+    inv_freq = (
+        1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+    ).to(dtype)
+
+    if "n_kv_heads" in params:
+        num_key_value_heads = params["n_kv_heads"]  # for GQA / MQA
+        num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
+        key_value_dim = dim // num_key_value_heads
+    else:  # compatibility with other checkpoints
+        num_key_value_heads = n_heads
+        num_local_key_value_heads = n_heads_per_shard
+        key_value_dim = dim
+
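+    # Load the full Hugging Face model on CPU; low_cpu_mem_usage avoids
+    # materializing an extra copy of the weights during loading.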
+    model = LlamaForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+    )
+    loaded = model.state_dict()
+
+    # permute for sliced rotary
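+    # (inverse of the q/k permutation applied when exporting Meta weights to
+    # the Hugging Face format)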
+    def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
+        return (
+            w.view(n_heads, 2, dim1 // n_heads // 2, dim2)
+            .transpose(1, 2)
+            .reshape(dim1, dim2)
+        )
+
+    state_dict = [{} for _ in range(num_shards)]
+
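+    # Copy a tensor into every shard: a plain tensor is replicated, a list is
+    # assumed to hold one pre-split tensor per shard.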
+    def insert(name: str, tensor: Union[List, torch.Tensor]):
+        for i in range(num_shards):
+            state_dict[i][name] = (
+                tensor[i].clone() if isinstance(tensor, list) else tensor
+            )
+
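+    # Split a tensor into num_shards chunks along `dim` and assign one chunk
+    # to each shard.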
+    def insert_chunk(name: str, tensor: torch.Tensor, dim: int):
+        tensors = tensor.chunk(num_shards, dim=dim)
+        for i, tensor in enumerate(tensors):
+            state_dict[i][name] = tensor.clone()
+
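+    # Non-layer weights: token embeddings are split along the embedding dim,
+    # the final norm is replicated, and the output head is split along the
+    # vocabulary dim.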
+ insert_chunk("tok_embeddings.weight", loaded["model.embed_tokens.weight"], 1)
|
|
|
+ insert("norm.weight", loaded["model.norm.weight"])
|
|
|
+ insert_chunk("output.weight", loaded["lm_head.weight"], 0)
|
|
|
+
|
|
|
+ for layer_i in tqdm(range(n_layers), desc="Converting layers"):
|
|
|
+
|
|
|
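+        # q/k projections are un-permuted back to Meta's interleaved rotary
+        # layout, reshaped to heads, and split head-wise across shards.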
+        ts = (
+            permute(loaded[f"model.layers.{layer_i}.self_attn.q_proj.weight"])
+            .view(n_heads_per_shard * num_shards, dims_per_head, dim)
+            .chunk(num_shards, dim=0)
+        )
+        insert(f"layers.{layer_i}.attention.wq.weight", [t.view(-1, dim) for t in ts])
+
+        ts = (
+            permute(
+                loaded[f"model.layers.{layer_i}.self_attn.k_proj.weight"],
+                num_key_value_heads,
+                key_value_dim,
+                dim,
+            )
+            .view(num_local_key_value_heads * num_shards, dims_per_head, dim)
+            .chunk(num_shards, dim=0)
+        )
+        insert(f"layers.{layer_i}.attention.wk.weight", [t.view(-1, dim) for t in ts])
+
+        ts = (
+            loaded[f"model.layers.{layer_i}.self_attn.v_proj.weight"]
+            .view(num_local_key_value_heads * num_shards, dims_per_head, dim)
+            .chunk(num_shards, dim=0)
+        )
+        insert(f"layers.{layer_i}.attention.wv.weight", [t.view(-1, dim) for t in ts])
+
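+        # Output projection and MLP weights copy over directly; only the shard
+        # split dimension differs (dim 1 for row-parallel wo/w2, dim 0 for
+        # column-parallel w1/w3).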
+        insert_chunk(
+            f"layers.{layer_i}.attention.wo.weight",
+            loaded[f"model.layers.{layer_i}.self_attn.o_proj.weight"],
+            1,
+        )
+
+        insert_chunk(
+            f"layers.{layer_i}.feed_forward.w1.weight",
+            loaded[f"model.layers.{layer_i}.mlp.gate_proj.weight"],
+            0,
+        )
+
+        insert_chunk(
+            f"layers.{layer_i}.feed_forward.w2.weight",
+            loaded[f"model.layers.{layer_i}.mlp.down_proj.weight"],
+            1,
+        )
+
+        insert_chunk(
+            f"layers.{layer_i}.feed_forward.w3.weight",
+            loaded[f"model.layers.{layer_i}.mlp.up_proj.weight"],
+            0,
+        )
+
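+        # RMSNorm weights are small, so they are simply replicated to every shard.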
+        insert(
+            f"layers.{layer_i}.attention_norm.weight",
+            loaded[f"model.layers.{layer_i}.input_layernorm.weight"],
+        )
+        insert(
+            f"layers.{layer_i}.ffn_norm.weight",
+            loaded[f"model.layers.{layer_i}.post_attention_layernorm.weight"],
+        )
+    insert("rope.freqs", inv_freq)
+
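+    # Write one consolidated.{i:02d}.pth file per model-parallel shard.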
+    for i in tqdm(range(num_shards), desc="Saving checkpoint shards"):
+        torch.save(
+            state_dict[i], os.path.join(output_base_path, f"consolidated.{i:02d}.pth")
+        )
+
+
+@click.command()
+@click.option(
+    "--model-path",
+    type=str,
+    default="meta-llama/Llama-2-7b-chat-hf",
+    help="Hugging Face model name or path to a local model directory.",
+)
+@click.option(
+    "--model-size",
+    type=click.Choice(
+        [
+            "7B",
+            "7Bf",
+            "13B",
+            "13Bf",
+            "30B",
+            "34B",
+            "65B",
+            "70B",
+            "70Bf",
+        ]
+    ),
+    default="7Bf",
+    help="Llama model size; the 'f' suffix denotes the fine-tuned versions.",
+)
+@click.option(
+    "--output-dir",
+    type=str,
+    required=True,
+    help="Directory to save the consolidated Llama weights. Must already contain params.json.",
+)
+def main(model_path: str, model_size: str, output_dir: str):
+    """Convert Llama weights from Hugging Face format to the consolidated (Meta) format."""
+    write_model(model_path, model_size, output_dir)
+
+
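+# Example invocation (the script filename is illustrative):
+#   python convert_hf_weights_to_llama.py \
+#       --model-path meta-llama/Llama-2-7b-chat-hf --model-size 7Bf \
+#       --output-dir <dir containing params.json>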
+if __name__ == "__main__":
+    main()