Skip to content

Commit

Permalink
Added Distributed(Tensor Parallel) Inference Recipe (#2245)
Browse files Browse the repository at this point in the history
Co-authored-by: JessicaZhong <zhengjesszhong@gmail.com>
  • Loading branch information
acisseJZhong and jessicazhongeee authored Jan 18, 2025
1 parent 1036095 commit 779569e
Show file tree
Hide file tree
Showing 14 changed files with 618 additions and 13 deletions.
7 changes: 6 additions & 1 deletion recipes/configs/generation.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# Config for running the InferenceRecipe in generate.py to generate output from an LLM
# Config for running the InferenceRecipe in generate.py to generate output
# from Llama2 7B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --ignore-patterns "*.safetensors" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run generate --config generation
Expand Down
50 changes: 50 additions & 0 deletions recipes/configs/llama3/70B_generation_distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Config for running the InferenceRecipe in dev/generate_v2_distributed.py to generate output
# using a Llama3 70B Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Meta-Llama-3-70B-Instruct --output-dir /tmp/Meta-Llama-3-70B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run --nproc_per_node 8 dev/generate_v2_distributed --config llama3/70B_generation_distributed

output_dir: ./

# Model arguments
model:
_component_: torchtune.models.llama3.llama3_70b

parallelize_plan:
_component_: torchtune.models.llama3.base_llama_tp_plan

# Transform arguments
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Meta-Llama-3-70B-Instruct/original/tokenizer.model
prompt_template: null
max_seq_len: 8192

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3-70B-Instruct
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00030"
recipe_checkpoint: null
output_dir: ${output_dir}
model_type: LLAMA3

# Device
device: cuda
dtype: bf16
seed: 1234
log_level: INFO

# Generation arguments
prompt:
system: null
user:
text: Tell a joke.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
50 changes: 50 additions & 0 deletions recipes/configs/llama3_1/70B_generation_distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Config for running the InferenceRecipe in dev/generate_v2_distributed.py to generate output
# using a Llama3.1 70B Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Meta-Llama-3.1-70B-Instruct --output-dir /tmp/Meta-Llama-3.1-70B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run --nproc_per_node 8 dev/generate_v2_distributed --config llama3_1/70B_generation_distributed

output_dir: ./

# Model arguments
model:
_component_: torchtune.models.llama3_1.llama3_1_70b

parallelize_plan:
_component_: torchtune.models.llama3.base_llama_tp_plan

# Transform arguments
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Meta-Llama-3.1-70B-Instruct/original/tokenizer.model
prompt_template: null
max_seq_len: 8192

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3.1-70B-Instruct/
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00030"
recipe_checkpoint: null
output_dir: ${output_dir}
model_type: LLAMA3

# Device
device: cuda
dtype: bf16
seed: 1234
log_level: INFO

# Generation arguments
prompt:
system: null
user:
text: Tell a joke.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
2 changes: 1 addition & 1 deletion recipes/configs/llama3_2_vision/11B_generation_v2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# To launch, run the following command from root torchtune directory:
# tune run dev/generate_v2 --config llama3_2_vision/11B_generation_v2

output_dir: ./ # Not needed
output_dir: ./

# Model arguments
model:
Expand Down
50 changes: 50 additions & 0 deletions recipes/configs/llama3_3/70B_generation_distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Config for running the InferenceRecipe in dev/generate_v2_distributed.py to generate output
# using a Llama3.3 70B Instruct model
#
# This config assumes that you've run the following command before launching:
# tune download meta-llama/Llama-3.3-70B-Instruct --output-dir /tmp/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>
#
# To launch, run the following command from root torchtune directory:
# tune run --nproc_per_node 8 dev/generate_v2_distributed --config llama3_3/70B_generation_distributed

output_dir: ./

# Model arguments
model:
_component_: torchtune.models.llama3_3.llama3_3_70b

parallelize_plan:
_component_: torchtune.models.llama3.base_llama_tp_plan

# Transform arguments
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model
prompt_template: null
max_seq_len: 8192

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
checkpoint_files:
filename_format: model-{}-of-{}.safetensors
max_filename: "00030"
recipe_checkpoint: null
output_dir: ${output_dir}
model_type: LLAMA3

# Device
device: cuda
dtype: bf16
seed: 1234
log_level: INFO

# Generation arguments
prompt:
system: null
user:
text: Tell a joke.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
18 changes: 11 additions & 7 deletions recipes/dev/generate_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,22 @@ def __call__(self, prompt: Dict[str, Any]) -> List[Message]:

# Iterate through roles and add content
for role, content in prompt.items():
if isinstance(content, str):
if content is None:
continue
elif isinstance(content, str):
new_content = [{"type": "text", "content": content}]
else:
assert (
"image" in content.keys()
), "Multiple entries per role expect an image key"
elif "image" in content.keys():
image_loc = content["image"]
image = load_image(image_loc)
new_content = [
{"type": "image", "content": image},
{"type": "text", "content": content["text"]},
]
else:
assert (
"text" in content.keys()
), "Multiple entries per role expect at least a text key"
new_content = [{"type": "text", "content": content["text"]}]
messages.append(Message(role=role, content=new_content))

# Finally, add an empty assistant message to kick-start generation
Expand Down Expand Up @@ -109,12 +113,12 @@ def log_metrics(self, total_time: int, tokens_per_second: float) -> None:
f"Time for inference: {total_time:.02f} sec total, {tokens_per_second:.02f} tokens/sec"
)
self._logger.info(
f"Bandwidth achieved: {model_size * tokens_per_second / 1e9:.02f} GB/s"
f"Bandwidth achieved: {model_size * tokens_per_second / (1024**3):.02f} GiB/s"
)
if self._device.type != "cpu":
torch_device = utils.get_torch_device_namespace()
self._logger.info(
f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
f"Max memory allocated: {torch_device.max_memory_allocated() / (1024**3):.02f} GiB"
)

@torch.inference_mode()
Expand Down
Loading

0 comments on commit 779569e

Please sign in to comment.