Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit bfbc4a1

Browse files
committed
step2 能运行
1 parent 402f543 commit bfbc4a1

3 files changed

Lines changed: 25 additions & 10 deletions

File tree

  • applications/DeepSpeed-Chat/training
    • step2_reward_model_finetuning/training_scripts/opt/single_gpu
    • step3_rlhf_finetuning/training_scripts/opt/single_gpu

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -129,4 +129,5 @@ dmypy.json
129129

130130
# Pyre type checker
131131
.pyre/
132-
applications/DeepSpeed-Chat/training/step1_supervised_finetuning/output/*
132+
applications/DeepSpeed-Chat/training/step1_supervised_finetuning/output/*
133+
applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/output/*

applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_gpu/run_350m.sh

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,8 +13,13 @@ if [ "$ZERO_STAGE" == "" ]; then
1313
fi
1414
mkdir -p $OUTPUT
1515

16-
deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \
17-
--num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
18-
--enable_tensorboard \
19-
--tensorboard_path $OUTPUT \
20-
--deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
16+
# deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \
17+
# --num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
18+
# --enable_tensorboard \
19+
# --tensorboard_path $OUTPUT \
20+
# --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
21+
22+
deepspeed --include="localhost:1,2" main.py --model_name_or_path /home/xuhang/hf_hub/opt-350m \
23+
--num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage 0 \
24+
--enable_tensorboard --tensorboard_path $OUTPUT --deepspeed --output_dir $OUTPUT --per_device_eval_batch_size 8 --per_device_train_batch_size 8 \
25+
# &> $OUTPUT/training.log

applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,9 +19,18 @@ if [ "$CRITIC_ZERO_STAGE" == "" ]; then
1919
fi
2020
mkdir -p $OUTPUT
2121

22-
deepspeed --num_gpus 1 main.py \
23-
--actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \
24-
--actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \
22+
# deepspeed --num_gpus 1 main.py \
23+
# --actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \
24+
# --actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \
25+
# --num_padding_at_beginning 1 --gradient_accumulation_steps 2 \
26+
# --deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \
27+
# --output_dir $OUTPUT &> $OUTPUT/training.log
28+
29+
30+
deepspeed --include="localhost:1,2" main.py \
31+
--actor_model_name_or_path /home/xuhang/hf_hub/opt-350m --critic_model_name_or_path /home/xuhang/hf_hub/opt-350m \
32+
--actor_zero_stage 0 --critic_zero_stage 0 \
2533
--num_padding_at_beginning 1 --gradient_accumulation_steps 2 \
2634
--deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \
27-
--output_dir $OUTPUT &> $OUTPUT/training.log
35+
--output_dir ./output
36+
# &> $OUTPUT/training.log

0 commit comments

Comments
 (0)