configs/examples/grpo_verl_countdown/gcp_job.yaml (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@ name: grpo-verl-countdown

resources:
cloud: gcp
accelerators: "H100:8"
accelerators: "A100-80GB:2"
use_spot: false

working_dir: .
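With the job now requesting two GPUs ("A100-80GB:2"), the train.yaml diff below drops trainer.n_gpus_per_node to 2 to match. A minimal sketch for keeping the two files in sync, assuming PyYAML is installed and the repository paths are as shown in this diff; the find_key helper is illustrative, not part of Oumi:

import yaml

# Sketch: check that the GPU count requested in gcp_job.yaml matches
# n_gpus_per_node in train.yaml. Keys are located with a recursive search
# so the exact nesting of the configs does not need to be hard-coded.
def find_key(node, key):
    if isinstance(node, dict):
        for k, v in node.items():
            if k == key:
                return v
            hit = find_key(v, key)
            if hit is not None:
                return hit
    elif isinstance(node, list):
        for item in node:
            hit = find_key(item, key)
            if hit is not None:
                return hit
    return None

with open("configs/examples/grpo_verl_countdown/gcp_job.yaml") as f:
    job = yaml.safe_load(f)
with open("configs/examples/grpo_verl_countdown/train.yaml") as f:
    train = yaml.safe_load(f)

accelerators = find_key(job, "accelerators")          # e.g. "A100-80GB:2"
requested = int(str(accelerators).rsplit(":", 1)[1])
configured = find_key(train, "n_gpus_per_node")
assert requested == configured, f"GPU mismatch: {accelerators} vs n_gpus_per_node={configured}"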
configs/examples/grpo_verl_countdown/train.yaml (46 changes: 20 additions & 26 deletions)
@@ -18,19 +18,19 @@ model:
data:
train:
datasets:
- dataset_name: "d1shs0ap/countdown-final"
- dataset_name: "d1shs0ap/countdown"
split: "train"
validation:
datasets:
- dataset_name: "d1shs0ap/countdown-final"
- dataset_name: "d1shs0ap/countdown"
split: "test"

training:
trainer_type: "VERL_GRPO"
num_train_epochs: 5
save_steps: 150
num_train_epochs: 1
save_steps: -1
eval_strategy: "steps"
eval_steps: 12
eval_steps: 50

learning_rate: 1.0e-6
enable_gradient_checkpointing: True
@@ -40,44 +40,38 @@ training:
grpo:
max_completion_length: 1024
use_vllm: True
temperature: 0.6
vllm_gpu_memory_utilization: 0.7
temperature: 1.0
vllm_gpu_memory_utilization: 0.4

verl_config_overrides:
data:
train_files: "/home/cmu/countdown-curriculum/data/countdown/train-3-3.parquet"
val_files: "/home/cmu/countdown-curriculum/data/countdown/test-3-10.parquet"
train_batch_size: 128
val_batch_size: 256
train_batch_size: 64
val_batch_size: 640
max_prompt_length: 256
max_extrapolation_length: 2048
shuffle: False
actor_rollout_ref:
model:
use_remove_padding: True
actor:
ppo_mini_batch_size: 32
use_dynamic_bsz: True
gradients: "normal"
use_kl_loss: True
kl_loss_coef: 0.001
kl_loss_type: "low_var_kl"
entropy_coeff: 0
ppo_mini_batch_size: 16
ppo_micro_batch_size: 4
rollout:
log_prob_micro_batch_size: 4
tensor_model_parallel_size: 2
n: 8
enforce_eager: False
free_cache_engine: False
val_kwargs:
temperature: 0.6
n: 8
do_sample: True
max_num_batched_tokens: 16384
n: 16
ref:
log_prob_micro_batch_size: 2
fsdp_config:
param_offload: True
algorithm:
kl_ctrl:
kl_coef: 0.001
trainer:
n_gpus_per_node: 8
critic_warmup: 0
val_before_train: False
n_gpus_per_node: 2
nnodes: 1

output_dir: "output/grpo_verl_countdown"
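A rough sketch of how the updated batch settings interact, assuming the usual verl reading of these fields (train_batch_size counts prompts per step, rollout.n completions are sampled per prompt, ppo_mini_batch_size is one slice of the batch per gradient update, and ppo_micro_batch_size caps a single forward/backward pass); the numbers come from this diff, while the reading of each field is an assumption:

# Back-of-the-envelope arithmetic for the new countdown GRPO settings.
# Assumes the verl semantics described above; note that whether micro-batch
# sizes are global or per-GPU differs across verl versions.
train_batch_size = 64       # prompts sampled per training step
rollout_n = 16              # completions generated per prompt (rollout.n)
ppo_mini_batch_size = 16    # prompts per PPO mini-batch update
ppo_micro_batch_size = 4    # prompts per forward/backward pass

completions_per_step = train_batch_size * rollout_n             # 1024
updates_per_step = train_batch_size // ppo_mini_batch_size      # 4
grad_accum_steps = ppo_mini_batch_size // ppo_micro_batch_size  # 4

print(f"{completions_per_step=} {updates_per_step=} {grad_accum_steps=}")

Presumably, the lower vllm_gpu_memory_utilization (0.4) is meant to leave headroom for the FSDP training state that shares the same two GPUs with the vLLM rollout engine, which spans both of them via tensor_model_parallel_size: 2.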
configs/examples/grpo_verl_geometry3k/gcp_job.yaml (5 changes: 5 additions & 0 deletions)
@@ -36,6 +36,11 @@ envs:
setup: |
set -e
pip install uv && uv pip install oumi[gpu] huggingface_hub[hf_xet] mathruler
# TODO: OPE-1331 - Fix the bug that is blocking the upgrade to verl 0.4.0.
# Note that qwen_vl_utils needs to be installed manually.
# In the meantime, we need to use this specific commit to support vLLM 0.8.3:
# https://github.com/volcengine/verl/pull/912
pip install git+https://github.com/volcengine/verl.git@1ee730163f6326e9679644db62eb32c8d1947c7f
pip install -U flash-attn --no-build-isolation

run: |
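Because the setup installs verl from a pinned commit rather than a release, it can be worth confirming what actually got installed once setup finishes; a small sketch using only the standard library (the exact version strings will vary):

# Sketch: report the verl and vllm builds the setup step installed.
from importlib.metadata import PackageNotFoundError, version

for pkg in ("verl", "vllm"):
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")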
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -120,7 +120,7 @@ gpu = [
"bitsandbytes>=0.45.0,<0.46", # Used for QLora, and PagedAdam implementation
# When updating verl version, make sure to also update the default config:
# src/oumi/core/trainers/verl_trainer_config.yaml.
"verl>=0.3.0,<0.4", # Used for the VERL_GRPO trainer.
"verl>=0.4.0,<0.5", # Used for the VERL_GRPO trainer.
"vllm>=0.8.3,<0.9", # For VLLMInferenceEngine, and vLLM-powered GRPO training.
]

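The comment above the pin asks for src/oumi/core/trainers/verl_trainer_config.yaml to be kept in sync whenever the verl range changes. A small sketch of that check, assuming PyYAML and the packaging package are available and the script runs from the repository root:

# Sketch: verify the installed verl falls in the new range and that the
# default trainer config referenced in pyproject.toml still parses.
from importlib.metadata import version
from packaging.specifiers import SpecifierSet
import yaml

installed = version("verl")
print("verl", installed, "satisfies >=0.4.0,<0.5:", installed in SpecifierSet(">=0.4.0,<0.5"))

with open("src/oumi/core/trainers/verl_trainer_config.yaml") as f:
    cfg = yaml.safe_load(f)
print("default verl trainer config parses:", cfg is not None)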