3 changes: 3 additions & 0 deletions configs/recipes/vision/qwen3_vl/README.md
@@ -0,0 +1,3 @@
# Qwen3-VL

Configs for Qwen3-VL models. See https://huggingface.co/collections/Qwen/qwen3-vl-68d2a7c1b8a8afce4ebd2dbe
82 changes: 82 additions & 0 deletions configs/recipes/vision/qwen3_vl/sft/4b_instruct_fft_train.yaml
@@ -0,0 +1,82 @@
# Full fine-tune (FFT) config for Qwen3 VL 4B Instruct.
#
# Requirements:
# - Log into WandB (`wandb login`) or disable `enable_wandb`
#
# Usage:
# oumi train -c configs/recipes/vision/qwen3_vl/sft/4b_instruct_fft_train.yaml
#
# See Also:
# - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
# - Config class: oumi.core.configs.TrainingConfig
# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
# - Other training configs: configs/**/*train.yaml

model:
model_name: "Qwen/Qwen3-VL-4B-Instruct"
torch_dtype_str: "bfloat16"
model_max_length: 4096
trust_remote_code: True
attn_implementation: "sdpa"
chat_template: "qwen3-vl-instruct"
freeze_layers:
- "visual"

data:
train:
collator_name: "vision_language_with_padding"
use_torchdata: True
datasets:
- dataset_name: "merve/vqav2-small"
split: "validation"
shuffle: True
seed: 42
transform_num_workers: "auto"
dataset_kwargs:
processor_name: "Qwen/Qwen3-VL-4B-Instruct"
return_tensors: True
# limit: 4096 # Uncomment to limit dataset size!
# return_conversations: True

# Below are examples of other vision SFT datasets
# - dataset_name: "HuggingFaceH4/llava-instruct-mix-vsft"
# split: "train"
# shuffle: True
# seed: 42
# transform_num_workers: "auto"
# dataset_kwargs:
# processor_name: "Qwen/Qwen3-VL-4B-Instruct"
# return_tensors: True

training:
output_dir: "output/vlm_finetuned"
trainer_type: "TRL_SFT" # or "OUMI"
enable_gradient_checkpointing: True
per_device_train_batch_size: 1 # Must be 1: the model generates variable-sized image features.
gradient_accumulation_steps: 32
max_steps: 20 # Comment out and use `num_train_epochs` instead for full training.
# num_train_epochs: 1
use_peft: False

gradient_checkpointing_kwargs:
# Reentrant docs: https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
use_reentrant: False
max_grad_norm: 0.5 # For vqav2-small this results in more stable training.
ddp_find_unused_parameters: False
empty_device_cache_steps: 1
compile: False

optimizer: "adamw_torch_fused"
learning_rate: 2e-5
warmup_ratio: 0.03
weight_decay: 0.0
lr_scheduler_type: "cosine"

logging_steps: 5
save_steps: 0
dataloader_main_process_only: False
dataloader_num_workers: 2
dataloader_prefetch_factor: 8
include_performance_metrics: True
log_model_summary: False
enable_wandb: True
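Because `per_device_train_batch_size` is pinned to 1 (the model emits variable-sized image features), the effective batch size in the config above comes entirely from gradient accumulation and the number of data-parallel workers. A back-of-the-envelope check, using illustrative Python variable names rather than Oumi configuration keys:

```python
# Global batch size implied by the settings above.
per_device_train_batch_size = 1   # must stay 1: variable-sized image features
gradient_accumulation_steps = 32
num_devices = 1                   # assumption: single-GPU run; scale to your world size

global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_devices
print(global_batch_size)          # 32 examples contribute to each optimizer step
```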
96 changes: 96 additions & 0 deletions configs/recipes/vision/qwen3_vl/sft/4b_instruct_lora_train.yaml
@@ -0,0 +1,96 @@
# LoRA fine-tune config for Qwen3 VL 4B Instruct.
#
# Requirements:
# - Log into WandB (`wandb login`) or disable `enable_wandb`
#
# Usage:
# oumi train -c configs/recipes/vision/qwen3_vl/sft/4b_instruct_lora_train.yaml
#
# See Also:
# - Documentation: https://oumi.ai/docs/en/latest/user_guides/train/train.html
# - Config class: oumi.core.configs.TrainingConfig
# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/training_config.py
# - Other training configs: configs/**/*train.yaml

model:
model_name: "Qwen/Qwen3-VL-4B-Instruct"
torch_dtype_str: "bfloat16"
model_max_length: 4096
trust_remote_code: True
# TODO: Enable flash attention
attn_implementation: "sdpa"
chat_template: "qwen3-vl-instruct"
freeze_layers:
- "visual"

data:
train:
collator_name: "vision_language_with_padding"
use_torchdata: True
datasets:
- dataset_name: "merve/vqav2-small"
split: "validation"
shuffle: True
seed: 42
transform_num_workers: "auto"
dataset_kwargs:
processor_name: "Qwen/Qwen3-VL-4B-Instruct"
return_tensors: True
# limit: 4096 # Uncomment to limit dataset size!
# return_conversations: True

# Below are examples of other vision SFT datasets
# - dataset_name: "HuggingFaceH4/llava-instruct-mix-vsft"
# split: "train"
# shuffle: True
# seed: 42
# transform_num_workers: "auto"
# dataset_kwargs:
# processor_name: "Qwen/Qwen3-VL-4B-Instruct"
# return_tensors: True

training:
output_dir: "output/vlm_finetuned"
trainer_type: "TRL_SFT" # or "OUMI"
enable_gradient_checkpointing: True
per_device_train_batch_size: 1 # Must be 1: the model generates variable-sized image features.
gradient_accumulation_steps: 32
max_steps: 20 # Comment out and use `num_train_epochs` instead for full training.
# num_train_epochs: 1
use_peft: True

gradient_checkpointing_kwargs:
# Reentrant docs: https://pytorch.org/docs/stable/checkpoint.html#torch.utils.checkpoint.checkpoint
use_reentrant: False
max_grad_norm: 0.5 # For vqav2-small this results in more stable training.
ddp_find_unused_parameters: False
empty_device_cache_steps: 1
compile: False

optimizer: "adamw_torch_fused"
learning_rate: 2e-5
warmup_ratio: 0.03
weight_decay: 0.0
lr_scheduler_type: "cosine"

logging_steps: 5
save_steps: 0
dataloader_main_process_only: False
dataloader_num_workers: 2
dataloader_prefetch_factor: 8
include_performance_metrics: True
log_model_summary: False
enable_wandb: True

peft:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- "q_proj"
- "v_proj"
- "o_proj"
- "k_proj"
- "gate_proj"
- "up_proj"
- "down_proj"
1 change: 1 addition & 0 deletions docs/resources/models/models.md
@@ -145,6 +145,7 @@ Available templates include:
- `llava` - For LLaVA multimodal models
- `phi3-instruct` - For Phi-3 instruction models
- `qwen2-vl-instruct` - For Qwen2-VL instruction models
- `qwen3-vl-instruct` - For Qwen3-VL instruction models
- `zephyr` - For Zephyr models

All the templates expect a `messages` list, where each message is a dictionary with `role` and `content` keys in {doc}`oumi format </resources/datasets/data_formats>`.
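For illustration, a minimal multimodal `messages` list could look like the sketch below; the exact content-item keys (in particular how images are referenced) are defined by the data formats documentation, so this only shows the overall `role`/`content` shape:

```python
# Illustrative only: one user turn with an image placeholder and a text item.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},  # pixel data is supplied separately by the processor/collator
            {"type": "text", "text": "What is shown in this picture?"},
        ],
    },
]
```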
38 changes: 38 additions & 0 deletions src/oumi/core/configs/internal/supported_models.py
@@ -292,6 +292,38 @@ def _create_qwen2_5_vl_vlm_config() -> InternalModelConfig:
return config


def _create_qwen3_vl_vlm_config() -> InternalModelConfig:
config = _create_default_vlm_config(
pixel_values_variable_shape=True,
supports_multiple_images=True,
)
config.chat_template = "qwen3-vl-instruct"
# FIXME OPE-946 Consider updating to "right":
# config.padding_side = InternalPaddingSide.PAD_RIGHT
config.model_input_features.update(
{
feature_name: InternalFeatureSpec(
name=feature_name,
required=True,
variable_shape=False,
image_dependent=True,
)
for feature_name in ("image_grid_thw",)
}
)
config.processor_kwargs.update(
# Defaults per Qwen3-VL:
# https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
{
"min_pixels": 4 * 28 * 28,
"max_pixels": 16384 * 28 * 28,
"patch_size": 16,
}
)

return config


def _create_phi3_vlm_config() -> InternalModelConfig:
config = _create_default_vlm_config(
pixel_values_variable_shape=True,
@@ -531,6 +563,12 @@ def get_all_models_map() -> Mapping[
tested=True,
config=_create_qwen2_5_vl_vlm_config(),
),
_ModelTypeInfo(
model_type="qwen3_vl",
model_class=default_vlm_class,
tested=True,
config=_create_qwen3_vl_vlm_config(),
),
_ModelTypeInfo(
model_type="vipllava",
model_class=default_vlm_class,
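The `min_pixels`/`max_pixels` defaults registered for `qwen3_vl` above bound the total pixel area an input image is resized to before it reaches the vision tower. In raw pixel terms (the actual resizing logic lives in `qwen_vl_utils`, linked in the comment above):

```python
# The processor defaults above, expressed as raw pixel counts.
PIXELS_PER_UNIT = 28 * 28              # the 28x28 unit used in the Qwen-VL defaults

min_pixels = 4 * PIXELS_PER_UNIT       # 3,136 pixels: floor on the resized image area
max_pixels = 16384 * PIXELS_PER_UNIT   # 12,845,056 pixels (~12.8 MP): ceiling on the area

print(min_pixels, max_pixels)
```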
37 changes: 37 additions & 0 deletions src/oumi/datasets/chat_templates/qwen3-vl-instruct.jinja
@@ -0,0 +1,37 @@
{% set image_count = namespace(value=0) %}
{% set video_count = namespace(value=0) %}

{%- for message in messages -%}
{%- if loop.first and message['role'] != 'system' -%}
{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
{%- endif -%}

{{ '<|im_start|>' + message['role'] + '\n' }}

{%- if message['content'] is string -%}
{{- message['content'] -}}
{%- elif message['content'] is iterable -%}
{%- for item in message['content'] -%}
{%- if item['type'].startswith('image') -%}
{%- set image_count.value = image_count.value + 1 -%}
{%- if add_vision_id -%}
{{ 'Picture ' ~ image_count.value ~ ': ' }}
{%- endif -%}
{{ '<|vision_start|><|image_pad|><|vision_end|>' }}
{%- elif item['type'].startswith('video') -%}
{%- set video_count.value = video_count.value + 1 -%}
{%- if add_vision_id -%}
{{ 'Video ' ~ video_count.value ~ ': ' }}
{%- endif -%}
{{ '<|vision_start|><|video_pad|><|vision_end|>' }}
{%- elif item['type']=='text' -%}
{{- item['text'] if 'text' in item else item['content'] -}}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{{ '<|im_end|>\n' }}
{%- endfor -%}

{%- if add_generation_prompt -%}
{{- '<|im_start|>assistant\n' -}}
{%- endif -%}
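To preview what this template produces for a single image-plus-text turn, it can be rendered standalone with plain Jinja2; this is a sketch for inspection only, since in practice the template is applied through the tokenizer/processor:

```python
from jinja2 import Template  # assumes the jinja2 package is installed

# Load and render the template above directly, just to inspect the prompt string.
with open("src/oumi/datasets/chat_templates/qwen3-vl-instruct.jinja") as f:
    template_str = f.read()

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},  # rendered as <|vision_start|><|image_pad|><|vision_end|>
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

prompt = Template(template_str).render(
    messages=messages,
    add_generation_prompt=True,
    add_vision_id=False,
)
print(prompt)
```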