diff --git a/src/instructlab/cli/data/generate.py b/src/instructlab/cli/data/generate.py index 866d7eeb66..fedb2fdedb 100644 --- a/src/instructlab/cli/data/generate.py +++ b/src/instructlab/cli/data/generate.py @@ -143,8 +143,8 @@ @click.option( "--batch-size", type=click.IntRange(min=0), - default=None, - help="Number of elements to process in each batch through the SDG pipeline. Enabled by default for the vLLM serving backend, with a batch size of 8 chosen based on experiments to optimize for throughput. Use 0 to disable.", + cls=clickext.ConfigOption, + default=DEFAULTS.BATCH_SIZE, ) @click.option( "--enable-serving-output", diff --git a/src/instructlab/configuration.py b/src/instructlab/configuration.py index be254362e8..c117b4b9fd 100644 --- a/src/instructlab/configuration.py +++ b/src/instructlab/configuration.py @@ -373,6 +373,10 @@ class _generate(BaseModel): default=DEFAULTS.NUM_CPUS, description="Number of CPU cores to use for generation.", ) + batch_size: PositiveInt = Field( + default=DEFAULTS.BATCH_SIZE, + description="Number of Batches to send for generation on each core.", + ) chunk_word_count: PositiveInt = Field( default=DEFAULTS.CHUNK_WORD_COUNT, description="Maximum number of words per chunk.", diff --git a/src/instructlab/defaults.py b/src/instructlab/defaults.py index e6c0d584bf..2ff1966548 100644 --- a/src/instructlab/defaults.py +++ b/src/instructlab/defaults.py @@ -105,6 +105,8 @@ class _InstructlabDefaults: MAX_CONTEXT_SIZE = 4096 # TODO: these constants should be removed, they should not leak out NUM_CPUS = 10 + # Number of batches to send on each core. 
Tune the batch size to optimize the vLLM performance + BATCH_SIZE = 8 CHUNK_WORD_COUNT = 1000 CONNECTION_TIMEOUT = httpx.Timeout(timeout=30.0) # use spawn start method, fork is not thread-safe diff --git a/src/instructlab/profiles/amd/cpu.yaml b/src/instructlab/profiles/amd/cpu.yaml index f3016ec9ee..d659be58c5 100644 --- a/src/instructlab/profiles/amd/cpu.yaml +++ b/src/instructlab/profiles/amd/cpu.yaml @@ -37,6 +37,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m1/m1_max.yaml b/src/instructlab/profiles/apple/m1/m1_max.yaml index 7d300e4f93..359bedb541 100644 --- a/src/instructlab/profiles/apple/m1/m1_max.yaml +++ b/src/instructlab/profiles/apple/m1/m1_max.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m1/m1_ultra.yaml b/src/instructlab/profiles/apple/m1/m1_ultra.yaml index 600785f3f4..2bcc69d251 100644 --- a/src/instructlab/profiles/apple/m1/m1_ultra.yaml +++ b/src/instructlab/profiles/apple/m1/m1_ultra.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored 
diff --git a/src/instructlab/profiles/apple/m2/m2.yaml b/src/instructlab/profiles/apple/m2/m2.yaml index 9ee435d2b3..9ae8efd7d1 100644 --- a/src/instructlab/profiles/apple/m2/m2.yaml +++ b/src/instructlab/profiles/apple/m2/m2.yaml @@ -55,6 +55,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m2/m2_max.yaml b/src/instructlab/profiles/apple/m2/m2_max.yaml index 4e473bbba8..81df5dc408 100644 --- a/src/instructlab/profiles/apple/m2/m2_max.yaml +++ b/src/instructlab/profiles/apple/m2/m2_max.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m2/m2_pro.yaml b/src/instructlab/profiles/apple/m2/m2_pro.yaml index bfe2ebf554..f72cf1b9b8 100644 --- a/src/instructlab/profiles/apple/m2/m2_pro.yaml +++ b/src/instructlab/profiles/apple/m2/m2_pro.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m2/m2_ultra.yaml b/src/instructlab/profiles/apple/m2/m2_ultra.yaml index a4db8a3746..d4a66b3c42 100644 --- 
a/src/instructlab/profiles/apple/m2/m2_ultra.yaml +++ b/src/instructlab/profiles/apple/m2/m2_ultra.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m3/m3.yaml b/src/instructlab/profiles/apple/m3/m3.yaml index 8553d959d0..3725699c19 100644 --- a/src/instructlab/profiles/apple/m3/m3.yaml +++ b/src/instructlab/profiles/apple/m3/m3.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m3/m3_max.yaml b/src/instructlab/profiles/apple/m3/m3_max.yaml index 9614d3c12d..969991afb7 100644 --- a/src/instructlab/profiles/apple/m3/m3_max.yaml +++ b/src/instructlab/profiles/apple/m3/m3_max.yaml @@ -55,6 +55,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/apple/m3/m3_pro.yaml b/src/instructlab/profiles/apple/m3/m3_pro.yaml index a41d62e045..430a9c7f6d 100644 --- a/src/instructlab/profiles/apple/m3/m3_pro.yaml +++ b/src/instructlab/profiles/apple/m3/m3_pro.yaml @@ -55,6 +55,8 @@ generate: model: 
~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/intel/cpu.yaml b/src/instructlab/profiles/intel/cpu.yaml index 0f012097d5..f717c19a78 100644 --- a/src/instructlab/profiles/intel/cpu.yaml +++ b/src/instructlab/profiles/intel/cpu.yaml @@ -49,6 +49,8 @@ generate: model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/intel/gaudi/gaudi_3.yaml b/src/instructlab/profiles/intel/gaudi/gaudi_3.yaml index ce202a0cab..9d5197b617 100644 --- a/src/instructlab/profiles/intel/gaudi/gaudi_3.yaml +++ b/src/instructlab/profiles/intel/gaudi/gaudi_3.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/a100/a100_x2.yaml b/src/instructlab/profiles/nvidia/a100/a100_x2.yaml index d960ee1ed2..c59d6d76c0 100644 --- a/src/instructlab/profiles/nvidia/a100/a100_x2.yaml +++ b/src/instructlab/profiles/nvidia/a100/a100_x2.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each 
core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/a100/a100_x4.yaml b/src/instructlab/profiles/nvidia/a100/a100_x4.yaml index 0a293e73c7..e66975d4fc 100644 --- a/src/instructlab/profiles/nvidia/a100/a100_x4.yaml +++ b/src/instructlab/profiles/nvidia/a100/a100_x4.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/a100/a100_x8.yaml b/src/instructlab/profiles/nvidia/a100/a100_x8.yaml index 0bef26b623..45163688d2 100644 --- a/src/instructlab/profiles/nvidia/a100/a100_x8.yaml +++ b/src/instructlab/profiles/nvidia/a100/a100_x8.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/h100/h100_x2.yaml b/src/instructlab/profiles/nvidia/h100/h100_x2.yaml index 12aa2cfcab..b0b65dec70 100644 --- a/src/instructlab/profiles/nvidia/h100/h100_x2.yaml +++ b/src/instructlab/profiles/nvidia/h100/h100_x2.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # 
Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/h100/h100_x4.yaml b/src/instructlab/profiles/nvidia/h100/h100_x4.yaml index 3aee1af0ea..8508e3e6f9 100644 --- a/src/instructlab/profiles/nvidia/h100/h100_x4.yaml +++ b/src/instructlab/profiles/nvidia/h100/h100_x4.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/h100/h100_x8.yaml b/src/instructlab/profiles/nvidia/h100/h100_x8.yaml index cb8443d56b..3ee35f0eef 100644 --- a/src/instructlab/profiles/nvidia/h100/h100_x8.yaml +++ b/src/instructlab/profiles/nvidia/h100/h100_x8.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/l4/l4_x8.yaml b/src/instructlab/profiles/nvidia/l4/l4_x8.yaml index 9e633739c4..23f4d2e200 100644 --- a/src/instructlab/profiles/nvidia/l4/l4_x8.yaml +++ b/src/instructlab/profiles/nvidia/l4/l4_x8.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/l40s/l40s_x4.yaml 
b/src/instructlab/profiles/nvidia/l40s/l40s_x4.yaml index e41827b222..1bea600a1d 100644 --- a/src/instructlab/profiles/nvidia/l40s/l40s_x4.yaml +++ b/src/instructlab/profiles/nvidia/l40s/l40s_x4.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/src/instructlab/profiles/nvidia/l40s/l40s_x8.yaml b/src/instructlab/profiles/nvidia/l40s/l40s_x8.yaml index d309e8bae4..0de86891ef 100644 --- a/src/instructlab/profiles/nvidia/l40s/l40s_x8.yaml +++ b/src/instructlab/profiles/nvidia/l40s/l40s_x8.yaml @@ -53,6 +53,8 @@ generate: model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1 # Number of CPU cores to use for generation num_cpus: 10 + # Number of batches to send on each core + batch_size: 8 # Directory where generated datasets are stored output_dir: ~/.local/share/instructlab/datasets # Directory where pipeline config files are stored diff --git a/tests/test_config.py b/tests/test_config.py index 49888676a8..b708713055 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -68,6 +68,7 @@ def _assert_defaults(self, cfg: config.Config): assert cfg.generate.taxonomy_path == f"{data_dir}/taxonomy" assert cfg.generate.taxonomy_base == "origin/main" assert cfg.generate.num_cpus == 10 + assert cfg.generate.batch_size == 8 assert cfg.generate.sdg_scale_factor == 30 assert cfg.generate.chunk_word_count == 1000 assert cfg.generate.output_dir == f"{data_dir}/datasets" diff --git a/tests/testdata/default_config.yaml b/tests/testdata/default_config.yaml index 0c5e22007c..5d20782cc2 100644 --- a/tests/testdata/default_config.yaml +++ b/tests/testdata/default_config.yaml @@ -145,6 +145,9 @@ generate: # Number of CPU cores to use for 
generation. # Default: 10 num_cpus: 10 + # Number of Batches to send for generation on each core. + # Default: 8 + batch_size: 8 # Number of instructions to use # Default: -1 # Deprecated: see 'sdg_scale_factor' instead