instructlab · mergify · Feb 26, 2025 · Feb 17, 2025 · cdoern · Feb 17, 2025
diff --git a/src/instructlab/cli/data/generate.py b/src/instructlab/cli/data/generate.py
@@ -143,8 +143,8 @@
 @click.option(
     "--batch-size",
     type=click.IntRange(min=0),
-    default=None,
-    help="Number of elements to process in each batch through the SDG pipeline. Enabled by default for the vLLM serving backend, with a batch size of 8 chosen based on experiments to optimize for throughput. Use 0 to disable.",
+    cls=clickext.ConfigOption,
+    default=DEFAULTS.BATCH_SIZE,
-    default=DEFAULTS.BATCH_SIZE,
-    default=DEFAULTS.BATCH_SIZE,
 )
 @click.option(
     "--enable-serving-output",

diff --git a/src/instructlab/configuration.py b/src/instructlab/configuration.py
@@ -373,6 +373,10 @@ class _generate(BaseModel):
         default=DEFAULTS.NUM_CPUS,
         description="Number of CPU cores to use for generation.",
     )
+    batch_size: PositiveInt = Field(
+        default=DEFAULTS.BATCH_SIZE,
+        description="Number of Batches to send for generation on each core.",
+    )
     chunk_word_count: PositiveInt = Field(
         default=DEFAULTS.CHUNK_WORD_COUNT,
         description="Maximum number of words per chunk.",

diff --git a/src/instructlab/defaults.py b/src/instructlab/defaults.py
@@ -105,6 +105,8 @@ class _InstructlabDefaults:
     MAX_CONTEXT_SIZE = 4096
     # TODO: these constants should be removed, they should not leak out
     NUM_CPUS = 10
+    # Number of elements to process in each batch through the SDG pipeline. A batch size of 8 was chosen based on experiments to optimize vLLM throughput.
+    BATCH_SIZE = 8
     CHUNK_WORD_COUNT = 1000
     CONNECTION_TIMEOUT = httpx.Timeout(timeout=30.0)
     # use spawn start method, fork is not thread-safe

diff --git a/src/instructlab/profiles/amd/cpu.yaml b/src/instructlab/profiles/amd/cpu.yaml
@@ -37,6 +37,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m1/m1_max.yaml b/src/instructlab/profiles/apple/m1/m1_max.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m1/m1_ultra.yaml b/src/instructlab/profiles/apple/m1/m1_ultra.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m2/m2.yaml b/src/instructlab/profiles/apple/m2/m2.yaml
@@ -55,6 +55,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m2/m2_max.yaml b/src/instructlab/profiles/apple/m2/m2_max.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m2/m2_pro.yaml b/src/instructlab/profiles/apple/m2/m2_pro.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m2/m2_ultra.yaml b/src/instructlab/profiles/apple/m2/m2_ultra.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m3/m3.yaml b/src/instructlab/profiles/apple/m3/m3.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m3/m3_max.yaml b/src/instructlab/profiles/apple/m3/m3_max.yaml
@@ -55,6 +55,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/apple/m3/m3_pro.yaml b/src/instructlab/profiles/apple/m3/m3_pro.yaml
@@ -55,6 +55,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/intel/cpu.yaml b/src/instructlab/profiles/intel/cpu.yaml
@@ -49,6 +49,8 @@ generate:
   model: ~/.cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/intel/gaudi/gaudi_3.yaml b/src/instructlab/profiles/intel/gaudi/gaudi_3.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/a100/a100_x2.yaml b/src/instructlab/profiles/nvidia/a100/a100_x2.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/a100/a100_x4.yaml b/src/instructlab/profiles/nvidia/a100/a100_x4.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/a100/a100_x8.yaml b/src/instructlab/profiles/nvidia/a100/a100_x8.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/h100/h100_x2.yaml b/src/instructlab/profiles/nvidia/h100/h100_x2.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/h100/h100_x4.yaml b/src/instructlab/profiles/nvidia/h100/h100_x4.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/h100/h100_x8.yaml b/src/instructlab/profiles/nvidia/h100/h100_x8.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/l4/l4_x8.yaml b/src/instructlab/profiles/nvidia/l4/l4_x8.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/l40s/l40s_x4.yaml b/src/instructlab/profiles/nvidia/l40s/l40s_x4.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/src/instructlab/profiles/nvidia/l40s/l40s_x8.yaml b/src/instructlab/profiles/nvidia/l40s/l40s_x8.yaml
@@ -53,6 +53,8 @@ generate:
   model: ~/.cache/instructlab/models/mistralai/Mixtral-8x7B-Instruct-v0.1
   # Number of CPU cores to use for generation
   num_cpus: 10
+  # Number of elements to process in each batch through the SDG pipeline
+  batch_size: 8
   # Directory where generated datasets are stored
   output_dir: ~/.local/share/instructlab/datasets
   # Directory where pipeline config files are stored

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -68,6 +68,7 @@ def _assert_defaults(self, cfg: config.Config):
         assert cfg.generate.taxonomy_path == f"{data_dir}/taxonomy"
         assert cfg.generate.taxonomy_base == "origin/main"
         assert cfg.generate.num_cpus == 10
+        assert cfg.generate.batch_size == 8
         assert cfg.generate.sdg_scale_factor == 30
         assert cfg.generate.chunk_word_count == 1000
         assert cfg.generate.output_dir == f"{data_dir}/datasets"

diff --git a/tests/testdata/default_config.yaml b/tests/testdata/default_config.yaml
@@ -145,6 +145,9 @@ generate:
   # Number of CPU cores to use for generation.
   # Default: 10
   num_cpus: 10
+  # Number of batches to send on each core
+  # Default: 8
+  batch_size: 8
   # Number of instructions to use
   # Default: -1
   # Deprecated: see 'sdg_scale_factor' instead