From f5d44a1e7355e8c2316bc735c64a4af85c677a0f Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Wed, 3 Sep 2025 12:52:27 -0700
Subject: [PATCH 1/7] Add synth documentation and example configs
---
configs/examples/synthesis/README.md | 150 ++++++
.../synthesis/conversation_synthesis.yaml | 284 +++++++++++
.../examples/synthesis/data_augmentation.yaml | 198 ++++++++
configs/examples/synthesis/domain_qa.yaml | 194 ++++++++
.../synthesis/instruction_following.yaml | 205 ++++++++
.../synthesis/question_answer_generation.yaml | 166 +++++++
docs/cli/commands.md | 12 +
docs/user_guides/synth.md | 451 ++++++++++++++++++
8 files changed, 1660 insertions(+)
create mode 100644 configs/examples/synthesis/README.md
create mode 100644 configs/examples/synthesis/conversation_synthesis.yaml
create mode 100644 configs/examples/synthesis/data_augmentation.yaml
create mode 100644 configs/examples/synthesis/domain_qa.yaml
create mode 100644 configs/examples/synthesis/instruction_following.yaml
create mode 100644 configs/examples/synthesis/question_answer_generation.yaml
create mode 100644 docs/user_guides/synth.md
diff --git a/configs/examples/synthesis/README.md b/configs/examples/synthesis/README.md
new file mode 100644
index 0000000000..79707b7d69
--- /dev/null
+++ b/configs/examples/synthesis/README.md
@@ -0,0 +1,150 @@
+# Synthesis Examples
+
+This directory contains example configurations for different data synthesis use cases using the `oumi synth` command. Each example demonstrates how to generate specific types of synthetic training data.
+
+## Available Examples
+
+### 1. Question-Answer Generation (`question_answer_generation.yaml`)
+
+**Purpose**: Generate QA pairs from documents or contexts for training conversational models.
+
+**What it does**: Creates geography quiz questions with varying difficulty levels (easy, medium, hard) across different topics (capitals, physical geography, countries, climate).
+
+**Key features**:
+- Uses example questions for few-shot learning
+- Generates both questions and answers separately
+- Includes difficulty and topic classification
+- Produces 50 samples with balanced difficulty distribution
+
+**Run with**:
+```bash
+oumi synth -c configs/examples/synthesis/question_answer_generation.yaml
+```
+
+### 2. Data Augmentation (`data_augmentation.yaml`)
+
+**Purpose**: Create variations of existing datasets by sampling different attributes and regenerating content.
+
+**What it does**: Takes existing instruction-response pairs and creates variations with different styles (professional, casual, educational) and complexity levels (simple, detailed, expert).
+
+**Key features**:
+- Uses input data from existing datasets
+- Generates augmented versions with different styles and complexity
+- Preserves original data for comparison
+- Demonstrates combination sampling for specific style-complexity pairs
+
+**Run with**:
+```bash
+oumi synth -c configs/examples/synthesis/data_augmentation.yaml
+```
+
+### 3. Instruction Following (`instruction_following.yaml`)
+
+**Purpose**: Generate instruction-response pairs with varying complexity and domains.
+
+**What it does**: Creates diverse task instructions across multiple domains (writing, analysis, coding, math, science, business) with different complexity levels and task formats.
+
+**Key features**:
+- Multi-domain instruction generation
+- Varying complexity levels (beginner, intermediate, advanced)
+- Different task formats (explain, create, analyze, solve, summarize)
+- Balanced distribution with targeted combinations
+
+**Run with**:
+```bash
+oumi synth -c configs/examples/synthesis/instruction_following.yaml
+```
+
+### 4. Conversation Synthesis (`conversation_synthesis.yaml`)
+
+**Purpose**: Create multi-turn conversations by chaining generated responses.
+
+**What it does**: Generates realistic customer support conversations with different scenarios, customer personalities, and resolution outcomes.
+
+**Key features**:
+- Multi-turn conversation generation (4 messages)
+- Different customer personalities (friendly, frustrated, confused, demanding, curious)
+- Various support scenarios (account issues, billing, product questions, technical support, refunds)
+- Converts to chat format for training
+- Demonstrates conversation flow and natural progression
+
+**Run with**:
+```bash
+oumi synth -c configs/examples/synthesis/conversation_synthesis.yaml
+```
+
+### 5. Domain QA (`domain_qa.yaml`)
+
+**Purpose**: Generate domain-specific training data by conditioning on domain attributes.
+
+**What it does**: Creates medical Q&A data across different medical specialties with appropriate context and complexity levels.
+
+**Key features**:
+- Medical specialty focus (cardiology, dermatology, pediatrics, neurology, orthopedics, endocrinology)
+- Context-aware generation (patient education, diagnosis support, treatment guidance, prevention advice)
+- Complexity levels for different audiences (basic, intermediate, professional)
+- Includes medical terminology explanations
+- Demonstrates domain-specific content generation
+
+**Run with**:
+```bash
+oumi synth -c configs/examples/synthesis/domain_qa.yaml
+```
+
+## Usage Tips
+
+### Before Running
+
+1. **Set up API access**: Most examples use Claude 3.5 Sonnet. Make sure you have:
+ - Anthropic API key set in your environment (`ANTHROPIC_API_KEY`)
+ - Or modify the `inference_config` to use a different model/engine
+
+2. **Check output paths**: Examples save to files like `geography_qa_dataset.jsonl`. Modify `output_path` if needed.
+
+3. **Adjust sample counts**: Start with smaller `num_samples` for testing, then scale up.
+
+### Customization
+
+- **Change the model**: Modify `inference_config.model.model_name` and `engine`
+- **Adjust generation parameters**: Modify `temperature`, `max_new_tokens`, etc.
+- **Add your own data**: Replace `input_examples` or add `input_data` paths
+- **Modify attributes**: Change `sampled_attributes` and `generated_attributes` for your use case
+- **Control distribution**: Use `sample_rate` and `combination_sampling` to control output distribution
+
+### Common Modifications
+
+```yaml
+# Use a different model
+inference_config:
+ model:
+ model_name: gpt-4o
+ engine: OPENAI
+
+# Add your own input data
+strategy_params:
+ input_data:
+ - path: "path/to/your/data.jsonl"
+ attribute_map:
+ old_field: new_attribute
+
+# Generate more samples
+num_samples: 100
+
+# Use different output format
+output_path: my_custom_dataset.jsonl
+
+# Increase workers for higher throughput
+inference_config:
+  max_workers: 100  # Increase for higher generation throughput based on your API limits
+```
+
+## Next Steps
+
+After generating synthetic data:
+
+1. **Review the output**: Check the generated samples for quality and relevance
+2. **Use for training**: Include the dataset in your training configuration
+3. **Iterate and improve**: Modify the synthesis config based on results
+4. **Combine datasets**: Use multiple synthesis runs to create larger, more diverse datasets
+
+For more information, see the [Data Synthesis Guide](../../../docs/user_guides/synth.md).
diff --git a/configs/examples/synthesis/conversation_synthesis.yaml b/configs/examples/synthesis/conversation_synthesis.yaml
new file mode 100644
index 0000000000..46f0f39ba3
--- /dev/null
+++ b/configs/examples/synthesis/conversation_synthesis.yaml
@@ -0,0 +1,284 @@
+# Conversation Synthesis Example
+# Create multi-turn conversations by chaining generated responses.
+# This example generates realistic customer support conversations with different scenarios.
+
+strategy: GENERAL
+num_samples: 50
+output_path: conversation_dataset.jsonl
+
+strategy_params:
+ # Define conversation scenarios and customer types
+ sampled_attributes:
+ - id: system_instruction
+ name: System Instruction
+ description: The system instruction to use for the conversation
+ possible_values:
+ - id: customer_support
+ name: |
+ You are CareBot, a customer service assistant for BrightTech Electronics.
+
+ Your goals:
+ - Resolve customer issues within your capabilities.
+ - Use a polite, empathetic, and professional tone.
+ - Always summarize the result for the customer in natural language.
+ - When you need to perform an action, output it in a structured format so the system can execute it.
+ - If you do not have enough information to resolve the issue, ask the customer to clarify.
+
+ ### Tone Rules
+ - Be polite, concise, and solution-focused.
+ - Acknowledge frustration with empathy.
+ - Do not provide personal opinions or unrelated advice.
+ - Never ask for personal information like name, email, phone number, etc.
+
+ ### Action Rules
+ When an action is required, output ONLY in the following format:
+
+
+ type: [LOOKUP_ORDER | CHECK_SHIPPING | INITIATE_RETURN | ISSUE_REFUND | OFFER_DISCOUNT | ESCALATE]
+ parameters: { ... } # JSON-like key/value parameters
+
+
+ After the action block, provide a natural-language response to the customer.
+ Do not mix the action block with free text.
+
+ ### Action Types
+ 1. LOOKUP_ORDER
+ - parameters: { "order_id": string }
+ - Example: Look up an order when given an order number.
+
+ 2. CHECK_SHIPPING
+ - parameters: { "order_id": string }
+ - Example: Check the shipping carrier’s status for the order.
+
+ 3. INITIATE_RETURN
+ - parameters: { "order_id": string, "item_id": string, "reason": string }
+ - Example: Customer requests to return or exchange a product.
+
+ 4. ISSUE_REFUND
+ - parameters: { "order_id": string, "amount": number }
+ - Rules: Refunds up to $200 only. Escalate otherwise.
+
+ 5. OFFER_DISCOUNT
+ - parameters: { "order_id": string, "amount_percent": number (≤10) }
+ - Example: Offer up to 10% discount to resolve dissatisfaction.
+
+ 6. ESCALATE
+ - parameters: { "reason": string }
+ - Example: Customer requests human support, issue > $200, legal or account issues.
+
+ ### Example Responses
+ Customer: "Where's my order BT1234?"
+ Bot:
+
+ type: CHECK_SHIPPING
+ parameters: { "order_id": "BT1234" }
+
+ Let me check the shipping status for order BT1234. One moment please.
+
+ Customer: "I want to return my headphones."
+ Bot:
+
+ type: INITIATE_RETURN
+ parameters: { "order_id": "BT4567", "item_id": "HD998", "reason": "Product defective" }
+
+ I can help with that. I'll generate a return label for your headphones.
+ description: "N/A"
+
+ - id: scenario
+ name: Support Scenario
+ description: The type of customer support issue being addressed
+ possible_values:
+ - id: account_issue
+ name: Account Problems
+ description: Login issues, password resets, account access problems
+ - id: billing_inquiry
+ name: Billing Questions
+ description: Questions about charges, payments, subscription management
+ - id: product_question
+ name: Product Information
+ description: Feature questions, how-to guides, product comparisons
+ - id: technical_support
+ name: Technical Issues
+ description: Bug reports, troubleshooting, technical difficulties
+ - id: refund_request
+ name: Refund Requests
+ description: Return requests, cancellations, refund processing
+
+ - id: customer_type
+ name: Customer Personality
+ description: The customer's communication style and emotional state
+ possible_values:
+ - id: friendly
+ name: Friendly
+ description: Polite, patient, and appreciative
+ - id: frustrated
+ name: Frustrated
+ description: Annoyed but still reasonable
+ - id: confused
+ name: Confused
+ description: Uncertain and needs clear explanations
+ - id: demanding
+ name: Demanding
+ description: Urgent, wants immediate resolution
+ - id: curious
+ name: Curious
+ description: Asking lots of questions, wants to understand
+ - id: adversarial
+ name: Adversarial
+ description: Attempting to deceive or manipulate the agent to say or do something it shouldn't do
+
+ - id: resolution
+ name: Resolution Type
+ description: How the conversation should end
+ possible_values:
+ - id: resolved
+ name: Fully Resolved
+ description: Issue completely solved, customer satisfied
+ - id: escalated
+ name: Escalated
+ description: Transferred to specialist or supervisor
+ - id: pending
+ name: Pending
+ description: Requires follow-up or additional steps
+ - id: abandoned
+ name: Abandoned
+ description: Customer gives up and ends the conversation
+
+ # Generate the customer's opening message
+ generated_attributes:
+ - id: customer_opener
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are creating realistic customer support conversation starters.
+
+ The customer should sound natural and match their personality type.
+ - role: USER
+ content: |
+ Customer personality: {customer_type.description}
+ Issue type: {scenario.description}
+
+ Company name: BrightTech Electronics
+
+ Make it sound natural and realistic. Write only the customer's opening message.
+
+ Format your response as follows:
+ Customer:
+
+ End Customer
+
+ Write only the customer's opening message, do not include additional filler text.
+ postprocessing_params:
+ id: cleaned_opener
+ cut_prefix: "Customer:"
+ cut_suffix: "End Customer"
+ strip_whitespace: true
+
+ # Generate the support agent's response
+ - id: agent_response
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ {system_instruction}
+ - role: USER
+ content: "{cleaned_opener}"
+ postprocessing_params:
+ id: cleaned_agent_response
+ strip_whitespace: true
+
+ # Generate the customer's follow-up
+ - id: customer_followup
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are the customer continuing the conversation with the support agent.
+
+ Customer personality: {customer_type.description}
+ Issue type: {scenario.description}
+ Resolution type: {resolution.description}
+
+ Maintain the same personality and tone as your opening message.
+ React naturally to the agent's response.
+ Try to guide the conversation toward a {resolution} resolution.
+
+ Opening message: "{cleaned_opener}"
+
+ Format your response as follows:
+ Customer:
+
+ End Customer
+
+ Write only the customer's follow-up message, do not include additional filler text.
+ - role: USER
+ content: "{cleaned_agent_response}"
+ postprocessing_params:
+ id: cleaned_followup
+ cut_prefix: "Customer:"
+ cut_suffix: "End Customer"
+ strip_whitespace: true
+
+ # Generate the final agent response
+ - id: final_agent_response
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ {system_instruction}
+ - role: USER
+ content: |
+ Customer's original message: "{cleaned_opener}"
+ Your first response: "{cleaned_agent_response}"
+ Customer's follow-up: "{cleaned_followup}"
+
+ Format your response as follows:
+ Response:
+
+ End Response
+
+ Write only the final agent response, do not include additional filler text.
+ postprocessing_params:
+ id: cleaned_final_response
+ cut_prefix: "Response:"
+ cut_suffix: "End Response"
+ strip_whitespace: true
+
+ # Transform into conversation format
+ transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: SYSTEM
+ content: "{system_instruction}"
+ - role: USER
+ content: "{cleaned_opener}"
+ - role: ASSISTANT
+ content: "{cleaned_agent_response}"
+ - role: USER
+ content: "{cleaned_followup}"
+ - role: ASSISTANT
+ content: "{cleaned_final_response}"
+
+ # Include relevant attributes in output
+ passthrough_attributes:
+ - conversation
+ - scenario
+ - customer_type
+ - resolution
+ - cleaned_opener
+ - cleaned_agent_response
+ - cleaned_followup
+ - cleaned_final_response
+
+# Model configuration optimized for conversation generation
+inference_config:
+ model:
+ model_name: claude-3-5-sonnet-20240620
+ engine: ANTHROPIC
+ generation:
+ max_new_tokens: 8192
+ temperature: 0.7
+ top_p: 0.9
+ remote_params:
+ num_workers: 50
+ politeness_policy: 60
diff --git a/configs/examples/synthesis/data_augmentation.yaml b/configs/examples/synthesis/data_augmentation.yaml
new file mode 100644
index 0000000000..a42d62138d
--- /dev/null
+++ b/configs/examples/synthesis/data_augmentation.yaml
@@ -0,0 +1,198 @@
+# Data Augmentation Example
+# Expand existing datasets through both instruction rephrasing and related task generation.
+# This example creates both: 1) diverse ways to ask for the same information, and 2) similar but different tasks in the same domain.
+
+strategy: GENERAL
+num_samples: 50
+output_path: augmented_qa_dataset.jsonl
+
+strategy_params:
+ # Input existing data to augment
+ input_data:
+ - path: "data/dataset_examples/alpaca_format.jsonl"
+ attribute_map:
+ instruction: original_instruction
+ input: original_input
+ output: original_response
+
+ # Define augmentation approaches
+ sampled_attributes:
+ - id: augmentation_type
+ name: Augmentation Type
+ description: Whether to rephrase the same task or create a related task
+ possible_values:
+ - id: rephrase
+ name: Rephrase Same Task
+ description: Create different ways to ask for the exact same thing
+ sample_rate: 0.3 # 30% will be rephrases
+ - id: related_task
+ name: Related Task
+ description: Create similar but different tasks in the same domain
+ sample_rate: 0.7 # 70% will be related tasks
+
+ - id: phrasing_style
+ name: Instruction Phrasing Style
+ description: Different ways to phrase the same request
+ possible_values:
+ - id: direct
+ name: Direct
+ description: Straightforward, clear commands (e.g., "Calculate X", "Explain Y")
+ sample_rate: 0.25
+ - id: conversational
+ name: Conversational
+ description: Natural, question-based phrasing (e.g., "Can you help me...", "What would...")
+ sample_rate: 0.25
+ - id: formal
+ name: Formal
+ description: Professional, structured requests (e.g., "Please provide...", "I require...")
+ sample_rate: 0.25
+ - id: context_rich
+ name: Context-Rich
+ description: Instructions that include background or context (e.g., "I'm working on X and need...")
+ sample_rate: 0.25
+
+ - id: specificity_level
+ name: Instruction Specificity
+ description: How detailed and specific the instruction should be
+ possible_values:
+ - id: general
+ name: General
+ description: Broad, high-level requests
+ - id: specific
+ name: Specific
+ description: Detailed requests with specific requirements
+ - id: precise
+ name: Precise
+ description: Very specific with exact parameters or constraints
+
+ # Generate instruction variations or related tasks
+ generated_attributes:
+ - id: augmented_instruction
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are an expert at creating instruction variations for data augmentation.
+
+ Based on the augmentation type specified, you will either:
+ 1. REPHRASE: Create different ways to ask for the exact same thing while maintaining EXACT same meaning and requirements
+ 2. RELATED_TASK: Create similar but different tasks in the same domain that require different responses but use similar knowledge/skills
+
+ Always follow the specified phrasing style and specificity level guidelines.
+ If there was input data, incorporate it naturally or create appropriate new data for related tasks.
+ - role: USER
+ content: |
+ Original instruction: "{original_instruction}"
+ Original input (if any): "{original_input}"
+ Augmentation type: {augmentation_type}
+ Phrasing style: {phrasing_style.description}
+ Specificity level: {specificity_level.description}
+
+ Create a new instruction based on the augmentation type:
+ - If "rephrase": Rewrite the instruction using the specified style/specificity while asking for exactly the same thing
+ - If "related_task": Create a similar but different task in the same domain using the specified style/specificity
+
+ Format your response as follows:
+ Instruction:
+
+ End Instruction
+
+ Write only the instruction, do not include additional filler text or the answer.
+ postprocessing_params:
+ id: cleaned_instruction
+ cut_prefix: "Instruction:"
+ cut_suffix: "End Instruction"
+ strip_whitespace: true
+
+ - id: augmented_response
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are a helpful AI assistant providing accurate and informative responses.
+
+ Based on the augmentation type, you will either:
+ 1. For "rephrase": Provide a response with the same information and style as the original
+ 2. For "related_task": Provide an appropriate response that matches the style and quality of the original but addresses the new request
+
+ Always maintain the same level of helpfulness and accuracy as demonstrated in the original response.
+ - role: USER
+ content: |
+ Instruction: "{cleaned_instruction}"
+ Augmentation type: {augmentation_type}
+ Original instruction: "{original_instruction}"
+ Original response: "{original_response}"
+
+ Provide an appropriate response based on the augmentation type:
+ - If "rephrase": Maintain the same style and quality of the original response
+ - If "related_task": Create a new response that addresses the new instruction while matching the original's style and quality
+
+ Format your response as follows:
+ Response:
+
+ End Response
+
+ Write only the response, do not include additional filler text.
+ postprocessing_params:
+ id: cleaned_response
+ cut_prefix: "Response:"
+ cut_suffix: "End Response"
+ strip_whitespace: true
+
+ # Create variations by sampling different combinations
+ combination_sampling:
+ - combination:
+ augmentation_type: rephrase
+ phrasing_style: conversational
+ specificity_level: general
+ sample_rate: 0.15 # 15% rephrase + conversational + general
+ - combination:
+ augmentation_type: rephrase
+ phrasing_style: formal
+ specificity_level: precise
+ sample_rate: 0.15 # 15% rephrase + formal + precise
+ - combination:
+ augmentation_type: related_task
+ phrasing_style: direct
+ specificity_level: specific
+ sample_rate: 0.1 # 10% related + direct + specific
+ - combination:
+ augmentation_type: related_task
+ phrasing_style: context_rich
+ specificity_level: general
+ sample_rate: 0.1 # 10% related + context-rich + general
+
+ # Transform into conversation format with appropriate response
+ transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{cleaned_instruction}"
+ - role: ASSISTANT
+ content: "{cleaned_response}"
+
+ # Output the augmented data
+ passthrough_attributes:
+ - conversation
+ - cleaned_instruction
+ - cleaned_response
+ - augmentation_type
+ - phrasing_style
+ - specificity_level
+ - original_instruction
+ - original_input
+ - original_response
+
+# Configure the AI model
+inference_config:
+ model:
+ model_name: claude-3-5-sonnet-20240620
+ engine: ANTHROPIC
+ generation:
+ max_new_tokens: 8192
+ temperature: 0.7
+ top_p: 0.9
+ remote_params:
+ num_workers: 50
+ politeness_policy: 60
diff --git a/configs/examples/synthesis/domain_qa.yaml b/configs/examples/synthesis/domain_qa.yaml
new file mode 100644
index 0000000000..761c464351
--- /dev/null
+++ b/configs/examples/synthesis/domain_qa.yaml
@@ -0,0 +1,194 @@
+# Domain QA Example
+# Generate domain-specific training data by conditioning on domain attributes.
+# This example creates medical Q&A data with different specialties and contexts.
+
+strategy: GENERAL
+num_samples: 50
+output_path: domain_qa.jsonl
+
+strategy_params:
+ # Provide domain-specific input documents for context
+ input_examples:
+ - examples:
+ - example_specialty: "Cardiology"
+ example_specialty_description: "Heart and cardiovascular system disorders"
+ example_context_type: "Patient Education"
+ example_context_type_description: "Information to help patients understand their health"
+ example_complexity_level: "Basic"
+ example_complexity_level_description: "General information accessible to patients"
+ example_question: "What lifestyle changes can help prevent heart disease?"
+ - example_specialty: "Dermatology"
+ example_specialty_description: "Skin, hair, and nail conditions"
+ example_context_type: "Diagnosis Support"
+ example_context_type_description: "Information to aid in medical diagnosis"
+ example_complexity_level: "Intermediate"
+ example_complexity_level_description: "Detailed information for informed patients"
+ example_question: "How do the underlying pathophysiology and treatment approaches differ between atopic dermatitis and psoriasis, and what factors should guide treatment selection?"
+ - example_specialty: "Pediatrics"
+ example_specialty_description: "Medical care for infants, children, and adolescents"
+ example_context_type: "Parent Guidance"
+ example_context_type_description: "Information for parents about children's health"
+ example_complexity_level: "Professional"
+ example_complexity_level_description: "Technical information for healthcare providers"
+ example_question: "What are the evidence-based protocols for fever management in pediatric patients aged 3 months to 3 years, including risk stratification criteria for serious bacterial infections and antibiotic decision-making algorithms?"
+
+ # Define medical specialties and contexts
+ sampled_attributes:
+ - id: specialty
+ name: Medical Specialty
+ description: The specific area of medicine being addressed
+ possible_values:
+ - id: cardiology
+ name: Cardiology
+ description: Heart and cardiovascular system disorders
+ - id: dermatology
+ name: Dermatology
+ description: Skin, hair, and nail conditions
+ - id: pediatrics
+ name: Pediatrics
+ description: Medical care for infants, children, and adolescents
+ - id: neurology
+ name: Neurology
+ description: Nervous system and brain disorders
+ - id: orthopedics
+ name: Orthopedics
+ description: Musculoskeletal system and injuries
+ - id: endocrinology
+ name: Endocrinology
+ description: Hormonal and metabolic disorders
+
+ - id: context_type
+ name: Context Type
+ description: The setting or purpose of the medical information
+ possible_values:
+ - id: patient_education
+ name: Patient Education
+ description: Information to help patients understand their health
+ - id: diagnosis_support
+ name: Diagnosis Support
+ description: Information to aid in medical diagnosis
+ - id: treatment_guidance
+ name: Treatment Guidance
+ description: Information about treatment options and procedures
+ - id: prevention_advice
+ name: Prevention Advice
+ description: Preventive care and health maintenance information
+ - id: parent_guidance
+ name: Parent Guidance
+ description: Information for parents about children's health
+
+ - id: complexity_level
+ name: Medical Complexity
+ description: The technical depth of medical information
+ possible_values:
+ - id: basic
+ name: Basic
+ description: General information accessible to patients
+ sample_rate: 0.4
+ - id: intermediate
+ name: Intermediate
+ description: Detailed information for informed patients
+ sample_rate: 0.4
+ - id: professional
+ name: Professional
+ description: Technical information for healthcare providers
+ sample_rate: 0.2
+
+ # Generate domain-specific medical questions
+ generated_attributes:
+ - id: medical_question
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are a medical education specialist creating questions for the {specialty} field.
+
+ Requirements:
+ - Questions must be medically accurate and appropriate
+ - Match the specified complexity level and context
+ - Be relevant to the medical specialty
+ - Follow medical education best practices
+
+ Format your question as follows:
+ Question:
+
+ End Question
+
+ Write only the question, do not include additional filler text.
+ - role: USER
+ content: |
+ Specialty: {example_specialty} - {example_specialty_description}
+ Context: {example_context_type} - {example_context_type_description}
+ Complexity: {example_complexity_level} - {example_complexity_level_description}
+ - role: ASSISTANT
+ content: |
+ Question:
+ {example_question}
+ End Question
+ - role: USER
+ content: |
+ Specialty: {specialty.description}
+ Context: {context_type.description}
+ Complexity: {complexity_level.description}
+ postprocessing_params:
+ id: cleaned_question
+ cut_prefix: "Question:"
+ cut_suffix: "End Question"
+ strip_whitespace: true
+
+ - id: medical_answer
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are a knowledgeable medical educator providing accurate, educational information.
+
+ Your responses should:
+ - Be medically accurate and evidence-based
+ - Match the appropriate complexity level
+ - Include relevant context and explanations
+ - Be educational and informative
+ - Include appropriate disclaimers when needed
+
+ Provide a comprehensive, medically accurate answer.
+ Include appropriate disclaimers if this is patient-facing information.
+        Include an appropriate disclaimer recommending that readers seek professional medical advice.
+ - role: USER
+ content: |
+ {cleaned_question}
+
+ postprocessing_params:
+ id: cleaned_answer
+ strip_whitespace: true
+
+ # Transform into conversation format
+ transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{cleaned_question}"
+ - role: ASSISTANT
+ content: "{cleaned_answer}"
+
+ # Include all domain-specific attributes
+ passthrough_attributes:
+ - conversation
+ - cleaned_question
+ - cleaned_answer
+ - specialty
+ - context_type
+ - complexity_level
+
+# Configure model for medical domain generation
+inference_config:
+ model:
+ model_name: claude-3-5-sonnet-20240620
+ engine: ANTHROPIC
+ generation:
+ max_new_tokens: 8192
+ temperature: 0.7
+ top_p: 0.9
+ remote_params:
+ num_workers: 50
+ politeness_policy: 60
diff --git a/configs/examples/synthesis/instruction_following.yaml b/configs/examples/synthesis/instruction_following.yaml
new file mode 100644
index 0000000000..60af7222df
--- /dev/null
+++ b/configs/examples/synthesis/instruction_following.yaml
@@ -0,0 +1,205 @@
+# Instruction Following Example
+# Generate instruction-response pairs with varying complexity and domains.
+# This example creates diverse task instructions across multiple domains and difficulty levels.
+
+strategy: GENERAL
+num_samples: 50
+output_path: instruction_following_dataset.jsonl
+
+strategy_params:
+ # Provide seed examples for different types of instructions
+ input_examples:
+ - examples:
+ - example_domain: "Creative Writing"
+ example_domain_description: "Tasks involving creative content creation, storytelling, poetry"
+ example_complexity: "Beginner"
+ example_complexity_description: "Simple, straightforward tasks suitable for beginners"
+ example_task_format: "Creation"
+ example_task_format_description: "Create original content, code, or solutions"
+ example_instruction: "Write a short story about a robot learning to paint"
+ - example_domain: "Analysis & Research"
+ example_domain_description: "Tasks requiring analytical thinking, comparison, evaluation"
+ example_complexity: "Intermediate"
+ example_complexity_description: "Moderately complex tasks requiring some expertise"
+ example_task_format: "Analysis"
+ example_task_format_description: "Analyze, compare, or evaluate something"
+ example_instruction: "Compare and contrast renewable vs non-renewable energy sources"
+ - example_domain: "Programming"
+ example_domain_description: "Tasks requiring software development, coding challenges, technical problems"
+ example_complexity: "Advanced"
+ example_complexity_description: "Complex tasks requiring deep knowledge and skills"
+ example_task_format: "Creation"
+ example_task_format_description: "Create original content, code, or solutions"
+ example_instruction: "Implement a thread-safe LRU cache with O(1) operations using a combination of hash map and doubly linked list, including proper memory management and concurrent access handling"
+
+ # Define the domain and complexity attributes
+ sampled_attributes:
+ - id: domain
+ name: Task Domain
+ description: The subject area or field for the instruction
+ possible_values:
+ - id: writing
+ name: Creative Writing
+ description: Tasks involving creative content creation, storytelling, poetry
+ - id: analysis
+ name: Analysis & Research
+ description: Tasks requiring analytical thinking, comparison, evaluation
+ - id: coding
+ name: Programming
+ description: Software development, coding challenges, technical problems
+ - id: math
+ name: Mathematics
+ description: Mathematical problems, calculations, proofs
+ - id: science
+ name: Science
+ description: Scientific concepts, experiments, explanations
+ - id: business
+ name: Business
+ description: Business strategy, marketing, management tasks
+
+ - id: complexity
+ name: Task Complexity
+ description: How challenging the instruction should be
+ possible_values:
+ - id: beginner
+ name: Beginner
+ description: Simple, straightforward tasks suitable for beginners
+ sample_rate: 0.2
+ - id: intermediate
+ name: Intermediate
+ description: Moderately complex tasks requiring some expertise
+ sample_rate: 0.4
+ - id: advanced
+ name: Advanced
+ description: Complex tasks requiring deep knowledge and skills
+ sample_rate: 0.4
+
+ - id: task_format
+ name: Task Format
+ description: The type of task being requested
+ possible_values:
+ - id: explain
+ name: Explanation
+ description: Explain a concept, process, or phenomenon
+ - id: create
+ name: Creation
+ description: Create original content, code, or solutions
+ - id: analyze
+ name: Analysis
+ description: Analyze, compare, or evaluate something
+ - id: solve
+ name: Problem Solving
+ description: Solve a specific problem or challenge
+ - id: summarize
+ name: Summarization
+ description: Summarize or synthesize information
+
+ # Generate diverse instructions
+ generated_attributes:
+ - id: instruction
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are an expert at creating educational and training tasks across different domains.
+
+ Your goal is to create clear, specific, and engaging instructions that:
+ - Match the requested domain and complexity level
+ - Are appropriate for the specified task format
+ - Provide clear expectations for what should be delivered
+ - Are realistic and achievable
+
+ Format your response as follows:
+ Instruction:
+
+ End Instruction
+
+ Write only the instruction, do not include additional filler text.
+ - role: USER
+ content: |
+ Domain: {example_domain} - {example_domain_description}
+ Complexity: {example_complexity} - {example_complexity_description}
+ Task Format: {example_task_format} - {example_task_format_description}
+ - role: ASSISTANT
+ content: |
+ Instruction:
+ {example_instruction}
+ End Instruction
+ - role: USER
+ content: |
+ Domain: {domain} - {domain.description}
+ Complexity: {complexity} - {complexity.description}
+ Task Format: {task_format} - {task_format.description}
+ postprocessing_params:
+ id: cleaned_instruction
+ cut_prefix: "Instruction:"
+ cut_suffix: "End Instruction"
+ strip_whitespace: true
+
+ - id: response
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are a knowledgeable AI assistant who follows instructions carefully and provides high-quality responses.
+
+ Provide responses that:
+ - Directly address the instruction
+ - Match the expected complexity level
+ - Are well-structured and informative
+ - Include examples when helpful
+ - role: USER
+ content: "{cleaned_instruction}"
+ postprocessing_params:
+ id: cleaned_response
+ strip_whitespace: true
+
+ # Ensure good distribution of challenging combinations
+ combination_sampling:
+ - combination:
+ domain: coding
+ complexity: advanced
+ task_format: solve
+ sample_rate: 0.15 # 15% advanced coding problems
+ - combination:
+ domain: science
+ complexity: intermediate
+ task_format: explain
+ sample_rate: 0.1 # 10% intermediate science explanations
+ - combination:
+ domain: writing
+ complexity: beginner
+ task_format: create
+ sample_rate: 0.1 # 10% beginner creative writing
+
+ # Transform into conversation format
+ transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{cleaned_instruction}"
+ - role: ASSISTANT
+ content: "{cleaned_response}"
+
+ # Include all relevant attributes in output
+ passthrough_attributes:
+ - conversation
+ - cleaned_instruction
+ - cleaned_response
+ - domain
+ - complexity
+ - task_format
+
+# Model configuration for diverse instruction generation
+inference_config:
+ model:
+ model_name: claude-3-5-sonnet-20240620
+ engine: ANTHROPIC
+ generation:
+ max_new_tokens: 8192
+ temperature: 0.7
+ top_p: 0.9
+ remote_params:
+ num_workers: 50
+ politeness_policy: 60
diff --git a/configs/examples/synthesis/question_answer_generation.yaml b/configs/examples/synthesis/question_answer_generation.yaml
new file mode 100644
index 0000000000..2928db5b1f
--- /dev/null
+++ b/configs/examples/synthesis/question_answer_generation.yaml
@@ -0,0 +1,166 @@
+# Question-Answer Generation Example
+# Generate QA pairs from documents or contexts for training conversational models.
+# This example creates geography quiz questions with varying difficulty levels.
+
+strategy: GENERAL
+num_samples: 50
+output_path: geography_qa_dataset.jsonl
+
+strategy_params:
+ # Provide examples for the AI to learn from
+ input_examples:
+ - examples:
+ - example_question: "What is the capital of France?"
+ example_answer: "The capital of France is Paris."
+ example_difficulty: "Basic geography facts that most people should know"
+ example_topic: "Questions about capital cities of countries"
+ - example_question: "Which mountain range separates Europe from Asia?"
+ example_answer: "The Ural Mountains separate Europe from Asia."
+ example_difficulty: "Intermediate geography knowledge"
+ example_topic: "Questions about mountains, rivers, deserts, etc."
+ - example_question: "What is the longest river in South America?"
+ example_answer: "The Amazon River is the longest river in South America."
+ example_difficulty: "Advanced geography knowledge for experts"
+ example_topic: "Questions about mountains, rivers, deserts, etc."
+
+ # Define attributes that will vary across generated examples
+ sampled_attributes:
+ - id: difficulty
+ name: Difficulty Level
+ description: How challenging the geography question should be
+ possible_values:
+ - id: easy
+ name: Easy
+ description: Basic geography facts that most people should know
+ sample_rate: 0.4 # 40% of questions will be easy
+ - id: medium
+ name: Medium
+ description: Intermediate geography knowledge
+ sample_rate: 0.4 # 40% of questions will be medium
+ - id: hard
+ name: Hard
+ description: Advanced geography knowledge for experts
+ sample_rate: 0.2 # 20% of questions will be hard
+
+ - id: topic
+ name: Geography Topic
+ description: The specific area of geography to focus on
+ possible_values:
+ - id: capitals
+ name: Capital Cities
+ description: Questions about capital cities of countries
+ - id: physical
+ name: Physical Geography
+ description: Questions about mountains, rivers, deserts, etc.
+ - id: countries
+ name: Countries and Borders
+ description: Questions about countries, territories, and boundaries
+ - id: climate
+ name: Climate and Weather
+ description: Questions about climate zones and weather patterns
+
+ # Generate the question using LLM
+ generated_attributes:
+ - id: question
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are an expert geography teacher creating quiz questions.
+
+ Create questions that are:
+ - Clear and unambiguous
+ - Factually accurate
+ - Appropriate for the specified difficulty level
+ - Focused on the specified topic area
+
+ Format your question as follows:
+ Question:
+
+ End Question
+
+ Write only the question, do not include the answer.
+ - role: USER
+ content: |
+ Difficulty: {example_difficulty}
+ Topic: {example_topic}
+ - role: ASSISTANT
+ content: |
+ Question:
+ {example_question}
+ End Question
+ - role: USER
+ content: |
+ Difficulty: {difficulty.description}
+ Topic: {topic.description}
+ postprocessing_params:
+ id: cleaned_question
+ cut_prefix: "Question:"
+ cut_suffix: "End Question"
+ strip_whitespace: true
+
+ # Generate the answer using LLM
+ - id: answer
+ instruction_messages:
+ - role: SYSTEM
+ content: |
+ You are a knowledgeable geography expert providing accurate, informative answers.
+
+ Provide answers that are:
+ - Factually correct
+ - Complete but concise
+ - Educational and informative
+
+ Format your answer as follows:
+ Answer:
+
+ End Answer
+
+ Write only the answer, do not include additional filler text.
+ - role: USER
+ content: |
+ {example_question}
+ - role: ASSISTANT
+ content: |
+ Answer:
+ {example_answer}
+ End Answer
+ - role: USER
+ content: |
+ {cleaned_question}
+ postprocessing_params:
+ id: cleaned_answer
+ cut_prefix: "Answer:"
+ cut_suffix: "End Answer"
+ strip_whitespace: true
+
+ transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{cleaned_question}"
+ - role: ASSISTANT
+ content: "{cleaned_answer}"
+
+ # Specify which attributes to include in the final output
+ passthrough_attributes:
+ - conversation
+ - cleaned_question
+ - cleaned_answer
+ - difficulty
+ - topic
+
+# Configure the AI model for generation
+inference_config:
+ model:
+ model_name: claude-3-5-sonnet-20240620
+ engine: ANTHROPIC
+ generation:
+ max_new_tokens: 8192
+ temperature: 0.7
+ top_p: 0.9
+ remote_params:
+ num_workers: 50
+ politeness_policy: 60
diff --git a/docs/cli/commands.md b/docs/cli/commands.md
index a7ff292eb0..c497ae9dfc 100644
--- a/docs/cli/commands.md
+++ b/docs/cli/commands.md
@@ -115,6 +115,18 @@ For a detailed guide on distributed training, see {doc}`/user_guides/train/train
:width: 80
```
+## Data Synthesis
+
+For a detailed guide on data synthesis, see {doc}`/user_guides/synth`.
+
+```{typer} oumi.cli.main.app.synth
+ :prog: oumi synth
+ :make-sections:
+ :preferred: svg
+ :theme: monokai
+ :width: 80
+```
+
## Environment
This command is a great tool for debugging!
diff --git a/docs/user_guides/synth.md b/docs/user_guides/synth.md
new file mode 100644
index 0000000000..fd5666367d
--- /dev/null
+++ b/docs/user_guides/synth.md
@@ -0,0 +1,451 @@
+# Data Synthesis
+
+The `oumi synth` command enables you to generate synthetic datasets using large language models. Instead of manually creating training data, you can define rules and templates that automatically generate diverse, high-quality examples.
+
+## What You Can Build
+
+- **Question-Answer datasets** for training chatbots
+- **Instruction-following datasets** with varied complexity levels
+- **Domain-specific training data** (legal, medical, technical)
+- **Conversation datasets** with different personas or styles
+- **Data augmentation** to expand existing small datasets
+
+## How It Works
+
+The synthesis process follows three steps:
+
+1. **Define attributes** - What varies in your data (topic, difficulty, style, etc.)
+2. **Create templates** - How the AI should generate content using those attributes
+3. **Generate samples** - The system creates many examples by combining different attribute values
+
+## Your First Synthesis
+
+Let's create a simple question-answer dataset. Save this as `my_first_synth.yaml`:
+
+```yaml
+# Generate 10 geography questions
+strategy: GENERAL
+num_samples: 10
+output_path: geography_qa.jsonl
+
+strategy_params:
+ # Give the AI an example to learn from
+ input_examples:
+ - examples:
+ - example_question: "What is the capital of France?"
+
+ # Define what should vary across examples
+ sampled_attributes:
+ - id: difficulty
+ name: Difficulty Level
+ description: How challenging the question should be
+ possible_values:
+ - id: easy
+ name: Easy
+ description: Basic facts everyone should know
+ - id: hard
+ name: Hard
+ description: Detailed knowledge for experts
+
+ # Tell the AI how to generate questions and answers
+ generated_attributes:
+ - id: question
+ instruction_messages:
+ - role: SYSTEM
+ content: "You are a geography teacher creating quiz questions. Example: {example_question}"
+ - role: USER
+ content: "Create a {difficulty} geography question. Write the question only, not the answer."
+ - id: answer
+ instruction_messages:
+ - role: SYSTEM
+ content: "You are a helpful AI assistant."
+ - role: USER
+ content: "{question}"
+
+# Configure which AI model to use
+inference_config:
+ model:
+ model_name: claude-3-5-sonnet-20240620
+ engine: ANTHROPIC
+```
+
+Run it with:
+```bash
+oumi synth -c my_first_synth.yaml
+```
+
+**What happens:** The system will create 10 geography questions, some easy and some hard, saved to `geography_qa.jsonl`.
+
+## Understanding the Results
+
+After running synthesis, you'll see:
+- A preview table showing the first few generated samples
+- The total number of samples created
+- Instructions for using the dataset in training
+
+Each line in the output file contains one example:
+```json
+{"difficulty": "easy", "question": "What is the largest continent?", "answer": "Asia"}
+{"difficulty": "hard", "question": "Which country has the most time zones?", "answer": "France"}
+```
+
+## Next Steps: Building More Complex Datasets
+
+Once you're comfortable with the basics, you can create more sophisticated datasets:
+
+### Adding Multiple Attributes
+Mix and match different properties (topic + difficulty + style):
+```yaml
+sampled_attributes:
+ - id: topic
+ possible_values: [{id: geography}, {id: history}, {id: science}]
+ - id: difficulty
+ possible_values: [{id: easy}, {id: medium}, {id: hard}]
+ - id: style
+ possible_values: [{id: formal}, {id: casual}, {id: academic}]
+```
+
+### Using Your Own Data
+Feed in existing datasets or documents:
+```yaml
+input_data:
+ - path: "my_existing_data.jsonl"
+input_documents:
+ - path: "textbook.pdf"
+```
+
+### Creating Conversations
+Build multi-turn dialogues:
+```yaml
+transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{question}"
+ - role: ASSISTANT
+ content: "{answer}"
+```
+
+Ready to dive deeper? The sections below cover all available options in detail.
+
+---
+
+## Complete Configuration Reference
+
+### Top-Level Parameters
+
+- **`strategy`**: The synthesis strategy to use (currently only `GENERAL` is supported)
+- **`num_samples`**: Number of synthetic samples to generate
+- **`output_path`**: Path where the generated dataset will be saved (must end with `.jsonl`)
+- **`strategy_params`**: Parameters specific to the synthesis strategy
+- **`inference_config`**: Configuration for the model used in generation
+
+### Strategy Parameters
+
+The `strategy_params` section defines the core synthesis logic:
+
+#### Input Sources
+
+You can provide data from multiple sources:
+
+**`input_data`**: Existing datasets to sample from
+```yaml
+input_data:
+ - path: "hf:dataset_name" # HuggingFace dataset
+ hf_split: train
+ - path: "/path/to/local/data.jsonl" # Local file
+ attribute_map:
+ old_column_name: new_attribute_name
+```
+
+**`input_documents`**: Documents to segment and use in synthesis
+```yaml
+input_documents:
+ - path: "/path/to/document.pdf"
+ id: my_doc
+ segmentation_params:
+ id: doc_segment
+ segment_length: 2048
+ segment_overlap: 200
+```
+
+**`input_examples`**: Inline examples for few-shot learning
+```yaml
+input_examples:
+ - examples:
+ - attribute1: "value1"
+ attribute2: "value2"
+ - attribute1: "value3"
+ attribute2: "value4"
+```
+
+#### Attribute Types
+
+**Sampled Attributes**: Randomly selected values from predefined options
+```yaml
+sampled_attributes:
+ - id: difficulty
+ name: Difficulty Level
+ description: How challenging the question should be
+ possible_values:
+ - id: easy
+ name: Easy
+ description: Simple, straightforward questions
+ sample_rate: 0.4 # 40% of samples
+ - id: medium
+ name: Medium
+ description: Moderately challenging questions
+ sample_rate: 0.4 # 40% of samples
+ - id: hard
+ name: Hard
+ description: Complex, advanced questions
+ # No sample_rate specified = 20% (remaining)
+```
+
+**Generated Attributes**: Created by LLM using instruction messages
+```yaml
+generated_attributes:
+ - id: summary
+ instruction_messages:
+ - role: SYSTEM
+ content: "You are a helpful summarization assistant."
+ - role: USER
+ content: "Summarize this text: {input_text}. Format your result as 'Summary: '"
+ postprocessing_params:
+ id: clean_summary
+ cut_prefix: "Summary: "
+ strip_whitespace: true
+```
+
+**Transformed Attributes**: Rule-based transformations of existing attributes
+```yaml
+transformed_attributes:
+ - id: conversation
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{question}"
+ - role: ASSISTANT
+ content: "{answer}"
+```
+
+#### Advanced Features
+
+**Combination Sampling**: Control probability of specific attribute combinations
+```yaml
+combination_sampling:
+ - combination:
+ difficulty: hard
+ topic: science
+ sample_rate: 0.1 # 10% of samples will have hard science questions
+```
+
+**Passthrough Attributes**: Specify which attributes to include in final output
+```yaml
+passthrough_attributes:
+ - question
+ - answer
+ - difficulty
+ - topic
+```
+
+## Attribute Referencing
+
+In instruction messages and transformations, you can reference attributes using `{attribute_id}` syntax:
+
+- `{attribute_id}`: The value/name of the attribute
+- `{attribute_id.description}`: The description of a sampled attribute value
+- `{attribute_id.parent}`: The parent name of a sampled attribute
+- `{attribute_id.parent.description}`: The parent description of a sampled attribute
+
+## Postprocessing
+
+Generated attributes can be postprocessed to clean up the output:
+
+```yaml
+postprocessing_params:
+ id: cleaned_attribute
+ keep_original_text_attribute: true # Keep original alongside cleaned version
+ cut_prefix: "Answer: " # Remove this prefix and everything before it
+ cut_suffix: "\n\n" # Remove this suffix and everything after it
+ regex: "\\*\\*(.+?)\\*\\*" # Extract content between ** **
+ strip_whitespace: true # Remove leading/trailing whitespace
+ added_prefix: "Response: " # Add this prefix
+ added_suffix: "." # Add this suffix
+```
+
+## Transformation Strategies
+
+### String Transformation
+```yaml
+transformation_strategy:
+ type: STRING
+ string_transform: "Question: {question}\nAnswer: {answer}"
+```
+
+### List Transformation
+```yaml
+transformation_strategy:
+ type: LIST
+ list_transform:
+ - "{question}"
+ - "{answer}"
+```
+
+### Dictionary Transformation
+```yaml
+transformation_strategy:
+ type: DICT
+ dict_transform:
+ question: "{question}"
+ answer: "{answer}"
+```
+
+### Chat Transformation
+```yaml
+transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{question}"
+ - role: ASSISTANT
+ content: "{answer}"
+```
+
+## Document Segmentation
+
+When using documents, you can segment them for processing:
+
+```yaml
+input_documents:
+ - path: "/path/to/document.pdf"
+ id: research_paper
+ segmentation_params:
+ id: paper_segment
+ segmentation_strategy: TOKENS
+ tokenizer: "openai-community/gpt2"
+ segment_length: 1024
+ segment_overlap: 128
+ keep_original_text: true
+```
+
+## Inference Configuration
+
+Configure the model and generation parameters:
+
+```yaml
+inference_config:
+ model:
+ model_name: "claude-3-5-sonnet-20240620"
+ engine: ANTHROPIC
+ generation:
+ max_new_tokens: 1024
+ temperature: 0.7
+ top_p: 0.9
+ remote_params:
+ num_workers: 5
+ politeness_policy: 60 # Delay between requests in seconds
+```
+
+### Supported Engines
+
+- `ANTHROPIC`: Claude models (requires API key)
+- `OPENAI`: OpenAI models (requires API key)
+- `VLLM`: Local vLLM inference server
+- `NATIVE_TEXT`: Local HuggingFace transformers
+- And many more (see {doc}`/user_guides/infer/inference_engines`)
+
+## Command Line Options
+
+The `oumi synth` command supports these options:
+
+- `--config`, `-c`: Path to synthesis configuration file (required)
+- `--level`: Set logging level (DEBUG, INFO, WARNING, ERROR)
+
+You can also use CLI overrides to modify configuration parameters:
+
+```bash
+oumi synth -c config.yaml \
+ --num_samples 50 \
+ --inference_config.generation.temperature 0.5 \
+ --strategy_params.sampled_attributes[0].possible_values[0].sample_rate 0.8
+```
+
+## Output Format
+
+The synthesized dataset is saved as a JSONL file where each line contains a JSON object with the attributes in the config:
+
+```json
+{"difficulty": "easy", "topic": "geography", "question": "What is the capital of France?", "answer": "Paris"}
+{"difficulty": "medium", "topic": "history", "question": "When did World War II end?", "answer": "World War II ended in 1945"}
+```
+
+After synthesis completes, you'll see a preview table and instructions on how to use the generated dataset for training:
+
+```
+Successfully synthesized 100 samples and saved to synthetic_qa_dataset.jsonl
+
+To train a model, run: oumi train -c path/to/your/train/config.yaml
+
+If you included a 'conversation' chat attribute in your config, update the
+config to use your new dataset:
+data:
+ train:
+ datasets:
+ - dataset_name: "text_sft_jsonl"
+ dataset_path: "synthetic_qa_dataset.jsonl"
+```
+
+## Best Practices
+
+1. **Start Small**: Begin with a small `num_samples` to test your configuration
+2. **Use Examples**: Provide good examples in `input_examples` for better generation quality
+3. **Postprocess Outputs**: Use postprocessing to clean and format generated text
+4. **Monitor Costs**: Be aware of API costs when using commercial models
+5. **Validate Results**: Review generated samples before using for training
+6. **Version Control**: Keep your synthesis configs in version control
+
+## Common Use Cases
+
+### Question-Answer Generation
+Generate QA pairs from documents or contexts for training conversational models.
+
+**Example**: See [`configs/examples/synthesis/question_answer_generation.yaml`](../../configs/examples/synthesis/question_answer_generation.yaml) for a complete geography Q&A generation example.
+
+### Data Augmentation
+Create variations of existing datasets by sampling different attributes and regenerating content.
+
+**Example**: See [`configs/examples/synthesis/data_augmentation.yaml`](../../configs/examples/synthesis/data_augmentation.yaml) for an example that augments existing datasets with different styles and complexity levels.
+
+### Instruction Following
+Generate instruction-response pairs with varying complexity and domains.
+
+**Example**: See [`configs/examples/synthesis/instruction_following.yaml`](../../configs/examples/synthesis/instruction_following.yaml) for a multi-domain instruction generation example covering writing, coding, analysis, and more.
+
+### Conversation Synthesis
+Create multi-turn conversations by chaining generated responses.
+
+**Example**: See [`configs/examples/synthesis/conversation_synthesis.yaml`](../../configs/examples/synthesis/conversation_synthesis.yaml) for a customer support conversation generation example.
+
+### Domain Adaptation
+Generate domain-specific training data by conditioning on domain attributes.
+
+**Example**: See [`configs/examples/synthesis/domain_qa.yaml`](../../configs/examples/synthesis/domain_qa.yaml) for a medical domain Q&A generation example with specialty-specific content.
+
+## Troubleshooting
+
+**Empty results**: Check that your instruction messages are well-formed and you have proper API access.
+
+**Slow generation**: Increase `num_workers` or lower `politeness_policy` to improve throughput.
+
+**Out of memory**: Use a smaller model or reduce `max_new_tokens` in generation config.
+
+**Validation errors**: Ensure all attribute IDs are unique and required fields are not empty.
+
+For more help, see the [FAQ](../faq/troubleshooting.md) or report issues at https://github.com/oumi-ai/oumi/issues.
From 19fd1c0f2365b689b141523028d3c49abfa14fdd Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Wed, 3 Sep 2025 13:18:06 -0700
Subject: [PATCH 2/7] Fix tests
---
tests/unit/core/configs/test_parse_configs.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tests/unit/core/configs/test_parse_configs.py b/tests/unit/core/configs/test_parse_configs.py
index 84e4084a37..6b451e00f4 100644
--- a/tests/unit/core/configs/test_parse_configs.py
+++ b/tests/unit/core/configs/test_parse_configs.py
@@ -13,6 +13,7 @@
QuantizationConfig,
TrainingConfig,
)
+from oumi.core.configs.synthesis_config import SynthesisConfig
from oumi.core.types import HardwareException
@@ -57,8 +58,9 @@ def test_parse_configs(config_path: str):
InferenceConfig,
JobConfig,
JudgeConfig,
- TrainingConfig,
QuantizationConfig,
+ SynthesisConfig,
+ TrainingConfig,
]
error_messages = []
for config_class in valid_config_classes:
@@ -88,8 +90,9 @@ def test_parse_configs_from_yaml_and_arg_list(config_path: str):
InferenceConfig,
JobConfig,
JudgeConfig,
- TrainingConfig,
QuantizationConfig,
+ SynthesisConfig,
+ TrainingConfig,
]
error_messages = []
for config_class in valid_config_classes:
From f2cebc8d53d31e46168b9c026f5747cbc54eefe3 Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Wed, 3 Sep 2025 13:32:38 -0700
Subject: [PATCH 3/7] Update init
---
src/oumi/core/configs/__init__.py | 36 +++++++++++++++++++
tests/unit/core/configs/test_parse_configs.py | 2 +-
2 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/src/oumi/core/configs/__init__.py b/src/oumi/core/configs/__init__.py
index a50640303b..b66089d6c2 100644
--- a/src/oumi/core/configs/__init__.py
+++ b/src/oumi/core/configs/__init__.py
@@ -123,6 +123,26 @@
)
from oumi.core.configs.params.profiler_params import ProfilerParams
from oumi.core.configs.params.remote_params import RemoteParams
+from oumi.core.configs.params.synthesis_params import (
+ AttributeCombination,
+ DocumentSegmentationParams,
+ DocumentSource,
+ ExampleSource,
+ GeneralSynthesisParams,
+ GeneratedAttribute,
+ GeneratedAttributePostprocessingParams,
+ SampledAttribute,
+ SampledAttributeValue,
+ SegmentationStrategy,
+ TextConversation,
+ TextMessage,
+ TransformationStrategy,
+ TransformationType,
+ TransformedAttribute,
+)
+from oumi.core.configs.params.synthesis_params import (
+ DatasetSource as DatasetSourceParam,
+)
from oumi.core.configs.params.telemetry_params import TelemetryParams
from oumi.core.configs.params.training_params import (
MixedPrecisionDtype,
@@ -182,4 +202,20 @@
"TrainerType",
"TrainingConfig",
"TrainingParams",
+ "AttributeCombination",
+ "DatasetSourceParam",
+ "DocumentSegmentationParams",
+ "DocumentSource",
+ "ExampleSource",
+ "GeneratedAttributePostprocessingParams",
+ "GeneralSynthesisParams",
+ "GeneratedAttribute",
+ "SampledAttribute",
+ "SampledAttributeValue",
+ "SegmentationStrategy",
+ "TextConversation",
+ "TextMessage",
+ "TransformationStrategy",
+ "TransformationType",
+ "TransformedAttribute",
]
diff --git a/tests/unit/core/configs/test_parse_configs.py b/tests/unit/core/configs/test_parse_configs.py
index 6b451e00f4..14e233f149 100644
--- a/tests/unit/core/configs/test_parse_configs.py
+++ b/tests/unit/core/configs/test_parse_configs.py
@@ -11,9 +11,9 @@
JobConfig,
JudgeConfig,
QuantizationConfig,
+ SynthesisConfig,
TrainingConfig,
)
-from oumi.core.configs.synthesis_config import SynthesisConfig
from oumi.core.types import HardwareException
From 23965e8683c9110d5030c84feddfd61988e9d965 Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Tue, 9 Sep 2025 10:49:37 -0700
Subject: [PATCH 4/7] Add examples for transformation strategies
---
docs/user_guides/synth.md | 86 +++++++++++++++++++++++++++++----------
1 file changed, 65 insertions(+), 21 deletions(-)
diff --git a/docs/user_guides/synth.md b/docs/user_guides/synth.md
index fd5666367d..383059d0b5 100644
--- a/docs/user_guides/synth.md
+++ b/docs/user_guides/synth.md
@@ -280,44 +280,88 @@ postprocessing_params:
```
## Transformation Strategies
+For the following examples, let's assume we have a data sample with the following values.
+```
+{
+ "question": "What color is the sky?",
+ "answer": "The sky is blue."
+}
+```
### String Transformation
```yaml
-transformation_strategy:
- type: STRING
- string_transform: "Question: {question}\nAnswer: {answer}"
+transformed_attributes:
+ - id: example_string_attribute
+ transformation_strategy:
+ type: STRING
+ string_transform: "Question: {question}\nAnswer: {answer}"
+```
+
+Example Result:
+```
+{
+ "example_string_attribute": "Question: What color is the sky?\nAnswer: The sky is blue."
+}
```
### List Transformation
```yaml
-transformation_strategy:
- type: LIST
- list_transform:
- - "{question}"
- - "{answer}"
+transformed_attributes:
+ - id: example_list_attribute
+ transformation_strategy:
+ type: LIST
+ list_transform:
+ - "{question}"
+ - "{answer}"
+```
+
+Example Result:
+```
+{
+ "example_list_attribute": [
+ "What color is the sky?",
+ "The sky is blue."
+ ]
+}
```
### Dictionary Transformation
```yaml
-transformation_strategy:
- type: DICT
- dict_transform:
- question: "{question}"
- answer: "{answer}"
+transformed_attributes:
+ - id: example_dict_attribute
+ transformation_strategy:
+ type: DICT
+ dict_transform:
+ question: "{question}"
+ answer: "{answer}"
+```
+
+Example Result:
+```
+{
+ "example_dict_attribute": {
+ "question": "What color is the sky?",
+ "answer": "The sky is blue."
+ }
+}
```
### Chat Transformation
```yaml
-transformation_strategy:
- type: CHAT
- chat_transform:
- messages:
- - role: USER
- content: "{question}"
- - role: ASSISTANT
- content: "{answer}"
+transformed_attributes:
+ - id: example_chat_attribute
+ transformation_strategy:
+ type: CHAT
+ chat_transform:
+ messages:
+ - role: USER
+ content: "{question}"
+ - role: ASSISTANT
+ content: "{answer}"
```
+
+
## Document Segmentation
When using documents, you can segment them for processing:
From d62c5095817bdecb896614f8600af391d5386806 Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Tue, 9 Sep 2025 10:57:17 -0700
Subject: [PATCH 5/7] Rename configs to end in _synth.yaml
---
.../{conversation_synthesis.yaml => conversation_synth.yaml} | 0
.../{data_augmentation.yaml => data_augmentation_synth.yaml} | 0
.../examples/synthesis/{domain_qa.yaml => domain_qa_synth.yaml} | 0
...nstruction_following.yaml => instruction_following_synth.yaml} | 0
...question_answer_generation.yaml => question_answer_synth.yaml} | 0
5 files changed, 0 insertions(+), 0 deletions(-)
rename configs/examples/synthesis/{conversation_synthesis.yaml => conversation_synth.yaml} (100%)
rename configs/examples/synthesis/{data_augmentation.yaml => data_augmentation_synth.yaml} (100%)
rename configs/examples/synthesis/{domain_qa.yaml => domain_qa_synth.yaml} (100%)
rename configs/examples/synthesis/{instruction_following.yaml => instruction_following_synth.yaml} (100%)
rename configs/examples/synthesis/{question_answer_generation.yaml => question_answer_synth.yaml} (100%)
diff --git a/configs/examples/synthesis/conversation_synthesis.yaml b/configs/examples/synthesis/conversation_synth.yaml
similarity index 100%
rename from configs/examples/synthesis/conversation_synthesis.yaml
rename to configs/examples/synthesis/conversation_synth.yaml
diff --git a/configs/examples/synthesis/data_augmentation.yaml b/configs/examples/synthesis/data_augmentation_synth.yaml
similarity index 100%
rename from configs/examples/synthesis/data_augmentation.yaml
rename to configs/examples/synthesis/data_augmentation_synth.yaml
diff --git a/configs/examples/synthesis/domain_qa.yaml b/configs/examples/synthesis/domain_qa_synth.yaml
similarity index 100%
rename from configs/examples/synthesis/domain_qa.yaml
rename to configs/examples/synthesis/domain_qa_synth.yaml
diff --git a/configs/examples/synthesis/instruction_following.yaml b/configs/examples/synthesis/instruction_following_synth.yaml
similarity index 100%
rename from configs/examples/synthesis/instruction_following.yaml
rename to configs/examples/synthesis/instruction_following_synth.yaml
diff --git a/configs/examples/synthesis/question_answer_generation.yaml b/configs/examples/synthesis/question_answer_synth.yaml
similarity index 100%
rename from configs/examples/synthesis/question_answer_generation.yaml
rename to configs/examples/synthesis/question_answer_synth.yaml
From e3ba4ddce387b8d07c3a634b5b6aa5afe66f4fbc Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Tue, 9 Sep 2025 11:31:37 -0700
Subject: [PATCH 6/7] Add better header comments
---
.../examples/synthesis/conversation_synth.yaml | 18 ++++++++++++++++++
.../synthesis/data_augmentation_synth.yaml | 18 ++++++++++++++++++
.../examples/synthesis/domain_qa_synth.yaml | 18 ++++++++++++++++++
.../synthesis/instruction_following_synth.yaml | 18 ++++++++++++++++++
.../synthesis/question_answer_synth.yaml | 18 ++++++++++++++++++
5 files changed, 90 insertions(+)
diff --git a/configs/examples/synthesis/conversation_synth.yaml b/configs/examples/synthesis/conversation_synth.yaml
index 46f0f39ba3..17ce31b2de 100644
--- a/configs/examples/synthesis/conversation_synth.yaml
+++ b/configs/examples/synthesis/conversation_synth.yaml
@@ -1,6 +1,24 @@
# Conversation Synthesis Example
# Create multi-turn conversations by chaining generated responses.
# This example generates realistic customer support conversations with different scenarios.
+#
+# Requirements:
+# - Set your Anthropic API key in the environment variable ANTHROPIC_API_KEY
+# - example: `export ANTHROPIC_API_KEY=your_api_key`
+# - Alternatively, change the model in the inference_config section to your desired model
+# - See the inference_config section for more details
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/infer/configuration.html
+#
+# Usage:
+# oumi synth -c oumi://configs/examples/synthesis/conversation_synth.yaml
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/synth.html
+# - Config class: oumi.core.configs.SynthesisConfig
+# - Params class: oumi.core.configs.params.synthesis_params.GeneralSynthesisParams
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/synthesis_config.py
+# - Params source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/params/synthesis_params.py
+# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
num_samples: 50
diff --git a/configs/examples/synthesis/data_augmentation_synth.yaml b/configs/examples/synthesis/data_augmentation_synth.yaml
index a42d62138d..ace4c6bdd4 100644
--- a/configs/examples/synthesis/data_augmentation_synth.yaml
+++ b/configs/examples/synthesis/data_augmentation_synth.yaml
@@ -1,6 +1,24 @@
# Data Augmentation Example
# Expand existing datasets through both instruction rephrasing and related task generation.
# This example creates both: 1) diverse ways to ask for the same information, and 2) similar but different tasks in the same domain.
+#
+# Requirements:
+# - Set your Anthropic API key in the environment variable ANTHROPIC_API_KEY
+# - example: `export ANTHROPIC_API_KEY=your_api_key`
+# - Alternatively, change the model in the inference_config section to your desired model
+# - See the inference_config section for more details
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/infer/configuration.html
+#
+# Usage:
+# oumi synth -c oumi://configs/examples/synthesis/data_augmentation_synth.yaml
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/synth.html
+# - Config class: oumi.core.configs.SynthesisConfig
+# - Params class: oumi.core.configs.params.synthesis_params.GeneralSynthesisParams
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/synthesis_config.py
+# - Params source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/params/synthesis_params.py
+# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
num_samples: 50
diff --git a/configs/examples/synthesis/domain_qa_synth.yaml b/configs/examples/synthesis/domain_qa_synth.yaml
index 761c464351..2645f6326e 100644
--- a/configs/examples/synthesis/domain_qa_synth.yaml
+++ b/configs/examples/synthesis/domain_qa_synth.yaml
@@ -1,6 +1,24 @@
# Domain QA Example
# Generate domain-specific training data by conditioning on domain attributes.
# This example creates medical Q&A data with different specialties and contexts.
+#
+# Requirements:
+# - Set your Anthropic API key in the environment variable ANTHROPIC_API_KEY
+# - example: `export ANTHROPIC_API_KEY=your_api_key`
+# - Alternatively, change the model in the inference_config section to your desired model
+# - See the inference_config section for more details
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/infer/configuration.html
+#
+# Usage:
+# oumi synth -c oumi://configs/examples/synthesis/domain_qa_synth.yaml
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/synth.html
+# - Config class: oumi.core.configs.SynthesisConfig
+# - Params class: oumi.core.configs.params.synthesis_params.GeneralSynthesisParams
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/synthesis_config.py
+# - Params source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/params/synthesis_params.py
+# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
num_samples: 50
diff --git a/configs/examples/synthesis/instruction_following_synth.yaml b/configs/examples/synthesis/instruction_following_synth.yaml
index 60af7222df..a904dfe0f4 100644
--- a/configs/examples/synthesis/instruction_following_synth.yaml
+++ b/configs/examples/synthesis/instruction_following_synth.yaml
@@ -1,6 +1,24 @@
# Instruction Following Example
# Generate instruction-response pairs with varying complexity and domains.
# This example creates diverse task instructions across multiple domains and difficulty levels.
+#
+# Requirements:
+# - Set your Anthropic API key in the environment variable ANTHROPIC_API_KEY
+# - example: `export ANTHROPIC_API_KEY=your_api_key`
+# - Alternatively, change the model in the inference_config section to your desired model
+# - See the inference_config section for more details
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/infer/configuration.html
+#
+# Usage:
+# oumi synth -c oumi://configs/examples/synthesis/instruction_following_synth.yaml
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/synth.html
+# - Config class: oumi.core.configs.SynthesisConfig
+# - Params class: oumi.core.configs.params.synthesis_params.GeneralSynthesisParams
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/synthesis_config.py
+# - Params source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/params/synthesis_params.py
+# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
num_samples: 50
diff --git a/configs/examples/synthesis/question_answer_synth.yaml b/configs/examples/synthesis/question_answer_synth.yaml
index 2928db5b1f..7401104509 100644
--- a/configs/examples/synthesis/question_answer_synth.yaml
+++ b/configs/examples/synthesis/question_answer_synth.yaml
@@ -1,6 +1,24 @@
# Question-Answer Generation Example
# Generate QA pairs from documents or contexts for training conversational models.
# This example creates geography quiz questions with varying difficulty levels.
+#
+# Requirements:
+# - Set your Anthropic API key in the environment variable ANTHROPIC_API_KEY
+# - example: `export ANTHROPIC_API_KEY=your_api_key`
+# - Alternatively, change the model in the inference_config section to your desired model
+# - See the inference_config section for more details
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/infer/configuration.html
+#
+# Usage:
+# oumi synth -c oumi://configs/examples/synthesis/question_answer_synth.yaml
+#
+# See Also:
+# - Documentation: https://oumi.ai/docs/en/latest/user_guides/synth.html
+# - Config class: oumi.core.configs.SynthesisConfig
+# - Params class: oumi.core.configs.params.synthesis_params.GeneralSynthesisParams
+# - Config source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/synthesis_config.py
+# - Params source: https://github.com/oumi-ai/oumi/blob/main/src/oumi/core/configs/params/synthesis_params.py
+# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
num_samples: 50
From 1dec94f1b20c3afb63358953710e59d639f34447 Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Wed, 10 Sep 2025 12:01:10 -0700
Subject: [PATCH 7/7] Add examples and comments to configs
---
configs/examples/synthesis/README.md | 183 ++++++++++++++-
.../synthesis/conversation_synth.yaml | 216 ++++++++++++------
.../synthesis/data_augmentation_synth.yaml | 14 +-
.../examples/synthesis/domain_qa_synth.yaml | 11 +-
.../instruction_following_synth.yaml | 15 +-
.../synthesis/question_answer_synth.yaml | 13 +-
6 files changed, 357 insertions(+), 95 deletions(-)
diff --git a/configs/examples/synthesis/README.md b/configs/examples/synthesis/README.md
index 79707b7d69..4fafebfca0 100644
--- a/configs/examples/synthesis/README.md
+++ b/configs/examples/synthesis/README.md
@@ -4,7 +4,7 @@ This directory contains example configurations for different data synthesis use
## Available Examples
-### 1. Question-Answer Generation (`question_answer_generation.yaml`)
+### 1. Question-Answer Generation (`question_answer_synth.yaml`)
**Purpose**: Generate QA pairs from documents or contexts for training conversational models.
@@ -18,10 +18,38 @@ This directory contains example configurations for different data synthesis use
**Run with**:
```bash
-oumi synth -c configs/examples/synthesis/question_answer_generation.yaml
+oumi synth -c configs/examples/synthesis/question_answer_synth.yaml
```
-### 2. Data Augmentation (`data_augmentation.yaml`)
+
+Example Output
+
+```json
+{
+ "difficulty": "easy",
+ "topic": "climate",
+ "cleaned_question": "Which climate has hot temperatures year-round and high levels of rainfall?",
+ "cleaned_answer": "The climate zone characterized by hot temperatures year-round and high levels of rainfall, typically found near the Earth's equator, is the tropical rainforest climate, also known as the equatorial climate or tropical wet climate.",
+ "conversation": {
+ "conversation_id": "conversation-f6ffd3b5-2605-420a-be16-704772a17de8",
+ "messages": [
+ {
+ "content": "Which climate has hot temperatures year-round and high levels of rainfall?",
+ "role": "user"
+ },
+ {
+ "content": "The climate zone characterized by hot temperatures year-round and high levels of rainfall, typically found near the Earth's equator, is the tropical rainforest climate, also known as the equatorial climate or tropical wet climate.",
+ "role": "assistant"
+ }
+ ],
+ "metadata": {}
+ }
+}
+```
+
+
+
+### 2. Data Augmentation (`data_augmentation_synth.yaml`)
**Purpose**: Create variations of existing datasets by sampling different attributes and regenerating content.
@@ -35,10 +63,40 @@ oumi synth -c configs/examples/synthesis/question_answer_generation.yaml
**Run with**:
```bash
-oumi synth -c configs/examples/synthesis/data_augmentation.yaml
+oumi synth -c configs/examples/synthesis/data_augmentation_synth.yaml
+```
+
+
+Example Output
+
+```json
+{
+ "original_instruction": "What's the weather like in Seattle today?",
+ "original_input": "",
+ "original_response": "I apologize, but I don't have access to real-time weather information for Seattle.",
+ "augmentation_type": "rephrase",
+ "phrasing_style": "formal",
+ "specificity_level": "precise",
+ "cleaned_instruction": "Please provide a comprehensive report on the current meteorological conditions in Seattle, Washington. I require specific details on temperature, precipitation, wind speed, and cloud cover for today's date. Kindly ensure the information is up-to-date and accurate as of the most recent weather update for the city.",
+ "cleaned_response": "I apologize, but I don't have access to real-time or up-to-date meteorological data for Seattle, Washington. As an AI language model, I'm not able to provide current information on temperature, precipitation, wind speed, or cloud cover for today's date in Seattle. For the most accurate and recent weather conditions, I recommend checking a reliable weather service or meteorological website.",
+ "conversation": {
+ "conversation_id": "conversation-485db404-87ea-47c1-8b3e-60cc771da4fc",
+ "messages": [
+ {
+ "content": "Please provide a comprehensive report on the current meteorological conditions in Seattle, Washington. I require specific details on temperature, precipitation, wind speed, and cloud cover for today's date. Kindly ensure the information is up-to-date and accurate as of the most recent weather update for the city.",
+ "role": "user"},
+ {
+ "content": "I apologize, but I don't have access to real-time or up-to-date meteorological data for Seattle, Washington. As an AI language model, I'm not able to provide current information on temperature, precipitation, wind speed, or cloud cover for today's date in Seattle. For the most accurate and recent weather conditions, I recommend checking a reliable weather service or meteorological website.",
+ "role": "assistant"}
+ ],
+ "metadata": {}
+ }
+}
```
-### 3. Instruction Following (`instruction_following.yaml`)
+
+
+### 3. Instruction Following (`instruction_following_synth.yaml`)
**Purpose**: Generate instruction-response pairs with varying complexity and domains.
@@ -52,12 +110,41 @@ oumi synth -c configs/examples/synthesis/data_augmentation.yaml
**Run with**:
```bash
-oumi synth -c configs/examples/synthesis/instruction_following.yaml
+oumi synth -c configs/examples/synthesis/instruction_following_synth.yaml
+```
+
+
+Example Output
+
+```json
+{
+ "domain": "writing",
+ "complexity": "beginner",
+ "task_format": "create",
+ "cleaned_instruction": "Write a short story of 250-300 words about a child's first day of school. Include descriptions of the child's emotions, the classroom environment, and an interaction with a new classmate.",
+ "cleaned_response": "Lily's heart raced as she clutched her mother's hand...already looking forward to tomorrow's adventures.",
+ "conversation": {
+ "conversation_id": "conversation-0670db9d-6147-4d71-bcbd-a05a7f8f8d1a",
+ "messages": [
+ {
+ "content": "Write a short story of 250-300 words about a child's first day of school. Include descriptions of the child's emotions, the classroom environment, and an interaction with a new classmate.",
+ "role": "user"
+ },
+ {
+ "content": "Lily's heart raced as she clutched her mother's hand...already looking forward to tomorrow's adventures.",
+ "role": "assistant"
+ }
+ ],
+ "metadata": {}
+ }
+}
```
-### 4. Conversation Synthesis (`conversation_synthesis.yaml`)
+
-**Purpose**: Create multi-turn conversations by chaining generated responses.
+### 4. Conversation Agent (`conversation_synth.yaml`)
+
+**Purpose**: Create multi-turn conversations by chaining generated responses. Includes examples of structured actions parseable for tool-use.
**What it does**: Generates realistic customer support conversations with different scenarios, customer personalities, and resolution outcomes.
@@ -70,10 +157,53 @@ oumi synth -c configs/examples/synthesis/instruction_following.yaml
**Run with**:
```bash
-oumi synth -c configs/examples/synthesis/conversation_synthesis.yaml
+oumi synth -c configs/examples/synthesis/conversation_synth.yaml
+```
+
+
+Example Output
+
+```json
+{
+ "scenario": "account_issue",
+ "customer_type": "concise",
+ "customer_interaction": "cooperative",
+ "cleaned_opener": "Can't log in. Need password reset ASAP. BrightTech account.",
+ "cleaned_agent_response": "\ntype: CLARIFY\nparameters: { \"reason\": \"Need email address to proceed with escalation\" }\n\nI apologize for the inconvenience you're experiencing with logging into your BrightTech account. To assist you with a password reset, I'll need to escalate this to our account specialists. Could you please provide the email address associated with your BrightTech account?",
+ "cleaned_followup": "jsmith@email.com. That's the one.",
+ "cleaned_final_response": "\ntype: ESCALATE\nparameters: { \"account_email\": \"jsmith@email.com\", \"reason\": \"Password reset request\" }\n\nThank you for providing your email address. I've escalated your password reset request to our account specialists. They will process your request as soon as possible and send instructions to jsmith@email.com on how to reset your password. Please check your email inbox (including spam/junk folders) for further instructions. Is there anything else I can help you with regarding your BrightTech account?",
+ "conversation": {
+ "conversation_id": "conversation-8658569a-8024-48e5-831e-00f0fbe150a6",
+ "messages": [
+ {
+ "content": "You are CareBot...",
+ "role": "system"
+ },
+ {
+ "content": "Can't log in. Need password reset ASAP. BrightTech account.",
+ "role": "user"
+ },
+ {
+ "content": "\ntype: CLARIFY\nparameters: { \"reason\": \"Need email address to proceed with escalation\" }\n\nI apologize for the inconvenience you're experiencing with logging into your BrightTech account. To assist you with a password reset, I'll need to escalate this to our account specialists. Could you please provide the email address associated with your BrightTech account?",
+ "role": "assistant"
+ },
+ {
+ "content": "jsmith@email.com. That's the one.",
+ "role": "user"
+ },
+ {
+ "content": "\ntype: ESCALATE\nparameters: { \"account_email\": \"jsmith@email.com\", \"reason\": \"Password reset request\" }\n\nThank you for providing your email address. I've escalated your password reset request to our account specialists. They will process your request as soon as possible and send instructions to jsmith@email.com on how to reset your password. Please check your email inbox (including spam/junk folders) for further instructions. Is there anything else I can help you with regarding your BrightTech account?",
+ "role": "assistant"
+ }
+ ],
+ "metadata": {}
+ }
+}
```
-### 5. Domain Adaptation (`domain_adaptation.yaml`)
+
+
+### 5. Domain-specific QA (`domain_qa_synth.yaml`)
**Purpose**: Generate domain-specific training data by conditioning on domain attributes.
@@ -88,9 +218,38 @@ oumi synth -c configs/examples/synthesis/conversation_synthesis.yaml
**Run with**:
```bash
-oumi synth -c configs/examples/synthesis/domain_adaptation.yaml
+oumi synth -c configs/examples/synthesis/domain_qa_synth.yaml
```
+
+Example Output
+
+```json
+{
+ "specialty": "dermatology",
+ "context_type": "prevention_advice",
+ "complexity_level": "basic",
+ "cleaned_question": "What are three important steps you can take to protect your skin from sun damage and reduce your risk of skin cancer?",
+ "cleaned_answer": "Here are three important steps you can take to protect your skin from sun damage...",
+ "conversation": {
+ "conversation_id": "conversation-ea1eccda-d7ba-4b34-86f9-52989aa11ae6",
+ "messages": [
+ {
+ "content": "What are three important steps you can take to protect your skin from sun damage and reduce your risk of skin cancer?",
+ "role": "user"
+ },
+ {
+ "content": "Here are three important steps you can take to protect your skin from sun damage...",
+ "role": "assistant"
+ }
+ ],
+ "metadata": {}
+ }
+}
+```
+
+
+
## Usage Tips
### Before Running
@@ -143,7 +302,7 @@ inference_config:
After generating synthetic data:
1. **Review the output**: Check the generated samples for quality and relevance
-2. **Use for training**: Include the dataset in your training configuration
+2. **Use for training**: Include the dataset in your training configuration (see our [training guide](../../../docs/user_guides/train/train.md) for more details)
3. **Iterate and improve**: Modify the synthesis config based on results
4. **Combine datasets**: Use multiple synthesis runs to create larger, more diverse datasets
diff --git a/configs/examples/synthesis/conversation_synth.yaml b/configs/examples/synthesis/conversation_synth.yaml
index 17ce31b2de..28488148ee 100644
--- a/configs/examples/synthesis/conversation_synth.yaml
+++ b/configs/examples/synthesis/conversation_synth.yaml
@@ -21,87 +21,126 @@
# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
-num_samples: 50
+num_samples: 10
output_path: conversation_dataset.jsonl
strategy_params:
# Define conversation scenarios and customer types
sampled_attributes:
+ # Generate the system instruction for the agent
- id: system_instruction
name: System Instruction
description: The system instruction to use for the conversation
possible_values:
- id: customer_support
- name: |
+ name: Customer Support
+ description: |
You are CareBot, a customer service assistant for BrightTech Electronics.
Your goals:
+ - Specify one of the below actions you are taking at the beginning of your response.
+ - Output the action in a structured format so the system can execute it.
+ - When writing the parameters, normalize the formatting of the product names and order IDs.
- Resolve customer issues within your capabilities.
- Use a polite, empathetic, and professional tone.
- - Always summarize the result for the customer in natural language.
- - When you need to perform an action, output it in a structured format so the system can execute it.
- If you do not have enough information to resolve the issue, ask the customer to clarify.
+ - Never suggest you have more information or capabilities than what's provided below.
### Tone Rules
- Be polite, concise, and solution-focused.
- Acknowledge frustration with empathy.
- Do not provide personal opinions or unrelated advice.
- - Never ask for personal information like name, email, phone number, etc.
+ - Never ask for personal information like name, phone number, etc.
+ - Only ask for email address if the customer requests a refund or has account issues.
### Action Rules
- When an action is required, output ONLY in the following format:
-
+ Output only one action per response in the following format:
- type: [LOOKUP_ORDER | CHECK_SHIPPING | INITIATE_RETURN | ISSUE_REFUND | OFFER_DISCOUNT | ESCALATE]
- parameters: { ... } # JSON-like key/value parameters
+ type: [CLARIFY | LOOKUP_ORDER | INITIATE_RETURN | SEARCH_PRODUCT | ESCALATE]
+ parameters: ... # JSON-like key/value parameters
After the action block, provide a natural-language response to the customer.
Do not mix the action block with free text.
+ Ensure you have all the information you need to execute the action before outputting a specialized action.
+
### Action Types
- 1. LOOKUP_ORDER
- - parameters: { "order_id": string }
- - Example: Look up an order when given an order number.
+ 1. CLARIFY
+ - parameters: { "reason": string }
+ - Example: Need more information to proceed with the request.
- 2. CHECK_SHIPPING
+ 2. LOOKUP_ORDER
- parameters: { "order_id": string }
- - Example: Check the shipping carrier’s status for the order.
+ - Example: Order info, shipping status, etc.
3. INITIATE_RETURN
- - parameters: { "order_id": string, "item_id": string, "reason": string }
+ - parameters: { "order_id": string, "reason": string }
- Example: Customer requests to return or exchange a product.
- 4. ISSUE_REFUND
- - parameters: { "order_id": string, "amount": number }
- - Rules: Refunds up to $200 only. Escalate otherwise.
-
- 5. OFFER_DISCOUNT
- - parameters: { "order_id": string, "amount_percent": number (≤10) }
- - Example: Offer up to 10% discount to resolve dissatisfaction.
+ 4. SEARCH_PRODUCT
+ - parameters: { "product_name": string }
+ - Example: Customer requests information about a product.
- 6. ESCALATE
- - parameters: { "reason": string }
- - Example: Customer requests human support, issue > $200, legal or account issues.
+ 5. ESCALATE
+ - parameters: { "account_email": string, "reason": string }
+ - Example: Customer requests human support, refund, legal or account issues.
- ### Example Responses
+ ### Example Conversations
+ Conversation 1
Customer: "Where's my order BT1234?"
- Bot:
+ Agent:
- type: CHECK_SHIPPING
+ type: LOOKUP_ORDER
parameters: { "order_id": "BT1234" }
- Let me check the shipping status for order BT1234. One moment please.
+ Let me check the order status for order BT1234. One moment please.
- Customer: "I want to return my headphones."
- Bot:
+ Conversation 2
+ Customer:
+ "I want to return my headphones."
+ Agent:
+
+ type: CLARIFY
+ parameters: { "reason": "Need order ID to proceed with return request" }
+
+ Sure, let me help you with that. What's the order ID for the headphones, and why do you want to return them?
+ Customer:
+ "I want to return my headphones because they don't fit."
+ Agent:
+
+ type: CLARIFY
+ parameters: { "reason": "Need more information to proceed with return request" }
+
+ I'm sorry to hear that. If you can provide me with the order ID, I can set up a return request for you right away.
+ Customer:
+ "Oh sure, it's 187390231847."
+ Agent:
type: INITIATE_RETURN
- parameters: { "order_id": "BT4567", "item_id": "HD998", "reason": "Product defective" }
+ parameters: { "order_id": "187390231847", "reason": "Headphones don't fit" }
+
+ Thank you! I've initiated a return request for you.
+
+ Conversation 3
+ Customer:
+ "I can't login to my account."
+ Agent:
+
+ type: CLARIFY
+ parameters: { "reason": "Need email address to proceed with escalation" }
+
+ I'm sorry to hear that. Can you please provide me with the email address associated with your account?
+ Customer:
+ "Oh sure, it's john.doe@example.com."
+ Agent:
+
+ type: ESCALATE
+ parameters: { "account_email": "john.doe@example.com", "reason": "Login issues" }
- I can help with that. I'll generate a return label for your headphones.
- description: "N/A"
+ Thank you! I've escalated this to a specialist. One moment please.
+ # Which support scenario the customer is facing
- id: scenario
name: Support Scenario
description: The type of customer support issue being addressed
@@ -109,58 +148,77 @@ strategy_params:
- id: account_issue
name: Account Problems
description: Login issues, password resets, account access problems
- - id: billing_inquiry
+ - id: order_issue
+ name: Order Problems
+ description: Order issues, shipping status, etc.
+ - id: billing_issue
name: Billing Questions
description: Questions about charges, payments, subscription management
- - id: product_question
+ - id: product_issue
name: Product Information
description: Feature questions, how-to guides, product comparisons
- - id: technical_support
+ - id: technical_issue
name: Technical Issues
description: Bug reports, troubleshooting, technical difficulties
- id: refund_request
name: Refund Requests
description: Return requests, cancellations, refund processing
+ # The customer's personality
- id: customer_type
name: Customer Personality
description: The customer's communication style and emotional state
possible_values:
+ - id: concise
+ name: Concise
+ description: Short and to the point, no fluff or unnecessary details
+ sample_rate: 0.4
- id: friendly
name: Friendly
description: Polite, patient, and appreciative
+ sample_rate: 0.05
- id: frustrated
name: Frustrated
description: Annoyed but still reasonable
+ sample_rate: 0.2
- id: confused
name: Confused
description: Uncertain and needs clear explanations
+ sample_rate: 0.1
- id: demanding
name: Demanding
description: Urgent, wants immediate resolution
+ sample_rate: 0.1
- id: curious
name: Curious
description: Asking lots of questions, wants to understand
+ sample_rate: 0.05
- id: adversarial
name: Adversarial
description: Attempting to deceive or manipulate the agent to say or do something it shouldn't do
+ sample_rate: 0.1
- - id: resolution
- name: Resolution Type
- description: How the conversation should end
+ # How the customer should interact with the agent
+ - id: customer_interaction
+ name: Customer Interaction
+ description: How the customer should interact with the agent
possible_values:
- - id: resolved
- name: Fully Resolved
- description: Issue completely solved, customer satisfied
+ - id: cooperative
+ name: Cooperative
+ description: Cooperates with the agent to resolve the issue
+ sample_rate: 0.6
- id: escalated
name: Escalated
- description: Transferred to specialist or supervisor
- - id: pending
- name: Pending
- description: Requires follow-up or additional steps
- - id: abandoned
- name: Abandoned
- description: Customer gives up and ends the conversation
+ description: Customer needs to be escalated to a specialist or supervisor
+ sample_rate: 0.2
+ - id: incomplete
+ name: Incomplete
+ description: Customer gives insufficient information to resolve the issue
+ sample_rate: 0.1
+ - id: combative
+ name: Combative
+ description: Customer is combative and refuses to cooperate with the agent
+ sample_rate: 0.1
# Generate the customer's opening message
generated_attributes:
@@ -178,11 +236,13 @@ strategy_params:
Company name: BrightTech Electronics
- Make it sound natural and realistic. Write only the customer's opening message.
+ Make it sound natural and realistic.
+ Write only the customer's opening message.
+ Include enough information to know what the customer is asking for, but not enough to be immediately actionable.
Format your response as follows:
Customer:
-
+ Place your customer's opening message here.
End Customer
Write only the customer's opening message, do not include additional filler text.
@@ -197,11 +257,26 @@ strategy_params:
instruction_messages:
- role: SYSTEM
content: |
- {system_instruction}
+ {system_instruction.description}
+
+ Respond to the customer's opening message.
+
+ Format your response as follows:
+ Response:
+
+ type: [CLARIFY | LOOKUP_ORDER | INITIATE_RETURN | SEARCH_PRODUCT | ESCALATE]
+ parameters: ... # JSON-like key/value parameters
+
+ Place your agent's response here.
+ End Response
+
+ Write only the agent's response, do not include additional filler text.
- role: USER
content: "{cleaned_opener}"
postprocessing_params:
id: cleaned_agent_response
+ cut_prefix: "Response:"
+ cut_suffix: "End Response"
strip_whitespace: true
# Generate the customer's follow-up
@@ -213,17 +288,20 @@ strategy_params:
Customer personality: {customer_type.description}
Issue type: {scenario.description}
- Resolution type: {resolution.description}
Maintain the same personality and tone as your opening message.
+ When providing an order ID, it is a sequence of 12 digits.
+ When providing a product name, it should be a realistic product name.
React naturally to the agent's response.
- Try to guide the conversation toward a {resolution} resolution.
+ Your response should maintain a similar personality to your opening message.
+ While maintaining personality, your message should be the following type:
+ {customer_interaction}: {customer_interaction.description}
Opening message: "{cleaned_opener}"
Format your response as follows:
Customer:
-
+ Place your customer's follow-up message here.
End Customer
Write only the customer's follow-up message, do not include additional filler text.
@@ -240,19 +318,24 @@ strategy_params:
instruction_messages:
- role: SYSTEM
content: |
- {system_instruction}
- - role: USER
- content: |
- Customer's original message: "{cleaned_opener}"
- Your first response: "{cleaned_agent_response}"
- Customer's follow-up: "{cleaned_followup}"
+ {system_instruction.description}
Format your response as follows:
Response:
-
+
+ type: [CLARIFY | LOOKUP_ORDER | INITIATE_RETURN | SEARCH_PRODUCT | ESCALATE]
+ parameters: ... # JSON-like key/value parameters
+
+ Place your final agent response here.
End Response
Write only the final agent response, do not include additional filler text.
+ - role: USER
+ content: "{cleaned_opener}"
+ - role: ASSISTANT
+ content: "{cleaned_agent_response}"
+ - role: USER
+ content: "{cleaned_followup}"
postprocessing_params:
id: cleaned_final_response
cut_prefix: "Response:"
@@ -267,7 +350,7 @@ strategy_params:
chat_transform:
messages:
- role: SYSTEM
- content: "{system_instruction}"
+ content: "{system_instruction.description}"
- role: USER
content: "{cleaned_opener}"
- role: ASSISTANT
@@ -277,18 +360,19 @@ strategy_params:
- role: ASSISTANT
content: "{cleaned_final_response}"
- # Include relevant attributes in output
+ # Include only relevant attributes in output
passthrough_attributes:
- conversation
- scenario
- customer_type
+ - customer_interaction
- resolution
- cleaned_opener
- cleaned_agent_response
- cleaned_followup
- cleaned_final_response
-# Model configuration optimized for conversation generation
+# Example inference configuration
inference_config:
model:
model_name: claude-3-5-sonnet-20240620
diff --git a/configs/examples/synthesis/data_augmentation_synth.yaml b/configs/examples/synthesis/data_augmentation_synth.yaml
index ace4c6bdd4..f079cbbd3a 100644
--- a/configs/examples/synthesis/data_augmentation_synth.yaml
+++ b/configs/examples/synthesis/data_augmentation_synth.yaml
@@ -21,13 +21,14 @@
# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
-num_samples: 50
+num_samples: 10
output_path: augmented_qa_dataset.jsonl
strategy_params:
# Input existing data to augment
input_data:
- path: "data/dataset_examples/alpaca_format.jsonl"
+ # Map the current attributes to the new attributes used in this config
attribute_map:
instruction: original_instruction
input: original_input
@@ -35,6 +36,7 @@ strategy_params:
# Define augmentation approaches
sampled_attributes:
+ # Whether to rephrase the same task or create a related task
- id: augmentation_type
name: Augmentation Type
description: Whether to rephrase the same task or create a related task
@@ -48,6 +50,7 @@ strategy_params:
description: Create similar but different tasks in the same domain
sample_rate: 0.7 # 70% will be related tasks
+ # Different ways to phrase the same request
- id: phrasing_style
name: Instruction Phrasing Style
description: Different ways to phrase the same request
@@ -69,6 +72,7 @@ strategy_params:
description: Instructions that include background or context (e.g., "I'm working on X and need...")
sample_rate: 0.25
+ # How detailed and specific the instruction should be
- id: specificity_level
name: Instruction Specificity
description: How detailed and specific the instruction should be
@@ -85,6 +89,7 @@ strategy_params:
# Generate instruction variations or related tasks
generated_attributes:
+ # Generate the augmented instruction
- id: augmented_instruction
instruction_messages:
- role: SYSTEM
@@ -121,6 +126,7 @@ strategy_params:
cut_suffix: "End Instruction"
strip_whitespace: true
+ # Generate the augmented response
- id: augmented_response
instruction_messages:
- role: SYSTEM
@@ -155,7 +161,7 @@ strategy_params:
cut_suffix: "End Response"
strip_whitespace: true
- # Create variations by sampling different combinations
+ # Specify particular combinations and how often to sample them
combination_sampling:
- combination:
augmentation_type: rephrase
@@ -190,7 +196,7 @@ strategy_params:
- role: ASSISTANT
content: "{cleaned_response}"
- # Output the augmented data
+ # Include only relevant attributes in output
passthrough_attributes:
- conversation
- cleaned_instruction
@@ -202,7 +208,7 @@ strategy_params:
- original_input
- original_response
-# Configure the AI model
+# Example inference configuration
inference_config:
model:
model_name: claude-3-5-sonnet-20240620
diff --git a/configs/examples/synthesis/domain_qa_synth.yaml b/configs/examples/synthesis/domain_qa_synth.yaml
index 2645f6326e..73c785c957 100644
--- a/configs/examples/synthesis/domain_qa_synth.yaml
+++ b/configs/examples/synthesis/domain_qa_synth.yaml
@@ -21,7 +21,7 @@
# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
-num_samples: 50
+num_samples: 10
output_path: domain_qa.jsonl
strategy_params:
@@ -52,6 +52,7 @@ strategy_params:
# Define medical specialties and contexts
sampled_attributes:
+ # The specific area of medicine being addressed
- id: specialty
name: Medical Specialty
description: The specific area of medicine being addressed
@@ -75,6 +76,7 @@ strategy_params:
name: Endocrinology
description: Hormonal and metabolic disorders
+ # The setting or purpose of the medical information
- id: context_type
name: Context Type
description: The setting or purpose of the medical information
@@ -95,6 +97,7 @@ strategy_params:
name: Parent Guidance
description: Information for parents about children's health
+ # The technical depth of medical information
- id: complexity_level
name: Medical Complexity
description: The technical depth of medical information
@@ -114,6 +117,7 @@ strategy_params:
# Generate domain-specific medical questions
generated_attributes:
+ # Generate the medical question
- id: medical_question
instruction_messages:
- role: SYSTEM
@@ -153,6 +157,7 @@ strategy_params:
cut_suffix: "End Question"
strip_whitespace: true
+ # Generate the medical answer
- id: medical_answer
instruction_messages:
- role: SYSTEM
@@ -189,7 +194,7 @@ strategy_params:
- role: ASSISTANT
content: "{cleaned_answer}"
- # Include all domain-specific attributes
+ # Include only relevant attributes in output
passthrough_attributes:
- conversation
- cleaned_question
@@ -198,7 +203,7 @@ strategy_params:
- context_type
- complexity_level
-# Configure model for medical domain generation
+# Example inference configuration
inference_config:
model:
model_name: claude-3-5-sonnet-20240620
diff --git a/configs/examples/synthesis/instruction_following_synth.yaml b/configs/examples/synthesis/instruction_following_synth.yaml
index a904dfe0f4..c4c96774bb 100644
--- a/configs/examples/synthesis/instruction_following_synth.yaml
+++ b/configs/examples/synthesis/instruction_following_synth.yaml
@@ -21,7 +21,7 @@
# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
-num_samples: 50
+num_samples: 10
output_path: instruction_following_dataset.jsonl
strategy_params:
@@ -52,6 +52,7 @@ strategy_params:
# Define the domain and complexity attributes
sampled_attributes:
+ # The subject area or field for the instruction
- id: domain
name: Task Domain
description: The subject area or field for the instruction
@@ -75,6 +76,7 @@ strategy_params:
name: Business
description: Business strategy, marketing, management tasks
+ # How challenging the instruction should be
- id: complexity
name: Task Complexity
description: How challenging the instruction should be
@@ -92,6 +94,7 @@ strategy_params:
description: Complex tasks requiring deep knowledge and skills
sample_rate: 0.4
+ # The type of task being requested
- id: task_format
name: Task Format
description: The type of task being requested
@@ -114,6 +117,7 @@ strategy_params:
# Generate diverse instructions
generated_attributes:
+ # Generate the instruction
- id: instruction
instruction_messages:
- role: SYSTEM
@@ -153,6 +157,7 @@ strategy_params:
cut_suffix: "End Instruction"
strip_whitespace: true
+ # Generate the response
- id: response
instruction_messages:
- role: SYSTEM
@@ -170,7 +175,7 @@ strategy_params:
id: cleaned_response
strip_whitespace: true
- # Ensure good distribution of challenging combinations
+ # Specify particular combinations and how often to sample them
combination_sampling:
- combination:
domain: coding
@@ -188,7 +193,7 @@ strategy_params:
task_format: create
sample_rate: 0.1 # 10% beginner creative writing
- # Transform into conversation format
+ # Convert the instruction-response pairs into a conversation format
transformed_attributes:
- id: conversation
transformation_strategy:
@@ -200,7 +205,7 @@ strategy_params:
- role: ASSISTANT
content: "{cleaned_response}"
- # Include all relevant attributes in output
+ # Include only relevant attributes in output
passthrough_attributes:
- conversation
- cleaned_instruction
@@ -209,7 +214,7 @@ strategy_params:
- complexity
- task_format
-# Model configuration for diverse instruction generation
+# Example inference configuration
inference_config:
model:
model_name: claude-3-5-sonnet-20240620
diff --git a/configs/examples/synthesis/question_answer_synth.yaml b/configs/examples/synthesis/question_answer_synth.yaml
index 7401104509..e02ca465f9 100644
--- a/configs/examples/synthesis/question_answer_synth.yaml
+++ b/configs/examples/synthesis/question_answer_synth.yaml
@@ -21,7 +21,7 @@
# - Other synthesis configs: configs/**/*synth.yaml
strategy: GENERAL
-num_samples: 50
+num_samples: 10
output_path: geography_qa_dataset.jsonl
strategy_params:
@@ -43,6 +43,7 @@ strategy_params:
# Define attributes that will vary across generated examples
sampled_attributes:
+ # How challenging the geography question should be
- id: difficulty
name: Difficulty Level
description: How challenging the geography question should be
@@ -60,6 +61,7 @@ strategy_params:
description: Advanced geography knowledge for experts
sample_rate: 0.2 # 20% of questions will be hard
+ # The specific area of geography to focus on
- id: topic
name: Geography Topic
description: The specific area of geography to focus on
@@ -77,8 +79,8 @@ strategy_params:
name: Climate and Weather
description: Questions about climate zones and weather patterns
- # Generate the question using LLM
generated_attributes:
+ # Generate the question
- id: question
instruction_messages:
- role: SYSTEM
@@ -116,7 +118,7 @@ strategy_params:
cut_suffix: "End Question"
strip_whitespace: true
- # Generate the answer using LLM
+ # Generate the answer
- id: answer
instruction_messages:
- role: SYSTEM
@@ -152,6 +154,7 @@ strategy_params:
strip_whitespace: true
transformed_attributes:
+ # Convert the question-answer pairs into a conversation format
- id: conversation
transformation_strategy:
type: CHAT
@@ -162,7 +165,7 @@ strategy_params:
- role: ASSISTANT
content: "{cleaned_answer}"
- # Specify which attributes to include in the final output
+ # Include only relevant attributes in output
passthrough_attributes:
- conversation
- cleaned_question
@@ -170,7 +173,7 @@ strategy_params:
- difficulty
- topic
-# Configure the AI model for generation
+# Example inference configuration
inference_config:
model:
model_name: claude-3-5-sonnet-20240620