# Install required packages first (in Colab) so the imports below succeed
!pip install -q tensorflow-hub
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q librosa
!pip install -q diffusers transformers accelerate

# Import after installation
import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import tensorflow_hub as hub
import torch
import whisper
from PIL import Image
from diffusers import (
    StableDiffusionXLPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionUpscalePipeline,
)
# Mount Google Drive if in Colab
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # ENTER AUDIO PATH =======================================
    AUDIO_FILE_PATH = "/content/drive/MyDrive/trump_gets_laughed_at.mp3"
    # ========================================================
    OUTPUT_DIR = "/content/drive/MyDrive/AI_Generated_Images"
else:
    AUDIO_FILE_PATH = "path/to/your/audio.wav"  # Replace with local path
    OUTPUT_DIR = "./AI_Generated_Images"
# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Configure GPU properly
print("Configuring GPU...")
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU configured successfully: {len(physical_devices)} GPU(s) found")
        device = "cuda"
        torch_dtype = torch.float16
    except Exception as e:
        print(f"Error configuring GPU: {e}")
        device = "cpu"
        torch_dtype = torch.float32
else:
    print("No GPU found. Running on CPU.")
    device = "cpu"
    torch_dtype = torch.float32
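# Note: TensorFlow (YAMNet) and PyTorch (Whisper, Stable Diffusion) manage
# GPU memory independently; memory growth keeps TensorFlow from grabbing the
# whole GPU up front, while `device` and `torch_dtype` steer the PyTorch
# models loaded below.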
# Function to download YAMNet class map
def get_yamnet_class_map():
    class_map_path = 'yamnet_class_map.csv'
    if not os.path.exists(class_map_path):
        print("Downloading YAMNet class map...")
        import urllib.request
        url = ('https://raw.githubusercontent.com/tensorflow/models/master/'
               'research/audioset/yamnet/yamnet_class_map.csv')
        urllib.request.urlretrieve(url, class_map_path)
    return class_map_path
# Load YAMNet model with error handling
print("Loading YAMNet model...")
try:
    # Force CPU execution for YAMNet, as it seems to have issues on the GPU
    with tf.device('/cpu:0'):
        yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
    print("YAMNet model loaded successfully on CPU")
except Exception as e:
    print(f"Error loading YAMNet model: {e}")
    yamnet_model = None
def load_yamnet_class_names():
    class_names_path = get_yamnet_class_map()
    # The class map CSV has three columns (index, mid, display_name);
    # column 2 holds the human-readable class name.
    df = pd.read_csv(class_names_path, header=0)
    return df.iloc[:, 2].tolist()
def classify_environmental_sounds(audio_path, threshold=0.15):
    try:
        print(f"Loading audio file: {audio_path}")
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {len(waveform)} samples, {sr}Hz")
        # Ensure the waveform is the right shape and type
        waveform = waveform.astype(np.float32)
        # Process in smaller chunks to avoid memory issues
        chunk_size = 5 * sr  # 5-second chunks
        all_scores = []
        num_chunks = (len(waveform) + chunk_size - 1) // chunk_size
        # Process each chunk separately
        for i in range(0, len(waveform), chunk_size):
            chunk = waveform[i:i + chunk_size]
            if len(chunk) < sr:  # Skip chunks shorter than 1 second
                continue
            chunk_tensor = tf.convert_to_tensor(chunk, dtype=tf.float32)
            print(f"Processing chunk {i // chunk_size + 1}/{num_chunks}")
            try:
                # Clear previous session memory
                tf.keras.backend.clear_session()
                # Process the chunk with YAMNet on CPU
                with tf.device('/cpu:0'):
                    scores, embeddings, spectrogram = yamnet_model(chunk_tensor)
                all_scores.append(scores)
            except Exception as chunk_err:
                print(f"Error processing chunk: {chunk_err}")
                # Continue with the next chunk
                continue
        if not all_scores:
            print("No valid audio chunks processed")
            return []
        # Combine scores from all chunks
        combined_scores = tf.concat(all_scores, axis=0)
        mean_scores = tf.reduce_mean(combined_scores, axis=0).numpy()
        class_names = load_yamnet_class_names()
        detected_classes = [
            (class_names[i], float(mean_scores[i]))
            for i in np.where(mean_scores > threshold)[0]
            if i < len(class_names)
        ]
        return sorted(detected_classes, key=lambda x: x[1], reverse=True)
    except Exception as e:
        print(f"Error in classify_environmental_sounds: {e}")
        import traceback
        traceback.print_exc()
        return []
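# Example usage of the classifier (a minimal sketch; "sample.wav" is a
# hypothetical local file, not part of this pipeline):
#
#   sounds = classify_environmental_sounds("sample.wav", threshold=0.2)
#   for name, score in sounds[:5]:
#       print(f"{name}: {score:.3f}")
#
# Each entry pairs a YAMNet class name with its mean score across chunks,
# sorted from most to least confident.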
# Load Whisper model
print("Loading Whisper model...")
try:
    # Continue using the GPU for Whisper, since it works correctly there
    whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
    whisper_model = whisper.load_model("base", device=whisper_device)
    print(f"Whisper model loaded successfully on {whisper_device}")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    whisper_model = None
def transcribe_with_validation(audio_path, min_confidence=0.5, min_words=3):
    try:
        if whisper_model is None:
            return {'text': '', 'confidence': 0, 'is_valid': False}
        print(f"Transcribing audio: {audio_path}")
        result = whisper_model.transcribe(audio_path)
        transcription = result.get("text", "").strip()
        segments = result.get("segments", [])
        # Whisper segments report 'avg_logprob' rather than a direct
        # confidence score; exp(avg_logprob) serves as a rough per-segment
        # confidence proxy. With no segments at all, treat confidence as 0.
        confidences = [
            float(np.exp(seg.get('avg_logprob', -0.35))) for seg in segments
        ]
        avg_confidence = float(np.mean(confidences)) if confidences else 0.0
        is_valid = (
            avg_confidence >= min_confidence and
            len(transcription.split()) >= min_words and
            any(c.isalpha() for c in transcription)
        )
        print(f"Transcription: '{transcription}'")
        print(f"Confidence: {avg_confidence:.2f}, Valid: {is_valid}")
        return {
            'text': transcription,
            'confidence': avg_confidence,
            'is_valid': is_valid
        }
    except Exception as e:
        print(f"Error in transcribe_with_validation: {e}")
        import traceback
        traceback.print_exc()
        return {'text': '', 'confidence': 0, 'is_valid': False}
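# Example usage of the transcriber (a sketch; "sample.wav" is hypothetical):
#
#   stt = transcribe_with_validation("sample.wav", min_confidence=0.5)
#   if stt['is_valid']:
#       print(f"Heard: {stt['text']} (confidence {stt['confidence']:.2f})")
#
# The validation gate keeps short, low-confidence, or non-alphabetic
# transcriptions from steering the image prompt.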
# ENHANCED: More dynamic and contextual prompt creation
def create_image_prompt(audio_path, env_threshold=0.15):
    """
    Creates a context-aware image prompt based on audio analysis
    that adapts to the specific audio content detected.
    """
    try:
        # Process the audio with both YAMNet and Whisper
        env_results = []
        env_labels = []
        env_types = set()
        if yamnet_model is not None:
            print("Classifying environmental sounds...")
            env_results = classify_environmental_sounds(
                audio_path, threshold=env_threshold
            )
            env_labels = [label for label, score in env_results[:10]]  # Top 10 labels
            print(f"Top detected sounds: {env_labels}")
        # Categorize the environment for better context understanding
        weather_sounds = {'Rain', 'Thunder', 'Wind', 'Thunderstorm', 'Storm'}
        nature_sounds = {'Water', 'Stream', 'River', 'Ocean', 'Waves',
                         'Forest', 'Birds', 'Animals'}
        indoor_sounds = {'Inside, small room', 'Speech', 'Conversation',
                         'Music', 'Keyboard', 'Computer', 'Typing'}
        urban_sounds = {'Traffic', 'Vehicle', 'Car', 'Engine', 'Urban',
                        'City', 'Street'}
        crowd_sounds = {'Crowd', 'Applause', 'Cheering', 'Speech',
                        'Conference', 'Meeting'}
        # Identify which environment types are present
        for label in env_labels:
            if any(sound.lower() in label.lower() for sound in weather_sounds):
                env_types.add('weather')
            if any(sound.lower() in label.lower() for sound in nature_sounds):
                env_types.add('nature')
            if any(sound.lower() in label.lower() for sound in indoor_sounds):
                env_types.add('indoor')
            if any(sound.lower() in label.lower() for sound in urban_sounds):
                env_types.add('urban')
            if any(sound.lower() in label.lower() for sound in crowd_sounds):
                env_types.add('crowd')
        print("Transcribing speech...")
        stt_result = transcribe_with_validation(audio_path)
        # Build a context-aware prompt based on the detected content;
        # this dynamic prompt construction is the key enhancement.
        if stt_result['is_valid'] and len(stt_result['text']) > 10:
            # Speech is prominent - build a speech-focused scene
            # with environmental context
            speech_text = stt_result['text']
            speech_topic = extract_topic(speech_text)
            speech_emotion = analyze_speech_emotion(speech_text)
            # Determine the speech context (formal/informal, etc.)
            speech_context = determine_speech_context(
                env_types, env_labels, speech_text
            )
            # Build the prompt around the speech with appropriate context
            prompt = (f"A {speech_context} scene with a person speaking "
                      f"about {speech_topic}")
            # Add emotional context if detected
            if speech_emotion:
                prompt += f" with {speech_emotion} expression"
            # Add environment context if available
            if env_types:
                environment = get_environment_description(env_types, env_labels)
                prompt += f". {environment}"
        elif env_types:
            # No clear speech - focus on environmental sounds
            primary_env = determine_primary_environment(env_types, env_labels)
            # Build a rich environmental scene based on detected sounds
            prompt = primary_env
            # Add weather details if detected
            weather_details = extract_weather_details(env_labels)
            if weather_details:
                prompt += f" {weather_details}"
            # Add activity details if detected
            activity_details = extract_activity_details(env_labels)
            if activity_details:
                prompt += f" {activity_details}"
        else:
            # Fallback for when no clear context is detected
            prompt = ("A realistic environmental scene with natural "
                      "lighting and atmosphere")
        # Add quality enhancers appropriate for the type of scene
        has_person = 'person' in prompt.lower() or 'speaking' in prompt.lower()
        prompt = enhance_prompt_quality(prompt, env_types, has_person)
        return prompt
    except Exception as e:
        print(f"Error in create_image_prompt: {e}")
        import traceback
        traceback.print_exc()
        return "Realistic natural environment scene"
def extract_topic(text):
    """Extract the main topic from speech text"""
    # Simplified topic extraction - in production you might use NLP
    if len(text) < 20:
        return text.strip()
    # Basic topic extraction: take the first sentence or phrase
    first_sentence = text.split('.')[0].strip()
    if len(first_sentence) > 50:
        return first_sentence[:50] + "..."
    return first_sentence
def analyze_speech_emotion(text):
    """Detect emotional tone in speech text"""
    # Simple keyword-based emotion detection
    positive_words = ['happy', 'excited', 'glad', 'wonderful', 'great',
                      'amazing', 'joy']
    negative_words = ['sad', 'angry', 'upset', 'terrible', 'awful',
                      'worried', 'concerned']
    neutral_words = ['explain', 'inform', 'tell', 'describe', 'discuss']
    text_lower = text.lower()
    # Count emotion words
    positive_count = sum(1 for word in positive_words if word in text_lower)
    negative_count = sum(1 for word in negative_words if word in text_lower)
    neutral_count = sum(1 for word in neutral_words if word in text_lower)
    # Determine the dominant emotion
    if positive_count > negative_count and positive_count > neutral_count:
        return "positive"
    elif negative_count > positive_count and negative_count > neutral_count:
        return "serious or concerned"
    elif neutral_count > 0:
        return "informative"
    # Default to neutral if no clear emotion is detected
    return ""
def determine_speech_context(env_types, env_labels, speech_text):
    """Determine the context of speech based on environment and content"""
    # Check for formal settings
    formal_indicators = ['conference', 'meeting', 'lecture', 'presentation',
                         'speech']
    formal_env = any(indicator in ' '.join(env_labels).lower()
                     for indicator in formal_indicators)
    # Check for formal language in the speech itself
    formal_speech = any(term in speech_text.lower() for term in
                        ['ladies and gentlemen', 'thank you for',
                         'i am pleased to', 'in conclusion',
                         'our company', 'organization'])
    # Check for casual settings
    casual_indicators = ['music', 'party', 'conversation', 'chat', 'laugh']
    casual_env = any(indicator in ' '.join(env_labels).lower()
                     for indicator in casual_indicators)
    # Determine the context
    if (formal_env or formal_speech) and not casual_env:
        return "formal presentation"
    elif casual_env:
        return "casual conversation"
    elif 'indoor' in env_types:
        return "indoor discussion"
    elif 'crowd' in env_types:
        return "public address"
    else:
        return "realistic speaking"
def determine_primary_environment(env_types, env_labels):
    """Determine the primary environment type based on detected sounds"""
    env_priorities = ['weather', 'nature', 'urban', 'crowd', 'indoor']
    # Check environment types in priority order
    for env in env_priorities:
        if env in env_types:
            if env == 'weather':
                return "A dramatic weather scene with atmospheric conditions"
            elif env == 'nature':
                return "A beautiful natural landscape with organic elements"
            elif env == 'urban':
                return "A detailed urban cityscape with architectural elements"
            elif env == 'crowd':
                return "A vibrant scene with a gathering of people"
            elif env == 'indoor':
                return "A detailed interior space with ambient lighting"
    # Default environment if no clear type is determined
    return "A realistic environmental scene"
def get_environment_description(env_types, env_labels):
    """Generate a rich environment description based on detected types
    and labels"""
    descriptions = []
    # Add specific descriptions based on environment types
    if 'weather' in env_types:
        weather_terms = [label for label in env_labels if label.lower() in
                         ['rain', 'thunder', 'wind', 'storm', 'lightning']]
        if weather_terms:
            descriptions.append(
                f"with {' and '.join(weather_terms).lower()} visible"
            )
    if 'nature' in env_types:
        nature_desc = "in a natural setting"
        water_terms = [label for label in env_labels if label.lower() in
                       ['water', 'river', 'stream', 'ocean', 'waves']]
        if water_terms:
            nature_desc += f" with {water_terms[0].lower()}"
        descriptions.append(nature_desc)
    if 'indoor' in env_types:
        indoor_desc = "in an indoor space"
        if "Inside, small room" in env_labels:
            indoor_desc = "in a small room with intimate lighting"
        elif "Inside, large room" in env_labels:
            indoor_desc = "in a large hall with spacious architecture"
        descriptions.append(indoor_desc)
    if 'urban' in env_types:
        descriptions.append("in an urban environment with city elements")
    if 'crowd' in env_types:
        descriptions.append("with a crowd of attentive people")
    # Combine the descriptions
    if descriptions:
        return " ".join(descriptions)
    else:
        return "in a detailed environment"
def extract_weather_details(env_labels):
    """Extract weather details from environment labels"""
    weather_details = []
    # Look for specific weather conditions
    if any('rain' in label.lower() for label in env_labels):
        weather_details.append("rain falling")
    if any('thunder' in label.lower() for label in env_labels):
        weather_details.append("thunderclouds")
    if any('wind' in label.lower() for label in env_labels):
        weather_details.append("visible wind effects")
    if any('storm' in label.lower() for label in env_labels):
        weather_details.append("a storm in progress")
    # Combine the details ("with" is prepended once here)
    if weather_details:
        return "with " + ", ".join(weather_details)
    return ""
def extract_activity_details(env_labels):
    """Extract activity details from environment labels"""
    activities = []
    # Look for human activities
    if any(label.lower() in ['speech', 'speaking', 'talk']
           for label in env_labels):
        activities.append("people engaged in conversation")
    if any(label.lower() in ['keyboard', 'typing', 'computer']
           for label in env_labels):
        activities.append("someone working on a computer")
    if any(label.lower() in ['music', 'singing', 'instrument']
           for label in env_labels):
        activities.append("music being played")
    # Combine the activities ("showing" is prepended once here)
    if activities:
        return "showing " + ", ".join(activities)
    return ""
def enhance_prompt_quality(prompt, env_types, has_person):
    """Add appropriate quality enhancers to the prompt based on content"""
    # Base quality enhancers
    quality_base = "highly detailed, sharp focus, professional photography"
    # Add environment-specific quality enhancers
    if 'weather' in env_types:
        prompt += f", {quality_base}, volumetric lighting, atmospheric conditions"
    elif 'nature' in env_types:
        prompt += (f", {quality_base}, natural lighting, organic textures, "
                   "atmospheric perspective")
    elif 'urban' in env_types:
        prompt += (f", {quality_base}, urban textures, architectural details, "
                   "realistic lighting")
    elif 'indoor' in env_types:
        prompt += (f", {quality_base}, interior lighting, ambient occlusion, "
                   "realistic textures")
    else:
        prompt += f", {quality_base}, realistic lighting"
    # Add person-specific quality enhancers if needed
    if has_person:
        prompt += (", detailed facial features, realistic expression, "
                   "natural posture")
    return prompt
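# Example (hypothetical inputs):
#   enhance_prompt_quality("A beautiful natural landscape", {'nature'}, False)
#   -> "A beautiful natural landscape, highly detailed, sharp focus,
#       professional photography, natural lighting, organic textures,
#       atmospheric perspective"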
# Image Generator class
class ImageGenerator:
    def __init__(self):
        self.base_model_loaded = False
        self.upscaler_loaded = False
        self.base_model = None
        self.upscaler = None

    def load_base_model(self):
        """Load the base Stable Diffusion XL model"""
        if not self.base_model_loaded:
            print("Loading Stable Diffusion XL model...")
            try:
                # Use SDXL for high-quality images
                self.base_model = StableDiffusionXLPipeline.from_pretrained(
                    "stabilityai/stable-diffusion-xl-base-1.0",
                    torch_dtype=torch_dtype,
                    variant="fp16",
                    use_safetensors=True
                )
                # Optimize for speed and memory
                self.base_model.scheduler = DPMSolverMultistepScheduler.from_config(
                    self.base_model.scheduler.config,
                    algorithm_type="sde-dpmsolver++",
                    use_karras_sigmas=True
                )
                # Enable memory optimizations. enable_model_cpu_offload()
                # manages device placement itself, so only move the pipeline
                # manually when not offloading.
                self.base_model.enable_attention_slicing()
                if torch.cuda.is_available():
                    self.base_model.enable_model_cpu_offload()
                else:
                    self.base_model = self.base_model.to(device)
                self.base_model_loaded = True
                print("Base model loaded successfully")
            except Exception as e:
                print(f"Error loading base model: {e}")
                raise
        return self.base_model
    def load_upscaler(self):
        """Load the upscaler model for higher resolution"""
        if not self.upscaler_loaded:
            print("Loading upscaler model...")
            try:
                self.upscaler = StableDiffusionUpscalePipeline.from_pretrained(
                    "stabilityai/stable-diffusion-x4-upscaler",
                    torch_dtype=torch_dtype
                )
                # As above, let CPU offload manage device placement when
                # CUDA is available; otherwise move the pipeline explicitly.
                if torch.cuda.is_available():
                    self.upscaler.enable_model_cpu_offload()
                else:
                    self.upscaler = self.upscaler.to(device)
                self.upscaler_loaded = True
                print("Upscaler model loaded successfully")
            except Exception as e:
                print(f"Error loading upscaler: {e}")
                # Continue without the upscaler
        return self.upscaler
    def generate_image(self, prompt, negative_prompt=None, guidance_scale=7.5,
                       steps=30, width=1024, height=1024, upscale=True,
                       enhance_faces=True, num_images=1):
        """Generate high-quality images with environmental context awareness"""
        # Load the base model if not already loaded
        if not self.base_model_loaded:
            self.load_base_model()
        if negative_prompt is None:
            negative_prompt = (
                "deformed, bad anatomy, disfigured, poorly drawn face, "
                "mutation, mutated, extra limbs, ugly, poorly drawn hands, "
                "missing limbs, blurry, grainy, watermark, signature, "
                "cut off, low-res"
            )
        # Generate images with the base model
        print(f"Generating {num_images} image(s) with prompt: {prompt}")
        images = self.base_model(
            prompt=prompt,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            width=width,
            height=height,
            num_images_per_prompt=num_images
        ).images
        # Process each generated image
        processed_images = []
        for idx, img in enumerate(images):
            print(f"Processing image {idx + 1}/{len(images)}...")
            # Upscale if requested; _upscale_image loads the upscaler lazily
            if upscale:
                img = self._upscale_image(img)
            processed_images.append(img)
            # Save the image
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{OUTPUT_DIR}/image_{timestamp}_{idx}.png"
            img.save(filename)
            print(f"Saved image to {filename}")
        return processed_images

    def _upscale_image(self, image):
        """Upscale an image to a higher resolution"""
        try:
            if not self.upscaler_loaded:
                self.load_upscaler()
            if self.upscaler_loaded:
                # Resize to match the upscaler's expected input
                low_res_img = image.resize((512, 512))
                upscaled = self.upscaler(
                    prompt="high quality, detailed, sharp focus",
                    image=low_res_img,
                    num_inference_steps=20
                ).images[0]
                return upscaled
        except Exception as e:
            print(f"Error during upscaling: {e}")
        # Return the original image if upscaling failed or is unavailable
        return image
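# Standalone usage of the generator (a sketch; the prompt here is made up
# and bypasses the audio-analysis stage entirely):
#
#   gen = ImageGenerator()
#   imgs = gen.generate_image(
#       prompt="A dramatic weather scene with rain falling, highly detailed",
#       steps=30,
#       upscale=False,   # skip the x4 upscaler to save VRAM
#       num_images=1,
#   )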
# Complete pipeline function
def audio_to_image(audio_file_path, num_images=1):
    """Complete pipeline from audio file to images"""
    # Process the audio to generate a contextual prompt
    prompt = create_image_prompt(audio_file_path)
    print(f"\nGenerated prompt: {prompt}")
    # Initialize the image generator
    generator = ImageGenerator()
    # Determine appropriate generation parameters based on prompt content
    prompt_lower = prompt.lower()
    has_face = ("person" in prompt_lower or "people" in prompt_lower or
                "speaking" in prompt_lower)
    is_weather = ("weather" in prompt_lower or "rain" in prompt_lower or
                  "storm" in prompt_lower)
    is_nature = ("nature" in prompt_lower or "landscape" in prompt_lower or
                 "forest" in prompt_lower)
    # Adjust the guidance scale based on content
    guidance_scale = 8.0 if has_face else 7.5
    if is_weather or is_nature:
        guidance_scale = 7.0  # Lower for natural scenes
    # Adjust steps based on complexity
    steps = 35 if has_face else 30
    if is_weather:
        steps = 40  # More steps for complex weather
    # Generate the images
    images = generator.generate_image(
        prompt=prompt,
        guidance_scale=guidance_scale,
        steps=steps,
        width=1024,
        height=1024,
        upscale=True,
        enhance_faces=has_face,
        num_images=num_images
    )
    return images, prompt
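# The main block below runs the pipeline once on AUDIO_FILE_PATH; for a
# batch, something like this sketch would work (paths are hypothetical):
#
#   for path in ["clip1.wav", "clip2.wav"]:
#       if os.path.exists(path):
#           audio_to_image(path, num_images=2)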
# Main execution
if __name__ == "__main__":
print(f"Using audio file: {AUDIO_FILE_PATH}")
if not os.path.exists(AUDIO_FILE_PATH):
print(f"ERROR: File not found at {AUDIO_FILE_PATH}")
else:
print("\nProcessing audio to generate images...")
images, prompt = audio_to_image(AUDIO_FILE_PATH, num_images=1)
print("\n=== AUDIO ANALYSIS AND IMAGE GENERATION COMPLETE ===")
print(f"Prompt: {prompt}")
print(f"Generated {len(images)} images in {OUTPUT_DIR}")
print("====================================================")