ATI.ipynb

This notebook contains a Python script that uses TensorFlow, Whisper, and Librosa to analyze audio files for environmental sounds and speech. It classifies sounds with the YAMNet model, transcribes speech with Whisper, builds context-aware image prompts from the audio content, and renders those prompts with Stable Diffusion XL. The script runs in both Google Colab and local environments, with optional GPU configuration for better performance.
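
For orientation, the whole pipeline is driven through the audio_to_image() function defined near the end of the script; a minimal call, once the install and model-loading cells above it have run, looks roughly like this (the file path is a hypothetical example):

images, prompt = audio_to_image("/content/drive/MyDrive/example_clip.wav", num_images=2)
print(prompt)                  # context-aware prompt built from the audio
print(len(images), "image(s) saved to", OUTPUT_DIR)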

import tensorflow as tf

import numpy as np
import pandas as pd
import librosa
import os
import sys
import torch
from PIL import Image
from datetime import datetime

# Install required packages


!pip install -q tensorflow-hub
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q librosa
!pip install -q diffusers transformers accelerate
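
# Outside a notebook, the same dependencies can be installed from a shell
# (equivalent commands, assuming pip targets the environment running this script):
#   pip install tensorflow-hub librosa diffusers transformers accelerate
#   pip install git+https://github.com/openai/whisper.git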

# Import after installation


import tensorflow_hub as hub
import whisper
from diffusers import (
    StableDiffusionXLPipeline,
    DPMSolverMultistepScheduler,
    StableDiffusionUpscalePipeline
)

# Mount Google Drive if in Colab


IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')
    # ENTER AUDIO PATH =======================================
    AUDIO_FILE_PATH = "/content/drive/MyDrive/trump_gets_laughed_at.mp3"
    # ========================================================
    OUTPUT_DIR = "/content/drive/MyDrive/AI_Generated_Images"
else:
    AUDIO_FILE_PATH = "path/to/your/audio.wav"  # Replace with local path
    OUTPUT_DIR = "./AI_Generated_Images"

# Create output directory if it doesn't exist


os.makedirs(OUTPUT_DIR, exist_ok=True)

# Configure GPU properly


print("Configuring GPU...")
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU configured successfully: {len(physical_devices)} GPU(s) found")
        device = "cuda"
        torch_dtype = torch.float16
    except Exception as e:
        print(f"Error configuring GPU: {e}")
        device = "cpu"
        torch_dtype = torch.float32
else:
    print("No GPU found. Running on CPU.")
    device = "cpu"
    torch_dtype = torch.float32
# Function to download YAMNet class map
def get_yamnet_class_map():
    class_map_path = 'yamnet_class_map.csv'
    if not os.path.exists(class_map_path):
        print("Downloading YAMNet class map...")
        import urllib.request
        url = ('https://raw.githubusercontent.com/tensorflow/models/master/'
               'research/audioset/yamnet/yamnet_class_map.csv')
        urllib.request.urlretrieve(url, class_map_path)
    return class_map_path
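
# For reference: yamnet_class_map.csv has three columns (index, mid, display_name);
# a row looks like  0,/m/09x0r,Speech  and load_yamnet_class_names() below reads the
# display_name column so score indices can be mapped to readable labels.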

# Load YAMNet model with error handling


print("Loading YAMNet model...")
try:
    # Force CPU execution for YAMNet as it seems to have issues with the GPU
    with tf.device('/cpu:0'):
        yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
    print("YAMNet model loaded successfully on CPU")
except Exception as e:
    print(f"Error loading YAMNet model: {e}")
    yamnet_model = None

def load_yamnet_class_names():
    class_names_path = get_yamnet_class_map()
    df = pd.read_csv(class_names_path, header=0)
    return df.iloc[:, 2].tolist()

def classify_environmental_sounds(audio_path, threshold=0.15):
    try:
        print(f"Loading audio file: {audio_path}")
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {len(waveform)} samples, {sr}Hz")

        # Ensure waveform is the right shape and type
        waveform = waveform.astype(np.float32)

        # Process in smaller chunks to avoid memory issues
        chunk_size = 5 * sr  # 5-second chunks
        all_scores = []

        # Process each chunk separately
        for i in range(0, len(waveform), chunk_size):
            chunk = waveform[i:i + chunk_size]
            if len(chunk) < sr:  # Skip chunks shorter than 1 second
                continue

            chunk_tensor = tf.convert_to_tensor(chunk, dtype=tf.float32)

            print(f"Processing chunk {i//chunk_size + 1}/{(len(waveform) + chunk_size - 1)//chunk_size}")

            try:
                # Clear previous session memory
                tf.keras.backend.clear_session()

                # Process the chunk with YAMNet on CPU
                with tf.device('/cpu:0'):
                    scores, embeddings, spectrogram = yamnet_model(chunk_tensor)
                all_scores.append(scores)

            except Exception as chunk_err:
                print(f"Error processing chunk: {chunk_err}")
                # Continue with next chunk
                continue

        if not all_scores:
            print("No valid audio chunks processed")
            return []

        # Combine scores from all chunks
        combined_scores = tf.concat(all_scores, axis=0)
        mean_scores = tf.reduce_mean(combined_scores, axis=0).numpy()

        class_names = load_yamnet_class_names()
        detected_classes = [
            (class_names[i], float(mean_scores[i]))
            for i in np.where(mean_scores > threshold)[0]
            if i < len(class_names)
        ]

        return sorted(detected_classes, key=lambda x: x[1], reverse=True)

    except Exception as e:
        print(f"Error in classify_environmental_sounds: {e}")
        import traceback
        traceback.print_exc()
        return []

# Load Whisper model


print("Loading Whisper model...")
try:
    # Continue using GPU for Whisper since it's working correctly
    whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
    whisper_model = whisper.load_model("base", device=whisper_device)
    print(f"Whisper model loaded successfully on {whisper_device}")
except Exception as e:
    print(f"Error loading Whisper model: {e}")
    whisper_model = None
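
# "base" is a speed/accuracy compromise; whisper.load_model() also accepts "tiny",
# "small", "medium", and "large" checkpoints if more (or less) accuracy is needed.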

def transcribe_with_validation(audio_path, min_confidence=0.5, min_words=3):
    try:
        if whisper_model is None:
            return {'text': '', 'confidence': 0, 'is_valid': False}

        print(f"Transcribing audio: {audio_path}")
        result = whisper_model.transcribe(audio_path)
        transcription = result.get("text", "").strip()

        segments = result.get("segments", [])

        # Whisper segments do not expose a 'confidence' key (they report
        # avg_logprob / no_speech_prob instead), so the 0.7 default applies.
        confidences = [seg.get('confidence', 0.7) for seg in segments]
        avg_confidence = np.mean(confidences) if confidences else 0.8

        is_valid = (
            avg_confidence >= min_confidence and
            len(transcription.split()) >= min_words and
            any(c.isalpha() for c in transcription)
        )
        print(f"Transcription: '{transcription}'")
        print(f"Confidence: {avg_confidence:.2f}, Valid: {is_valid}")
        return {
            'text': transcription,
            'confidence': avg_confidence,
            'is_valid': is_valid
        }
    except Exception as e:
        print(f"Error in transcribe_with_validation: {e}")
        import traceback
        traceback.print_exc()
        return {'text': '', 'confidence': 0, 'is_valid': False}

# ENHANCED: More dynamic and contextual prompt creation


def create_image_prompt(audio_path, env_threshold=0.15):
    """
    Creates a context-aware image prompt based on audio analysis
    that adapts to the specific audio content detected
    """
    try:
        # Process audio with both YAMNet and Whisper
        env_results = []
        env_labels = []
        env_types = set()

        if yamnet_model is not None:
            print("Classifying environmental sounds...")
            env_results = classify_environmental_sounds(audio_path, threshold=env_threshold)
            env_labels = [label for label, score in env_results[:10]]  # Top 10 labels
            print(f"Top detected sounds: {env_labels}")

        # Categorize the environment for better context understanding
        weather_sounds = {'Rain', 'Thunder', 'Wind', 'Thunderstorm', 'Storm'}
        nature_sounds = {'Water', 'Stream', 'River', 'Ocean', 'Waves', 'Forest', 'Birds', 'Animals'}
        indoor_sounds = {'Inside, small room', 'Speech', 'Conversation', 'Music', 'Keyboard', 'Computer', 'Typing'}
        urban_sounds = {'Traffic', 'Vehicle', 'Car', 'Engine', 'Urban', 'City', 'Street'}
        crowd_sounds = {'Crowd', 'Applause', 'Cheering', 'Speech', 'Conference', 'Meeting'}

        # Identify environment types present
        for label in env_labels:
            if any(sound.lower() in label.lower() for sound in weather_sounds):
                env_types.add('weather')
            if any(sound.lower() in label.lower() for sound in nature_sounds):
                env_types.add('nature')
            if any(sound.lower() in label.lower() for sound in indoor_sounds):
                env_types.add('indoor')
            if any(sound.lower() in label.lower() for sound in urban_sounds):
                env_types.add('urban')
            if any(sound.lower() in label.lower() for sound in crowd_sounds):
                env_types.add('crowd')

        print("Transcribing speech...")
        stt_result = transcribe_with_validation(audio_path)

        # Build a context-aware prompt based on detected content
        # This is the key enhancement - dynamic prompt construction
        if stt_result['is_valid'] and len(stt_result['text']) > 10:
            # Speech is prominent - build a speech-focused scene with environmental context
            speech_text = stt_result['text']
            speech_topic = extract_topic(speech_text)
            speech_emotion = analyze_speech_emotion(speech_text)

            # Determine speech context (formal/informal, etc.)
            speech_context = determine_speech_context(env_types, env_labels, speech_text)

            # Build prompt around the speech with appropriate context
            prompt = f"A {speech_context} scene with a person speaking about {speech_topic}"

            # Add emotional context if detected
            if speech_emotion:
                prompt += f" with {speech_emotion} expression"

            # Add environment context if available
            if env_types:
                environment = get_environment_description(env_types, env_labels)
                prompt += f". {environment}"

        elif env_types:
            # No clear speech - focus on environmental sounds
            primary_env = determine_primary_environment(env_types, env_labels)

            # Build a rich environmental scene based on detected sounds
            prompt = primary_env

            # Add weather details if detected
            weather_details = extract_weather_details(env_labels)
            if weather_details:
                prompt += f" {weather_details}"

            # Add activity details if detected
            activity_details = extract_activity_details(env_labels)
            if activity_details:
                prompt += f" {activity_details}"

        else:
            # Fallback for when no clear context is detected
            prompt = "A realistic environmental scene with natural lighting and atmosphere"

        # Add quality enhancers appropriate for the type of scene
        prompt = enhance_prompt_quality(prompt, env_types, 'speech' in prompt.lower())

        return prompt

    except Exception as e:
        print(f"Error in create_image_prompt: {e}")
        import traceback
        traceback.print_exc()
        return "Realistic natural environment scene"

def extract_topic(text):
    """Extract the main topic from speech text"""
    # Simplified topic extraction - in production you might use NLP
    if len(text) < 20:
        return text.strip()

    # Basic topic extraction by taking the first sentence or phrase
    first_sentence = text.split('.')[0].strip()
    if len(first_sentence) > 50:
        return first_sentence[:50] + "..."
    return first_sentence

def analyze_speech_emotion(text):
    """Detect emotional tone in speech text"""
    # Simple keyword-based emotion detection
    positive_words = ['happy', 'excited', 'glad', 'wonderful', 'great', 'amazing', 'joy']
    negative_words = ['sad', 'angry', 'upset', 'terrible', 'awful', 'worried', 'concerned']
    neutral_words = ['explain', 'inform', 'tell', 'describe', 'discuss']

    text_lower = text.lower()

    # Count emotion words
    positive_count = sum(1 for word in positive_words if word in text_lower)
    negative_count = sum(1 for word in negative_words if word in text_lower)
    neutral_count = sum(1 for word in neutral_words if word in text_lower)

    # Determine dominant emotion
    if positive_count > negative_count and positive_count > neutral_count:
        return "positive"
    elif negative_count > positive_count and negative_count > neutral_count:
        return "serious or concerned"
    elif neutral_count > 0:
        return "informative"

    # Default to neutral if no clear emotion
    return ""

def determine_speech_context(env_types, env_labels, speech_text):
    """Determine the context of speech based on environment and content"""
    # Check for formal settings
    formal_indicators = ['conference', 'meeting', 'lecture', 'presentation', 'speech']
    formal_env = any(indicator in ' '.join(env_labels).lower() for indicator in formal_indicators)

    # Check for formal language in speech
    formal_speech = any(term in speech_text.lower() for term in
                        ['ladies and gentlemen', 'thank you for', 'i am pleased to',
                         'in conclusion', 'our company', 'organization'])

    # Check for casual settings
    casual_indicators = ['music', 'party', 'conversation', 'chat', 'laugh']
    casual_env = any(indicator in ' '.join(env_labels).lower() for indicator in casual_indicators)

    # Determine context
    if (formal_env or formal_speech) and not casual_env:
        return "formal presentation"
    elif casual_env:
        return "casual conversation"
    elif 'indoor' in env_types:
        return "indoor discussion"
    elif 'crowd' in env_types:
        return "public address"
    else:
        return "realistic speaking"

def determine_primary_environment(env_types, env_labels):
    """Determine the primary environment type based on detected sounds"""
    env_priorities = ['weather', 'nature', 'urban', 'crowd', 'indoor']

    # Check environment types in priority order
    for env in env_priorities:
        if env in env_types:
            if env == 'weather':
                return "A dramatic weather scene with atmospheric conditions"
            elif env == 'nature':
                return "A beautiful natural landscape with organic elements"
            elif env == 'urban':
                return "A detailed urban cityscape with architectural elements"
            elif env == 'crowd':
                return "A vibrant scene with a gathering of people"
            elif env == 'indoor':
                return "A detailed interior space with ambient lighting"

    # Default environment if no clear type is determined
    return "A realistic environmental scene"

def get_environment_description(env_types, env_labels):
    """Generate a rich environment description based on detected types and labels"""
    descriptions = []

    # Add specific descriptions based on environment types
    if 'weather' in env_types:
        weather_terms = [label for label in env_labels if label.lower() in
                         ['rain', 'thunder', 'wind', 'storm', 'lightning']]
        if weather_terms:
            descriptions.append(f"with {' and '.join(weather_terms).lower()} visible")

    if 'nature' in env_types:
        nature_desc = "in a natural setting"
        water_terms = [label for label in env_labels if label.lower() in
                       ['water', 'river', 'stream', 'ocean', 'waves']]
        if water_terms:
            nature_desc += f" with {water_terms[0].lower()}"
        descriptions.append(nature_desc)

    if 'indoor' in env_types:
        indoor_desc = "in an indoor space"
        if "Inside, small room" in env_labels:
            indoor_desc = "in a small room with intimate lighting"
        elif "Inside, large room" in env_labels:
            indoor_desc = "in a large hall with spacious architecture"
        descriptions.append(indoor_desc)

    if 'urban' in env_types:
        descriptions.append("in an urban environment with city elements")

    if 'crowd' in env_types:
        descriptions.append("with a crowd of attentive people")

    # Combine descriptions
    if descriptions:
        return " ".join(descriptions)
    else:
        return "in a detailed environment"

def extract_weather_details(env_labels):
    """Extract weather details from environment labels"""
    weather_details = []

    # Look for specific weather conditions
    if any('rain' in label.lower() for label in env_labels):
        weather_details.append("rain falling")

    if any('thunder' in label.lower() for label in env_labels):
        weather_details.append("thunderclouds")

    if any('wind' in label.lower() for label in env_labels):
        weather_details.append("with visible wind effects")

    if any('storm' in label.lower() for label in env_labels):
        weather_details.append("during a storm")

    # Combine details
    if weather_details:
        return "with " + ", ".join(weather_details)
    return ""

def extract_activity_details(env_labels):
    """Extract activity details from environment labels"""
    activities = []

    # Look for human activities
    if any(label.lower() in ['speech', 'speaking', 'talk'] for label in env_labels):
        activities.append("people engaged in conversation")

    if any(label.lower() in ['keyboard', 'typing', 'computer'] for label in env_labels):
        activities.append("someone working on a computer")

    if any(label.lower() in ['music', 'singing', 'instrument'] for label in env_labels):
        activities.append("with music being played")

    # Combine activities
    if activities:
        return "showing " + ", ".join(activities)
    return ""

def enhance_prompt_quality(prompt, env_types, has_person):
    """Add appropriate quality enhancers to the prompt based on content"""
    # Base quality enhancers
    quality_base = "highly detailed, sharp focus, professional photography"

    # Add environment-specific quality enhancers
    if 'weather' in env_types:
        prompt += f", {quality_base}, volumetric lighting, atmospheric conditions"

    elif 'nature' in env_types:
        prompt += f", {quality_base}, natural lighting, organic textures, atmospheric perspective"

    elif 'urban' in env_types:
        prompt += f", {quality_base}, urban textures, architectural details, realistic lighting"

    elif 'indoor' in env_types:
        prompt += f", {quality_base}, interior lighting, ambient occlusion, realistic textures"

    else:
        prompt += f", {quality_base}, realistic lighting"

    # Add person-specific quality if needed
    if has_person:
        prompt += ", detailed facial features, realistic expression, natural posture"

    return prompt

# Image Generator class


class ImageGenerator:
    def __init__(self):
        self.base_model_loaded = False
        self.upscaler_loaded = False
        self.base_model = None
        self.upscaler = None

    def load_base_model(self):
        """Load the base Stable Diffusion XL model"""
        if not self.base_model_loaded:
            print("Loading Stable Diffusion XL model...")
            try:
                # Use SDXL for high-quality images
                self.base_model = StableDiffusionXLPipeline.from_pretrained(
                    "stabilityai/stable-diffusion-xl-base-1.0",
                    torch_dtype=torch_dtype,
                    variant="fp16",
                    use_safetensors=True
                )

                # Optimize for speed and memory
                self.base_model.scheduler = DPMSolverMultistepScheduler.from_config(
                    self.base_model.scheduler.config,
                    algorithm_type="sde-dpmsolver++",
                    use_karras_sigmas=True
                )

                # Move to GPU if available
                self.base_model = self.base_model.to(device)

                # Enable memory optimization
                self.base_model.enable_attention_slicing()
                if torch.cuda.is_available():
                    self.base_model.enable_model_cpu_offload()

                self.base_model_loaded = True
                print("Base model loaded successfully")
            except Exception as e:
                print(f"Error loading base model: {e}")
                raise
        return self.base_model

    def load_upscaler(self):
        """Load the upscaler model for higher resolution"""
        if not self.upscaler_loaded:
            print("Loading upscaler model...")
            try:
                self.upscaler = StableDiffusionUpscalePipeline.from_pretrained(
                    "stabilityai/stable-diffusion-x4-upscaler",
                    torch_dtype=torch_dtype
                )
                self.upscaler = self.upscaler.to(device)
                if torch.cuda.is_available():
                    self.upscaler.enable_model_cpu_offload()
                self.upscaler_loaded = True
                print("Upscaler model loaded successfully")
            except Exception as e:
                print(f"Error loading upscaler: {e}")
                # Continue without upscaler
                pass
        return self.upscaler

    def generate_image(self, prompt, negative_prompt=None, guidance_scale=7.5,
                       steps=30, width=1024, height=1024, upscale=True,
                       enhance_faces=True, num_images=1):
        """Generate high-quality images with environmental context awareness"""

        # Load the base model if not already loaded
        if not self.base_model_loaded:
            self.load_base_model()

        if negative_prompt is None:
            negative_prompt = ("deformed, bad anatomy, disfigured, poorly drawn face, "
                               "mutation, mutated, extra limbs, ugly, poorly drawn hands, "
                               "missing limbs, blurry, watermark, grainy, signature, "
                               "cut off, low-res")

        # Generate images with the base model
        print(f"Generating {num_images} image(s) with prompt: {prompt}")
        images = self.base_model(
            prompt=prompt,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
            num_inference_steps=steps,
            width=width,
            height=height,
            num_images_per_prompt=num_images
        ).images

        # Process each generated image
        processed_images = []
        for idx, img in enumerate(images):
            print(f"Processing image {idx+1}/{len(images)}...")

            # Upscale if requested and the upscaler has already been loaded
            if upscale and self.upscaler_loaded:
                img = self._upscale_image(img)

            processed_images.append(img)

            # Save the image
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"{OUTPUT_DIR}/image_{timestamp}_{idx}.png"
            img.save(filename)
            print(f"Saved image to {filename}")

        return processed_images

    def _upscale_image(self, image):
        """Upscale an image to higher resolution"""
        try:
            if not self.upscaler_loaded:
                self.load_upscaler()

            if self.upscaler_loaded:
                # Resize to match upscaler's expected input
                low_res_img = image.resize((512, 512))
                upscaled = self.upscaler(
                    prompt="high quality, detailed, sharp focus",
                    image=low_res_img,
                    num_inference_steps=20
                ).images[0]
                return upscaled
        except Exception as e:
            print(f"Error during upscaling: {e}")

        # Return original if upscaling failed
        return image
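
# The generator can also be used on its own, independent of the audio analysis;
# a minimal (hypothetical) example:
#   gen = ImageGenerator()
#   pics = gen.generate_image("a rain-soaked city street at night",
#                             steps=25, upscale=False, num_images=1)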

# Complete pipeline function


def audio_to_image(audio_file_path, num_images=1):
    """Complete pipeline from audio file to images"""
    # Process audio to generate contextual prompt
    prompt = create_image_prompt(audio_file_path)
    print(f"\nGenerated prompt: {prompt}")

    # Initialize image generator
    generator = ImageGenerator()

    # Determine appropriate generation parameters based on prompt content
    has_face = ("person" in prompt.lower() or "people" in prompt.lower()
                or "speaking" in prompt.lower())
    is_weather = ("weather" in prompt.lower() or "rain" in prompt.lower()
                  or "storm" in prompt.lower())
    is_nature = ("nature" in prompt.lower() or "landscape" in prompt.lower()
                 or "forest" in prompt.lower())

    # Adjust guidance scale based on content
    guidance_scale = 8.0 if has_face else 7.5
    if is_weather or is_nature:
        guidance_scale = 7.0  # Lower for natural scenes

    # Adjust steps based on complexity
    steps = 35 if has_face else 30
    if is_weather:
        steps = 40  # More steps for complex weather

    # Generate images
    images = generator.generate_image(
        prompt=prompt,
        guidance_scale=guidance_scale,
        steps=steps,
        width=1024,
        height=1024,
        upscale=True,
        enhance_faces=has_face,
        num_images=num_images
    )

    return images, prompt

# Main execution
if __name__ == "__main__":
    print(f"Using audio file: {AUDIO_FILE_PATH}")

    if not os.path.exists(AUDIO_FILE_PATH):
        print(f"ERROR: File not found at {AUDIO_FILE_PATH}")
    else:
        print("\nProcessing audio to generate images...")
        images, prompt = audio_to_image(AUDIO_FILE_PATH, num_images=1)

        print("\n=== AUDIO ANALYSIS AND IMAGE GENERATION COMPLETE ===")
        print(f"Prompt: {prompt}")
        print(f"Generated {len(images)} images in {OUTPUT_DIR}")
        print("====================================================")
