*1
from google.colab import files
# Upload kaggle.json
files.upload()
# Move kaggle.json to the proper directory
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Install Kaggle API
!pip install kaggle
*2
# Download dataset
!kaggle datasets download -d warcoder/flickr-8k-audio-caption-corpus
# Unzip dataset
!unzip flickr-8k-audio-caption-corpus.zip -d /content/flickr8k_audio
*3
!kaggle datasets download -d adityajn105/flickr8k
!unzip flickr8k.zip -d /content/flickr8k_images
*4
import os
from PIL import Image
import librosa
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
# Example: Load an audio file
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")  # Replace with actual filename
audio, sr = librosa.load(audio_path, sr=None)
print(f"Loaded audio with shape: {audio.shape}, Sample Rate: {sr}")
*5
import os
from PIL import Image
import librosa
from IPython.display import display, Audio
# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"
# Load and display an example image
image_path = os.path.join(image_dir, "1000268201_693b08cb0e.jpg") # Corrected file path
image = Image.open(image_path)
display(image)
# Load and play an example audio file
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav") # Corrected file path
audio, sr = librosa.load(audio_path, sr=None)
print(f"Audio Loaded: Shape={audio.shape}, Sampling Rate={sr}")
# Play the audio in Colab
display(Audio(audio_path)) # Corrected method to play the audio
*6
import pandas as pd
from torch.utils.data import Dataset
class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, mapping_file, image_dir, audio_dir, transform=None):
        self.data = pd.read_csv(mapping_file)
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        audio_path = os.path.join(self.audio_dir, row["audio"])
        caption = row["caption"]
        # Load image
        image = Image.open(image_path)
        if self.transform:
            image = self.transform(image)
        # Load audio
        audio, sr = librosa.load(audio_path, sr=None)
        return image, audio, caption
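This class expects a mapping CSV with image, audio, and caption columns, which neither download provides directly. Below is a hedged sketch of how such a file could be assembled, assuming the audio corpus ships a wav2capt.txt mapping (wav filename, image filename, caption index) and the image dataset a captions.txt with image,caption rows; adjust the paths and formats if yours differ.
# Hypothetical helper: build the mapping CSV expected by Flickr8kAudioImageDataset.
# The file names/locations below are assumptions; verify them against your unzipped folders.
wav2capt_path = "/content/flickr8k_audio/flickr_audio/flickr_audio/wav2capt.txt"
captions_path = "/content/flickr8k_images/captions.txt"

captions = pd.read_csv(captions_path)  # expected columns: image, caption
caps_by_image = captions.groupby("image")["caption"].apply(list).to_dict()

rows = []
with open(wav2capt_path) as f:
    for line in f:
        wav_name, img_name, cap_idx = line.split()
        cap_idx = int(cap_idx.lstrip("#"))
        img_caps = caps_by_image.get(img_name, [])
        if cap_idx < len(img_caps):
            rows.append({"image": img_name, "audio": wav_name, "caption": img_caps[cap_idx]})

pd.DataFrame(rows).to_csv("/content/flickr8k_mapping.csv", index=False)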
*7
import os
image_dir = "/content/flickr8k_images/Images"
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_filenames = os.listdir(image_dir)
audio_filenames = os.listdir(audio_dir)
print(f"Number of images: {len(image_filenames)}")
print(f"Number of audio files: {len(audio_filenames)}")
# Optional: Print the first few files to see which ones exist
print(f"First few image filenames: {image_filenames[:5]}")
print(f"First few audio filenames: {audio_filenames[:5]}")
*8
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import librosa
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt
class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None, audio_length=22050):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        self.audio_length = audio_length  # Target length for audio (e.g., 22050 samples for 1 second at 22.05 kHz)
        # Get image and audio filenames
        self.image_filenames = os.listdir(image_dir)
        self.audio_filenames = os.listdir(audio_dir)
        # Sort the image filenames by their base name (strip extensions)
        image_base_filenames = sorted([os.path.splitext(f)[0] for f in self.image_filenames])
        # Sort the audio filenames by their base name (strip extensions)
        audio_base_filenames = sorted([os.path.splitext(f)[0] for f in self.audio_filenames])
        # Map each image to the first audio file whose name starts with the image's base name
        image_to_audio_map = {}
        for image_base in image_base_filenames:
            corresponding_audio = [audio for audio in audio_base_filenames if audio.startswith(image_base)]
            if corresponding_audio:
                image_to_audio_map[image_base] = corresponding_audio[0]  # Keep the first matching audio file
        # Create lists of matching image and audio filenames
        self.image_filenames = [image_base + ".jpg" for image_base in image_to_audio_map.keys()]
        self.audio_filenames = [audio + ".wav" for audio in image_to_audio_map.values()]
        # Ensure there is at least one matching pair
        assert len(self.image_filenames) > 0, "No matching image and audio files found"

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        # Get the image and audio file names
        image_filename = self.image_filenames[idx]
        audio_filename = self.audio_filenames[idx]
        # Load the image (convert to RGB so every sample has 3 channels)
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")
        # Optionally apply transformations to the image
        if self.transform:
            image = self.transform(image)
        else:
            # Default transform: resize and convert to tensor if none provided
            transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])
            image = transform(image)
        # Load the audio
        audio_path = os.path.join(self.audio_dir, audio_filename)
        audio, sr = librosa.load(audio_path, sr=None)
        # Ensure the audio length matches the target length by padding or truncating
        if len(audio) < self.audio_length:
            # Pad with zeros if the audio is shorter than the target length
            audio = np.pad(audio, (0, self.audio_length - len(audio)), mode='constant')
        else:
            # Truncate if the audio is longer than the target length
            audio = audio[:self.audio_length]
        # Convert audio to tensor
        audio = torch.tensor(audio, dtype=torch.float32)
        # Return image, audio, and the filename (for potential captions or other info)
        return image, audio, audio_filename
# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"
# Initialize dataset and DataLoader
dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=None,        # Add image transformations if needed
    audio_length=22050,    # Set audio length (e.g., 1 second at 22.05 kHz)
)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Iterate over the DataLoader
for batch in dataloader:
    images, audios, filenames = batch
    print("Images shape:", images[0].size())  # Shape of one image (should be [3, 256, 256])
    print("Audios shape:", audios.shape)      # Audio batch shape (should be [batch_size, 22050])
    print("Filenames:", filenames[:5])        # Show the first few filenames
    # Visualize the first image and its corresponding audio waveform
    plt.figure(figsize=(12, 6))
    # Plot image
    plt.subplot(1, 2, 1)
    plt.imshow(images[0].permute(1, 2, 0))  # Convert from [C, H, W] to [H, W, C] for plotting
    plt.title(f"Image: {filenames[0]}")
    # Plot audio waveform
    plt.subplot(1, 2, 2)
    plt.plot(audios[0].numpy())  # Convert tensor to numpy for plotting
    plt.title(f"Audio waveform: {filenames[0]}")
    plt.show()
    break  # Only display the first batch; remove break to loop over all batches
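Note that audio_length=22050 keeps only the first 22,050 samples of each clip (about one second at the 22.05 kHz rate assumed above), while spoken captions are usually several seconds long, so most of each waveform is truncated. A small sketch to sample a few files and inspect their real durations before settling on audio_length:
# Inspect a few clip durations to pick a sensible audio_length (sketch; uses random sampling)
import random

sample_files = random.sample(dataset.audio_filenames, 10)
durations = []
for fname in sample_files:
    y, sr = librosa.load(os.path.join(audio_dir, fname), sr=None)
    durations.append(len(y) / sr)
print(f"Sampled durations (s): min={min(durations):.2f}, "
      f"max={max(durations):.2f}, mean={sum(durations)/len(durations):.2f}")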
*9
import torch
import torch.nn as nn
import torch.optim as optim
# Define a simple CNN for image processing
class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Use adaptive pooling to handle varying input sizes
        self.adaptive_pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc1 = nn.Linear(32 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 128)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.adaptive_pool(x)   # Adaptive pooling to handle different input sizes
        x = x.view(-1, 32 * 8 * 8)  # Flatten the output
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
# Define an RNN for audio processing
class AudioModel(nn.Module):
    def __init__(self, input_size=22050, hidden_size=128):
        super(AudioModel, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 128)

    def forward(self, x):
        # The DataLoader yields raw waveforms of shape [batch_size, input_size].
        # Add a sequence dimension so the LSTM sees [batch_size, seq_len=1, input_size];
        # feeding the 2D tensor directly would make the LSTM treat the batch as a sequence.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.rnn(x)  # out has shape (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]   # Take the last timestep
        out = self.fc(out)    # Pass through the fully connected layer
        return out
# Update the MultimodalModel class
class MultimodalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel()
        self.audio_model = AudioModel()
        self.fc = nn.Linear(128 + 128, num_classes)  # For multiclass classification

    def forward(self, image, audio):
        image_features = self.image_model(image)
        audio_features = self.audio_model(audio)
        combined = torch.cat((image_features, audio_features), dim=1)  # Concatenate image and audio features
        output = self.fc(combined)
        return output
# Initialize the model, loss, and optimizer
num_classes = 10  # Example value; set this to the number of classes in your labels
model = MultimodalModel(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Training loop (simplified)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, audios, filenames in dataloader:
        images, audios = images.to(device), audios.to(device)  # Send data to GPU if available
        optimizer.zero_grad()
        # Forward pass
        outputs = model(images, audios)
        # Calculate loss: CrossEntropyLoss expects integer class indices as targets.
        # The zeros below are only a placeholder; modify according to your target.
        targets = torch.zeros(images.size(0), dtype=torch.long, device=device)
        loss = criterion(outputs, targets)
        # Backward pass
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
*10
from torch.utils.data import random_split
# Define the percentage of data to be used for validation (e.g., 20% validation, 80% training)
validation_split = 0.2
dataset_size = len(dataset)
validation_size = int(validation_split * dataset_size)
train_size = dataset_size - validation_size
# Split the dataset
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
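If you want the same split on every run, random_split accepts a seeded generator; a minimal variant:
# Reproducible split: fix the random seed used by random_split
generator = torch.Generator().manual_seed(42)
train_dataset, validation_dataset = random_split(
    dataset, [train_size, validation_size], generator=generator
)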
*11
# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)
*12
def evaluate(model, validation_dataloader, criterion, device):
    model.eval()  # Set model to evaluation mode
    validation_loss = 0.0
    with torch.no_grad():
        for images, audios, filenames in validation_dataloader:
            images, audios = images.to(device), audios.to(device)
            outputs = model(images, audios)
            targets = get_target(filenames)  # Implement target fetching logic based on filenames
            loss = criterion(outputs, targets.to(device))
            validation_loss += loss.item()
    avg_validation_loss = validation_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_validation_loss:.4f}")
*13
torch.save(model.state_dict(), 'best_model.pth')
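To reuse the saved weights later, construct the model the same way and load the state dict (num_classes and device are assumed to be the values defined in the earlier cells):
# Restore the saved weights into a freshly constructed model
model = MultimodalModel(num_classes=num_classes)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)
model.eval()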
*14
def __getitem__(self, idx):
    image = Image.open(self.image_paths[idx]).convert("RGB")  # Load image
    label = self.labels[idx]  # Get the corresponding label
    if self.transform:
        image = self.transform(image)  # Apply transformations
    return image, label  # Ensure only two values are returned
*15
data = next(iter(train_dataloader))
print(len(data)) # Number of elements in the returned tuple
print(type(data)) # Check the type (it should be a tuple)
*16
images, labels, additional_info = next(iter(train_dataloader))
*17
print(labels.shape)
*18
import matplotlib.pyplot as plt
# Get a batch of images and labels (audio)
images, labels, filenames = next(iter(train_dataloader))
# Get the first audio sample (labels[0] is the audio)
audio = labels[0].cpu().numpy() # Convert the tensor to a numpy array
# Plot the audio waveform
plt.figure(figsize=(10, 4))
plt.plot(audio)
plt.title(f"Audio Waveform of Sample 0")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")
plt.show()
*19
# Debugging label extraction
for filename in filenames[:5]:
    print(f"Filename: {filename}, Extracted Label: {get_labels_from_filenames([filename])}")
*20
# Check predicted and actual labels
_, predicted = torch.max(outputs, 1)
print(f"Predicted: {predicted}, Actual: {labels}")
*21
# Visualize or print out image and label pairs
for i in range(5):
    print(f"Image {i}: {images[i].shape}, Label: {labels[i]}")
*22
def get_labels_from_filenames(filenames):
    labels = []
    for filename in filenames:
        # Example: assuming the label is encoded as the first part of the filename,
        # e.g. 'class1_img_1.jpg' -> 'class1'. Customize this for your dataset;
        # Flickr8k filenames are photo IDs, so you will need your own mapping there.
        label = filename.split('_')[0]  # Take the part before the first underscore
        # Convert the label string to an integer class index
        # (here assuming labels look like 'class1', 'class2', ...)
        label = int(label[5:])  # Strip the 'class' prefix and keep the number
        labels.append(label)
    # Convert labels to a tensor (as long type) for CrossEntropyLoss
    labels = torch.tensor(labels, dtype=torch.long)
    return labels
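Since Flickr8k filenames do not carry class names, the parsing above will not work on them as-is. A hedged alternative is to build an explicit string-to-index vocabulary from whatever key you decide carries the label (the key_fn below is an assumption to adapt):
# Hypothetical alternative: map label strings to integer class indices via a vocabulary.
def build_label_vocab(filenames, key_fn=lambda f: f.split('_')[0]):
    keys = sorted({key_fn(f) for f in filenames})  # unique label keys, in a stable order
    return {k: i for i, k in enumerate(keys)}      # string -> class index

def labels_from_vocab(filenames, vocab, key_fn=lambda f: f.split('_')[0]):
    return torch.tensor([vocab[key_fn(f)] for f in filenames], dtype=torch.long)

# Usage (sketch): vocab = build_label_vocab(dataset.audio_filenames)
#                 labels = labels_from_vocab(filenames, vocab)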
*23
correct = 0
total = 0
model.eval()  # Evaluation mode
with torch.no_grad():  # No gradients needed for evaluation
    for images, audios, filenames in validation_dataloader:  # Filenames come back as part of the batch
        images = images.to(device)
        audios = audios.to(device)
        # Extract labels from filenames (adjust this based on how your dataset is structured)
        labels = get_labels_from_filenames(filenames)
        labels = labels.to(device)  # Ensure labels are on the same device as the model
        # Forward pass
        outputs = model(images, audios)
        _, predicted = torch.max(outputs, 1)
        # Accumulate total and correct predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
# Calculate and print accuracy
accuracy = 100 * correct / total
print(f'Validation Accuracy: {accuracy}%')