*1
from google.colab import files
# Upload kaggle.json
files.upload()
# Move kaggle.json to the proper directory
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Install Kaggle API
!pip install kaggle
*2
# Download dataset
!kaggle datasets download -d warcoder/flickr-8k-audio-caption-corpus
# Unzip dataset
!unzip flickr-8k-audio-caption-corpus.zip -d /content/flickr8k_audio
*3
!kaggle datasets download -d adityajn105/flickr8k
!unzip flickr8k.zip -d /content/flickr8k_images
*4
import os
from PIL import Image
import librosa
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
# Example: Load an audio file
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav")  # Replace with actual filename
audio, sr = librosa.load(audio_path, sr=None)
print(f"Loaded audio with shape: {audio.shape}, Sample Rate: {sr}")
*5
import os
from PIL import Image
import librosa
from IPython.display import display, Audio
# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"
# Load and display an example image
image_path = os.path.join(image_dir, "1000268201_693b08cb0e.jpg") # Corrected file path
image = Image.open(image_path)
display(image)
# Load and play an example audio file
audio_path = os.path.join(audio_dir, "1000268201_693b08cb0e_0.wav") # Corrected file path
audio, sr = librosa.load(audio_path, sr=None)
print(f"Audio Loaded: Shape={audio.shape}, Sampling Rate={sr}")
# Play the audio in Colab
display(Audio(audio_path)) # Corrected method to play the audio
*6
import pandas as pd
from torch.utils.data import Dataset
class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, mapping_file, image_dir, audio_dir, transform=None):
        self.data = pd.read_csv(mapping_file)
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        image_path = os.path.join(self.image_dir, row["image"])
        audio_path = os.path.join(self.audio_dir, row["audio"])
        caption = row["caption"]
        # Load image
        image = Image.open(image_path)
        if self.transform:
            image = self.transform(image)
        # Load audio
        audio, sr = librosa.load(audio_path, sr=None)
        return image, audio, caption
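This class expects a mapping CSV with image, audio, and caption columns, which neither download provides directly. Below is a hedged sketch of how such a file could be assembled, assuming the audio corpus ships a wav2capt.txt mapping (wav filename, image filename, caption index) and the image dataset a captions.txt with image,caption rows; adjust the paths and formats if yours differ.
# Hypothetical helper: build the mapping CSV expected by Flickr8kAudioImageDataset.
# The file names/locations below are assumptions; verify them against your unzipped folders.
wav2capt_path = "/content/flickr8k_audio/flickr_audio/flickr_audio/wav2capt.txt"
captions_path = "/content/flickr8k_images/captions.txt"

captions = pd.read_csv(captions_path)  # expected columns: image, caption
caps_by_image = captions.groupby("image")["caption"].apply(list).to_dict()

rows = []
with open(wav2capt_path) as f:
    for line in f:
        wav_name, img_name, cap_idx = line.split()
        cap_idx = int(cap_idx.lstrip("#"))
        img_caps = caps_by_image.get(img_name, [])
        if cap_idx < len(img_caps):
            rows.append({"image": img_name, "audio": wav_name, "caption": img_caps[cap_idx]})

pd.DataFrame(rows).to_csv("/content/flickr8k_mapping.csv", index=False)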
*7
import os
image_dir = "/content/flickr8k_images/Images"
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_filenames = os.listdir(image_dir)
audio_filenames = os.listdir(audio_dir)
print(f"Number of images: {len(image_filenames)}")
print(f"Number of audio files: {len(audio_filenames)}")
# Optional: Print the first few files to see which ones exist
print(f"First few image filenames: {image_filenames[:5]}")
print(f"First few audio filenames: {audio_filenames[:5]}")
*8
import os
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import librosa
from torchvision import transforms
import numpy as np
import matplotlib.pyplot as plt
class Flickr8kAudioImageDataset(Dataset):
    def __init__(self, image_dir, audio_dir, transform=None, audio_length=22050):
        self.image_dir = image_dir
        self.audio_dir = audio_dir
        self.transform = transform
        self.audio_length = audio_length  # Target length for audio (e.g., 22050 samples for 1 second at 22.05 kHz)
        # Get image and audio filenames
        self.image_filenames = os.listdir(image_dir)
        self.audio_filenames = os.listdir(audio_dir)
        # Sort the image filenames by their base name (strip extensions)
        image_base_filenames = sorted([os.path.splitext(f)[0] for f in self.image_filenames])
        # Sort the audio filenames by their base name (strip extensions)
        audio_base_filenames = sorted([os.path.splitext(f)[0] for f in self.audio_filenames])
        # Map each image to the first audio file whose name starts with the image's base name
        image_to_audio_map = {}
        for image_base in image_base_filenames:
            corresponding_audio = [audio for audio in audio_base_filenames if audio.startswith(image_base)]
            if corresponding_audio:
                image_to_audio_map[image_base] = corresponding_audio[0]  # Keep the first matching audio file
        # Create lists of matching image and audio filenames
        self.image_filenames = [image_base + ".jpg" for image_base in image_to_audio_map.keys()]
        self.audio_filenames = [audio + ".wav" for audio in image_to_audio_map.values()]
        # Ensure there is at least one matching pair
        assert len(self.image_filenames) > 0, "No matching image and audio files found"

    def __len__(self):
        return len(self.image_filenames)

    def __getitem__(self, idx):
        # Get the image and audio file names
        image_filename = self.image_filenames[idx]
        audio_filename = self.audio_filenames[idx]
        # Load the image (convert to RGB so every sample has 3 channels)
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")
        # Optionally apply transformations to the image
        if self.transform:
            image = self.transform(image)
        else:
            # Default transform: resize and convert to tensor if none provided
            transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])
            image = transform(image)
        # Load the audio
        audio_path = os.path.join(self.audio_dir, audio_filename)
        audio, sr = librosa.load(audio_path, sr=None)
        # Ensure the audio length matches the target length by padding or truncating
        if len(audio) < self.audio_length:
            # Pad with zeros if the audio is shorter than the target length
            audio = np.pad(audio, (0, self.audio_length - len(audio)), mode='constant')
        else:
            # Truncate if the audio is longer than the target length
            audio = audio[:self.audio_length]
        # Convert audio to tensor
        audio = torch.tensor(audio, dtype=torch.float32)
        # Return image, audio, and the filename (for potential captions or other info)
        return image, audio, audio_filename
# Define paths
audio_dir = "/content/flickr8k_audio/flickr_audio/flickr_audio/wavs"
image_dir = "/content/flickr8k_images/Images"
# Initialize dataset and DataLoader
dataset = Flickr8kAudioImageDataset(
    image_dir=image_dir,
    audio_dir=audio_dir,
    transform=None,        # Add image transformations if needed
    audio_length=22050,    # Set audio length (e.g., 1 second at 22.05 kHz)
)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# Iterate over the DataLoader
for batch in dataloader:
    images, audios, filenames = batch
    print("Images shape:", images[0].size())  # Shape of one image (should be [3, 256, 256])
    print("Audios shape:", audios.shape)      # Audio batch shape (should be [batch_size, 22050])
    print("Filenames:", filenames[:5])        # Show the first few filenames
    # Visualize the first image and its corresponding audio waveform
    plt.figure(figsize=(12, 6))
    # Plot image
    plt.subplot(1, 2, 1)
    plt.imshow(images[0].permute(1, 2, 0))  # Convert from [C, H, W] to [H, W, C] for plotting
    plt.title(f"Image: {filenames[0]}")
    # Plot audio waveform
    plt.subplot(1, 2, 2)
    plt.plot(audios[0].numpy())  # Convert tensor to numpy for plotting
    plt.title(f"Audio waveform: {filenames[0]}")
    plt.show()
    break  # Only display the first batch; remove break to loop over all batches
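Note that audio_length=22050 keeps only the first 22,050 samples of each clip (about one second at the 22.05 kHz rate assumed above), while spoken captions are usually several seconds long, so most of each waveform is truncated. A small sketch to sample a few files and inspect their real durations before settling on audio_length:
# Inspect a few clip durations to pick a sensible audio_length (sketch; uses random sampling)
import random

sample_files = random.sample(dataset.audio_filenames, 10)
durations = []
for fname in sample_files:
    y, sr = librosa.load(os.path.join(audio_dir, fname), sr=None)
    durations.append(len(y) / sr)
print(f"Sampled durations (s): min={min(durations):.2f}, "
      f"max={max(durations):.2f}, mean={sum(durations)/len(durations):.2f}")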
*9
import torch
import torch.nn as nn
import torch.optim as optim
# Define a simple CNN for image processing
class ImageModel(nn.Module):
    def __init__(self):
        super(ImageModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Use adaptive pooling to handle varying input sizes
        self.adaptive_pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc1 = nn.Linear(32 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, 128)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.adaptive_pool(x)   # Adaptive pooling to handle different input sizes
        x = x.view(-1, 32 * 8 * 8)  # Flatten the output
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
# Define an RNN for audio processing
class AudioModel(nn.Module):
    def __init__(self, input_size=22050, hidden_size=128):
        super(AudioModel, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 128)

    def forward(self, x):
        # The DataLoader yields raw waveforms of shape [batch_size, input_size].
        # Add a sequence dimension so the LSTM sees [batch_size, seq_len=1, input_size];
        # feeding the 2D tensor directly would make the LSTM treat the batch as a sequence.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        out, _ = self.rnn(x)  # out has shape (batch_size, seq_len, hidden_size)
        out = out[:, -1, :]   # Take the last timestep
        out = self.fc(out)    # Pass through the fully connected layer
        return out
# Update the MultimodalModel class
class MultimodalModel(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalModel, self).__init__()
        self.image_model = ImageModel()
        self.audio_model = AudioModel()
        self.fc = nn.Linear(128 + 128, num_classes)  # For multiclass classification

    def forward(self, image, audio):
        image_features = self.image_model(image)
        audio_features = self.audio_model(audio)
        combined = torch.cat((image_features, audio_features), dim=1)  # Concatenate image and audio features
        output = self.fc(combined)
        return output
# Initialize the model, loss, and optimizer
num_classes = 10  # Example value; set this to the number of classes in your labels
model = MultimodalModel(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()  # For multiclass classification
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Training loop (simplified)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, audios, filenames in dataloader:
        images, audios = images.to(device), audios.to(device)  # Send data to GPU if available
        optimizer.zero_grad()
        # Forward pass
        outputs = model(images, audios)
        # Calculate loss: CrossEntropyLoss expects integer class indices as targets.
        # The zeros below are only a placeholder; modify according to your target.
        targets = torch.zeros(images.size(0), dtype=torch.long, device=device)
        loss = criterion(outputs, targets)
        # Backward pass
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
*10
from torch.utils.data import random_split
# Define the percentage of data to be used for validation (e.g., 20% validation, 80% training)
validation_split = 0.2
dataset_size = len(dataset)
validation_size = int(validation_split * dataset_size)
train_size = dataset_size - validation_size
# Split the dataset
train_dataset, validation_dataset = random_split(dataset, [train_size, validation_size])
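If you want the same split on every run, random_split accepts a seeded generator; a minimal variant:
# Reproducible split: fix the random seed used by random_split
generator = torch.Generator().manual_seed(42)
train_dataset, validation_dataset = random_split(
    dataset, [train_size, validation_size], generator=generator
)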
*11
# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, shuffle=False)
*12
def evaluate(model, validation_dataloader, criterion, device):
    model.eval()  # Set model to evaluation mode
    validation_loss = 0.0
    with torch.no_grad():
        for images, audios, filenames in validation_dataloader:
            images, audios = images.to(device), audios.to(device)
            outputs = model(images, audios)
            targets = get_target(filenames)  # Implement target fetching logic based on filenames
            loss = criterion(outputs, targets.to(device))
            validation_loss += loss.item()
    avg_validation_loss = validation_loss / len(validation_dataloader)
    print(f"Validation Loss: {avg_validation_loss:.4f}")
*13
torch.save(model.state_dict(), 'best_model.pth')
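To reuse the saved weights later, construct the model the same way and load the state dict (num_classes and device are assumed to be the values defined in the earlier cells):
# Restore the saved weights into a freshly constructed model
model = MultimodalModel(num_classes=num_classes)
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)
model.eval()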
*14
def __getitem__(self, idx):
    image = Image.open(self.image_paths[idx]).convert("RGB")  # Load image
    label = self.labels[idx]  # Get the corresponding label
    if self.transform:
        image = self.transform(image)  # Apply transformations
    return image, label  # Ensure only two values are returned
*15
data = next(iter(train_dataloader))
print(len(data)) # Number of elements in the returned tuple
print(type(data)) # Check the type (it should be a tuple)
*16
images, labels, additional_info = next(iter(train_dataloader))
*17
print(labels.shape)
*18
import matplotlib.pyplot as plt
# Get a batch of images and labels (audio)
images, labels, filenames = next(iter(train_dataloader))
# Get the first audio sample (labels[0] is the audio)
audio = labels[0].cpu().numpy() # Convert the tensor to a numpy array
# Plot the audio waveform
plt.figure(figsize=(10, 4))
plt.plot(audio)
plt.title(f"Audio Waveform of Sample 0")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")
plt.show()
*19
# Debugging label extraction
for filename in filenames[:5]:
    print(f"Filename: {filename}, Extracted Label: {get_labels_from_filenames([filename])}")
*20
# Check predicted and actual labels
_, predicted = torch.max(outputs, 1)
print(f"Predicted: {predicted}, Actual: {labels}")
*21
# Visualize or print out image and label pairs
for i in range(5):
    print(f"Image {i}: {images[i].shape}, Label: {labels[i]}")
*22
def get_labels_from_filenames(filenames):
    labels = []
    for filename in filenames:
        # Example: assuming the label is encoded as the first part of the filename,
        # e.g. 'class1_img_1.jpg' -> 'class1'. Customize this for your dataset;
        # Flickr8k filenames are photo IDs, so you will need your own mapping there.
        label = filename.split('_')[0]  # Take the part before the first underscore
        # Convert the label string to an integer class index
        # (here assuming labels look like 'class1', 'class2', ...)
        label = int(label[5:])  # Strip the 'class' prefix and keep the number
        labels.append(label)
    # Convert labels to a tensor (as long type) for CrossEntropyLoss
    labels = torch.tensor(labels, dtype=torch.long)
    return labels
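Since Flickr8k filenames do not carry class names, the parsing above will not work on them as-is. A hedged alternative is to build an explicit string-to-index vocabulary from whatever key you decide carries the label (the key_fn below is an assumption to adapt):
# Hypothetical alternative: map label strings to integer class indices via a vocabulary.
def build_label_vocab(filenames, key_fn=lambda f: f.split('_')[0]):
    keys = sorted({key_fn(f) for f in filenames})  # unique label keys, in a stable order
    return {k: i for i, k in enumerate(keys)}      # string -> class index

def labels_from_vocab(filenames, vocab, key_fn=lambda f: f.split('_')[0]):
    return torch.tensor([vocab[key_fn(f)] for f in filenames], dtype=torch.long)

# Usage (sketch): vocab = build_label_vocab(dataset.audio_filenames)
#                 labels = labels_from_vocab(filenames, vocab)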
*23
correct = 0
total = 0
model.eval()  # Evaluation mode
with torch.no_grad():  # No gradients needed for evaluation
    for images, audios, filenames in validation_dataloader:  # Filenames come back as part of the batch
        images = images.to(device)
        audios = audios.to(device)
        # Extract labels from filenames (adjust this based on how your dataset is structured)
        labels = get_labels_from_filenames(filenames)
        labels = labels.to(device)  # Ensure labels are on the same device as the model
        # Forward pass
        outputs = model(images, audios)
        _, predicted = torch.max(outputs, 1)
        # Accumulate total and correct predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
# Calculate and print accuracy
accuracy = 100 * correct / total
print(f'Validation Accuracy: {accuracy}%')