ASSIGNMENT
1. SVM Classification on News Dataset
Code:
#SVM classification on News Dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
#Load the dataset
file_path = "Google News.csv"
df = pd.read_csv(file_path)#encoding='ISO-8859-1'
#Drop missing values
df=df.dropna()
#Extract features and labels
X_text = df[['title','publisher','date','keyword','country']].astype(str).agg(' '.join, axis=1)
y = df['category']
#Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english',max_features=5000)
X_tfidf=vectorizer.fit_transform(X_text)
#Standardize the TF-IDF features
scaler = StandardScaler(with_mean=False)
X_tfidf_scaled=scaler.fit_transform(X_tfidf)
#Split into training and testing sets(80%train,20%test)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_scaled, y, test_size=0.2, random_state=42)
#Train SVM model
svm_model=SVC(kernel='linear',random_state=42)
svm_model.fit(X_train,y_train)
#Predict on test data
y_pred=svm_model.predict(X_test)
#Evaluate model performance
accuracy=accuracy_score(y_test,y_pred)
report=classification_report(y_test,y_pred,zero_division=1)
#Compute confusion matrix
conf_matrix=confusion_matrix(y_test,y_pred)
#Plot confusion matrix
plt.figure(figsize=(10,7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
#Print the results
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)
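Note: SVC with a linear kernel can be slow to train as the dataset grows. A minimal alternative sketch using LinearSVC, which is usually much faster on sparse TF-IDF features (reuses the train/test split from the listing above; not part of the original exercise):
#Faster linear SVM on sparse TF-IDF features (sketch)
from sklearn.svm import LinearSVC
fast_svm = LinearSVC(random_state=42)
fast_svm.fit(X_train, y_train) #same scaled TF-IDF split as above
print("LinearSVC accuracy:", accuracy_score(y_test, fast_svm.predict(X_test)))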
Dataset:
Input:
Output:
2. KNN Classification with Decision Boundary
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report
#Load Dataset
df=pd.read_csv("student_pass.csv")
#Split into features (X) and target (y)
X = df[['Hours_Studied','Sleep_Hours']] #Features
y = df['Exam_Score'].map({'Fail':0,'Pass':1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#Train KNN model(K=3)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,y_train)
#Predict on test data
y_pred=knn.predict(X_test)
cm=confusion_matrix(y_test,y_pred)
print("Confusion Matrix:\n",cm)
print("\nClassification Report:\n",classification_report(y_test,y_pred))
#Plotting the decision boundary
plt.figure(figsize=(10,6))
#Create a mesh grid for decision boundary
x_min,x_max=X["Hours_Studied"].min()-1,X["Hours_Studied"].max()+1
y_min,y_max=X["Sleep_Hours"].min()-1,X["Sleep_Hours"].max()+1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
#Predict for each point in the grid (built as a DataFrame so feature names match the training data)
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=["Hours_Studied","Sleep_Hours"])
Z = knn.predict(grid_points)
Z = Z.reshape(xx.shape)
#Plot the decision boundary using contour
plt.contourf(xx,yy,Z,alpha=0.3,cmap='coolwarm')
#Scatter plot of training data
sns.scatterplot(x=X_train["Hours_Studied"], y=X_train["Sleep_Hours"], hue=y_train, palette={0:'red',1:'green'}, s=100, edgecolor='black')
#Scatter plot of test data
sns.scatterplot(x=X_test["Hours_Studied"], y=X_test["Sleep_Hours"], hue=y_pred, marker='s', palette={0:'orange',1:'blue'}, s=150, edgecolor='black')
#Labels and title
plt.xlabel("Hours Studied")
plt.ylabel("Sleep Hours")
plt.title("KNN Classification with Decision Boundary")
plt.legend(title="Legend", labels=["Fail (Train)","Pass (Train)","Fail (Test)","Pass (Test)"])
plt.grid(True)
plt.show()
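Note: K=3 was fixed by hand above. A minimal sketch for choosing K by 5-fold cross-validation (reuses X and y from the listing above; not part of the original exercise):
#Choose K by cross-validation (sketch)
from sklearn.model_selection import cross_val_score
for k in range(1, 12, 2):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
    print(f"K={k}: mean CV accuracy = {scores.mean():.3f}")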
Dataset:
Input:
Output:
3. Practice Questions
3(a)
Code:
import pandas as pd
#Creating a series from a list
data = [10,20,30,40,50]
series1 = pd.Series(data)
print(series1)
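Note: by default a Series gets an integer index (0 to n-1). A small sketch of the same Series with a custom label index (reuses data from above):
#Series with a custom index
series2 = pd.Series(data, index=['a','b','c','d','e'])
print(series2)
print(series2['c']) #label-based access, prints 30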
Input:
Output:
3(b)
Code:
#Creating a pandas dataframe
import pandas as pd
#Creating a DataFrame from a dictionary
data={
'Name':['Alice','Bob','Charlie'],
'Age':[25,30,35],
'Salary':[50000,60000,70000]
}
df=pd.DataFrame(data)
print(df)
Input:
Output:
3(c)
Code:
#From a list of lists
import pandas as pd
data = [
['Alice',25,50000],
['Bob',30,60000],
['Charlie',35,70000]
]
df = pd.DataFrame(data,columns=['Name','Age','Salary'])
print(df)
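Note: once a DataFrame exists, rows and columns can be selected in several ways; a short sketch using the df from above:
#Basic selection on the DataFrame above
print(df['Name'])         #single column (returns a Series)
print(df.iloc[0])         #first row by position
print(df[df['Age'] > 28]) #boolean filtering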
Input:
Output:
3(d)
Code:
#missing values
import pandas as pd
import numpy as np
#creating a dataset with some missing values
data = {
'Name': ['Alice','Bob','Charlie','David','Emma'],
'Age': [25,np.nan,30,35,np.nan],
'Salary': [50000,60000,np.nan,80000,75000],
'Department': ['HR','IT',np.nan,'Finance','IT']
}
df = pd.DataFrame(data)
print("Original Dataset with Missing Values:")
print(df)
Input:
Output:
3(e)
Code:
print("Missing Values in Each Column:")
print(df.isnull().sum()) #count missing values in each column
Input:
Output:
3(f)
Code:
import pandas as pd
import numpy as np
#Fill missing Age with the mean age
df['Age'] = df['Age'].fillna(df['Age'].mean())
#Fill missing Salary with the median salary
df['Salary'] = df['Salary'].fillna(df['Salary'].median())
#Fill missing Department with the most frequent value (mode)
df['Department'] = df['Department'].fillna(df['Department'].mode()[0])
print("Dataset after filling missing values:")
print(df)
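Note: filling is one strategy; dropping incomplete rows or columns is the other. A sketch on a fresh copy (assumes the data dictionary from 3(d) is still in scope):
#Alternative: drop missing values instead of filling them
df2 = pd.DataFrame(data)          #rebuild the DataFrame with NaNs
print(df2.dropna())               #drop rows containing any NaN
print(df2.dropna(axis=1))         #drop columns containing any NaN
print(df2.dropna(subset=['Age'])) #drop rows where 'Age' is NaN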
Input:
Output:
3(g)
Code:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
#Min-Max normalization
#sample data
data = np.array([[1,2],[3,4],[5,6],[7,8]])
#initialize the scaler
scaler = MinMaxScaler()
#fit and transform the data
print(data)
normalized_data = scaler.fit_transform(data)
print("Normalized Data (Min-Max Scaling)")
print(normalized_data)
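Note: Min-Max scaling maps each column to [0, 1] via x' = (x - min) / (max - min). A one-line manual check against the scaler output:
#Manual check of the Min-Max formula (column-wise)
manual = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
print(manual) #should match normalized_data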
Input:
Output:
3(h)
Code:
import pandas as pd
import numpy as np
#dictionary
data={
'Name':['Geek1','Geek2','Geek3','Geek4'],
'Salary':[18000,20000,15000,35000]
}
#Create a DataFrame
data = pd.DataFrame(data, columns=['Name','Salary'])
#Add a column with the base-2 logarithm of Salary
data['logarithm_base2'] = np.log2(data['Salary'])
#Show the DataFrame
print(data)
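Note: np.log2 returns -inf for 0 and NaN for negative values, so it only suits strictly positive columns. np.log1p (log of 1 + x) is a common safer variant when zeros can occur:
#Safer log transform when zeros are possible (sketch)
data['log1p_salary'] = np.log1p(data['Salary'])
print(data)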
Input:
Output:
3(i)
Code:
import pandas as pd
import numpy as np
#sample dataset
data = [50,60,70,80,90,100]
#convert to Pandas DataFrame
df = pd.DataFrame(data,columns=['Values'])
#compute mean and standard deviation
mean = df['Values'].mean()
std_dev = df['Values'].std()
#Apply Z-score normalization
df['Z-Score'] = (df['Values']-mean)/std_dev
#display the results
print(df)
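Note: the same transform is available as scikit-learn's StandardScaler; it uses the population standard deviation (ddof=0), so its output differs slightly from the pandas sample-std (ddof=1) version above:
#Z-score via scikit-learn (sketch)
from sklearn.preprocessing import StandardScaler
df['Z-Score (sklearn)'] = StandardScaler().fit_transform(df[['Values']]).ravel()
print(df)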
Input:
Output:
4. Naïve Bayes Classification
Code:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#Load the weather dataset (read_csv already returns a DataFrame)
df = pd.read_csv("weather.csv")
#Encode categorical features as integers
label_enc = LabelEncoder()
df['Outlook'] = label_enc.fit_transform(df['Outlook'])         #e.g. 'Sunny','Overcast','Rain'
df['Temperature'] = label_enc.fit_transform(df['Temperature']) #e.g. 'Hot','Mild','Cool'
df['Humidity'] = label_enc.fit_transform(df['Humidity'])       #e.g. 'High','Normal'
df['Wind'] = label_enc.fit_transform(df['Wind'])               #e.g. 'Weak','Strong'
#Splitting features and target
X=df[['Outlook','Temperature','Humidity','Wind']]
y=df['PlayTennis']
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#Train Naive Bayes Classifier
model=GaussianNB()
model.fit(X_train,y_train)
#Predictions
y_pred=model.predict(X_test)
#Evaluate Model
print("Accuracy:",accuracy_score(y_test,y_pred))
print("Confusion Matrix:\n",confusion_matrix(y_test,y_pred))
print("Classification Report:\n",classification_report(y_test,y_pred))
Dataset:
Input:
Output:
5. EM Model (Gaussian Mixture)
Code:
#EM-Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score
#Load dataset
df = pd.read_csv("student_data.csv")
#Extract features (Math Score, Science Score)
X = df[["Math_Score","Science_Score"]].values
y_true = df["Category"].values #True labels(0 or 1)
#Standardize data for better clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
#Apply Gaussian Mixture Model(EM Algorithm)
gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
gmm.fit(X_scaled)
y_pred=gmm.predict(X_scaled) #Predicted clusters
#Adjust cluster labels to match true labels
if np.mean(y_pred[y_true==1]) < np.mean(y_pred[y_true==0]):
    y_pred = 1 - y_pred #swap labels if necessary
#Compute Accuracy & Confusion Matrix
accuracy = accuracy_score(y_true,y_pred)
conf_matrix = confusion_matrix(y_true,y_pred)
print("Accuracy:",accuracy)
print("Confusion Matrix:\n",conf_matrix)
#Plot the clusters
plt.figure(figsize=(8,6))
plt.scatter(X[:,0],X[:,1],c=y_pred,cmap='coolwarm',edgecolors='k',s=100)
plt.xlabel("Math Score")
plt.ylabel("Science Score")
plt.title("Student Clusters using EM(GMM)")
plt.colorbar(label="Cluster Label")
plt.show()
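Note: n_components=2 was fixed in advance. A sketch that scores candidate component counts with the Bayesian Information Criterion (lower BIC is better), reusing X_scaled from above:
#Model selection for the GMM via BIC (sketch)
for k in range(1, 6):
    gm = GaussianMixture(n_components=k, covariance_type='full', random_state=42).fit(X_scaled)
    print(f"k={k}: BIC = {gm.bic(X_scaled):.1f}")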
Dataset:
Input:
Output: