Ex No: 7a BUILD LINEAR REGRESSION MODELS
Date:
Program:
import pandas as pd
import statsmodels.api as sm
data = pd.read_csv("pima_diabetes.csv")
#create correlation matrix
print(data.corr())
#Bivariate Analysis of Glucose-Insulin features
#define response variable 1
y1 = data['Glucose']
#define explanatory variable 1
x1 = data[['Insulin']]
#add constant to predictor variables
x1 = sm.add_constant(x1)
#fit linear regression model
model1 = sm.OLS(y1, x1).fit()
#view model summary
print(model1.summary())
#Bivariate Analysis of Age-Pregnancies features
#define response variable 2
y2 = data['Age']
#define explanatory variable 2
x2 = data[['Pregnancies']]
#add constant to predictor variables
x2 = sm.add_constant(x2)
#fit linear regression model
model2 = sm.OLS(y2, x2).fit()
#view model summary
print(model2.summary())
#Bivariate Analysis of SkinThickness-BMI features
#define response variable 3
y3 = data['SkinThickness']
#define explanatory variable 3
x3 = data[['BMI']]
#add constant to predictor variables
x3 = sm.add_constant(x3)
#fit linear regression model
model3 = sm.OLS(y3, x3).fit()
#view model summary
print(model3.summary())
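A quick visual check of one of the fits can be added if desired; the following sketch (not part of the recorded output below) assumes matplotlib is installed and plots the Glucose-Insulin fit from model1.
import matplotlib.pyplot as plt
#scatter the raw points and overlay the fitted regression line
plt.scatter(data['Insulin'], data['Glucose'], alpha=0.5)
plt.plot(data['Insulin'], model1.fittedvalues, color='red')
plt.xlabel('Insulin')
plt.ylabel('Glucose')
plt.show()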
Output:
a. Correlation Matrix
b. Bivariate Analysis of Glucose-Insulin features
c. Bivariate Analysis of Age-Pregnancies features
d. Bivariate Analysis of SkinThickness-BMI features
Result:
Ex No: 7b BUILD LOGISTIC REGRESSION MODELS
Date:
Program:
# importing libraries
import statsmodels.api as sm
import pandas as pd
# loading the training dataset
data = pd.read_csv('pima_diabetes.csv', index_col = 0)
# defining the dependent and independent variables
Xtrain = data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
               'DiabetesPedigreeFunction', 'Age']]
ytrain = data[['Outcome']]
# building the model and fitting the data
log_reg = sm.Logit(ytrain, Xtrain).fit()
# printing the summary table
print(log_reg.summary())
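Note that sm.Logit does not add an intercept automatically, so the model above is fit without a constant term. If an intercept is wanted, a minimal variant is:
#variant with an intercept term (sm.Logit does not add one by itself)
Xtrain_const = sm.add_constant(Xtrain)
log_reg_const = sm.Logit(ytrain, Xtrain_const).fit()
print(log_reg_const.summary())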
Output:
Result:
Ex No: 7c BUILD DECISION TREES
Date:
Program:
import pandas
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
df = pandas.read_csv("data.csv")
print("Input:")
print(df.head(5))
d = {'UK':0,'USA':1,'N':2}
df['Nationality'] = df['Nationality'].map(d)
d = {'YES':1, 'NO':0}
df['Go'] = df['Go'].map(d)
print("Transformed Data:")
print(df.head(5))
features = ['Age','Experience','Rank','Nationality']
X = df[features]
y = df['Go']
dtree = DecisionTreeClassifier()
dtree = dtree.fit(X,y)
print(dtree.predict([[40,10,6,1]]))
print("[1]means 'Go'")
print("[0]means 'NO'")
DATA SET: (data.csv)
Age Experience Rank Nationality Go
36 10 9 UK NO
42 12 4 USA NO
23 4 6 N NO
52 4 4 USA NO
43 21 8 USA YES
Output:
Result:
Ex No: 7d BUILD RANDOM FORESTS
Date:
Program:
# Pandas is used for data manipulation
import pandas as pd
# Read in data and display first 5 rows
features = pd.read_csv('temps.csv')
features.head(5)
print('The shape of our features is:', features.shape)
# Descriptive statistics for each column
features.describe()
# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(features)
# Display the first 5 rows of the one-hot encoded columns
features.iloc[:,5:].head(5)
import numpy as np
# Labels are the values we want to predict
labels = np.array(features['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size = 0.25, random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
# Train the model on training data
rf_small.fit(train_features, train_labels)
# Extract the small tree
tree_small = rf_small.estimators_[5]
# Save the tree as a png image (export_graphviz and pydot must be imported)
from sklearn.tree import export_graphviz
import pydot
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list,
                rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png')
# Use the forest's predict method on the test data
predictions = rf_small.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
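To see which inputs drive the forest's predictions, the feature importances of the fitted model can be listed; a minimal sketch using rf_small from above:
#rank the five most important features of the fitted forest
importances = rf_small.feature_importances_
for name, score in sorted(zip(feature_list, importances), key=lambda p: p[1], reverse=True)[:5]:
    print(name, round(score, 2))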
Output:
The shape of our features is: (348, 12)
Training Features Shape: (261, 17)
Training Labels Shape: (261,)
Testing Features Shape: (87, 17)
Testing Labels Shape: (87,)
RandomForestRegressor(max_depth=3, n_estimators=10)
Mean Absolute Error: 4.0 degrees.
Accuracy: 93.73 %.
Result:
Ex No: 7e BUILD SVM MODELS
Date:
Program:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
data = pandas.read_csv("vector.csv")
print("Input: ")
print(data.head(10))
training_set, test_set = train_test_split(data, test_size = 0.3, random_state=1)
x_train = training_set.iloc[:,0:2].values
y_train = training_set.iloc[:,2].values
x_test = test_set.iloc[:,0:2].values
y_test = test_set.iloc[:,2].values
classifier = SVC(kernel='linear', random_state=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
test_set["prediction"] = y_pred
print("Output")
print(test_set)
cm = confusion_matrix(y_test, y_pred)
accuracy = float(cm.diagonal().sum()/len(y_test))
print("\nAccuracy of SVM for the given dataset: ", accuracy)
Dataset:
Output:
Result:
Ex No: 8 IMPLEMENT ENSEMBLING TECHNIQUES
Date:
Program:
#Implement VotingClassifier
#Importing necessary libraries:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
#Creating dataset:
X, y = make_moons(n_samples=500, noise=0.30)
X_train, X_test, y_train, y_test = train_test_split(X, y)
#Initializing the models:
log = LogisticRegression()
rnd = RandomForestClassifier(n_estimators=100)
svm = SVC()
voting = VotingClassifier(
    estimators=[('logistic_regression', log), ('random_forest', rnd),
                ('support_vector_machine', svm)],
    voting='hard')
#Fitting training data:
voting.fit(X_train, y_train)
#prediction using test data
for clf in (log, rnd, svm, voting):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
#Implement BaggingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=250,
    max_samples=100, bootstrap=True, random_state=101)
#Fitting training data:
bagging_clf.fit(X_train, y_train)
#prediction using test data
y_pred = bagging_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
#Implement AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
adaboost_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
#Fitting training data:
adaboost_clf.fit(X_train, y_train)
#prediction using test data
y_pred = adaboost_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
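Soft voting averages the predicted class probabilities instead of counting hard votes; a minimal variant of the ensemble above is sketched below (SVC needs probability=True so that it exposes predict_proba):
#soft-voting variant of the VotingClassifier above
voting_soft = VotingClassifier(
    estimators=[('logistic_regression', LogisticRegression()),
                ('random_forest', RandomForestClassifier(n_estimators=100)),
                ('support_vector_machine', SVC(probability=True))],
    voting='soft')
voting_soft.fit(X_train, y_train)
print(accuracy_score(y_test, voting_soft.predict(X_test)))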
Output:
#For VotingClassifier
LogisticRegression 0.848
RandomForestClassifier 0.88
SVC 0.896
VotingClassifier 0.896
#For BaggingClassifier
0.888
#For AdaBoostClassifier
0.864
Result:
Ex No: 9 IMPLEMENT CLUSTERING ALGORITHMS
Date:
Program:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
data = {'x': [25,34,22,27,33,33,31,22,35,34,67,54,57,43,50,57,59,52,65,47,
              49,48,35,33,44,45,38,43,51,46],
        'y': [79,51,53,78,59,74,73,57,69,75,51,32,40,47,53,36,35,58,59,50,
              25,20,14,12,20,5,29,27,8,7]}
df = pd.DataFrame(data, columns=['x', 'y'])
kmeans = KMeans(n_clusters=3).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)
plt.scatter(df['x'], df['y'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
plt.show()
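The choice of n_clusters=3 can be sanity-checked with the elbow method; a minimal sketch on the same df:
#plot inertia against k; the "elbow" suggests a reasonable cluster count
inertias = [KMeans(n_clusters=k, n_init=10).fit(df).inertia_ for k in range(1, 10)]
plt.plot(range(1, 10), inertias, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('inertia')
plt.show()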
Output:
Result:
Ex No: 10 IMPLEMENT GMM ALGORITHMS
Date:
Program:
import matplotlib.pyplot as plt
from sklearn import datasets
import sklearn.metrics as sm
import pandas as pd
import numpy as np
%matplotlib inline
# import some data to play with
iris = datasets.load_iris()
#print("\n IRIS DATA :",iris.data);
#print("\n IRIS FEATURES :\n",iris.feature_names)
#print("\n IRIS TARGET :\n",iris.target)
#print("\n IRIS TARGET NAMES:\n",iris.target_names)
# Store the inputs as a Pandas Dataframe and set the column names
X = pd.DataFrame(iris.data)
#print(X)
X.columns = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
#print(X.columns)
#print("X:",x)
#print("Y:",y)
y = pd.DataFrame(iris.target)
y.columns = ['Targets']
# Set the size of the plot
plt.figure(figsize=(14,7))
# Create a colormap
colormap = np.array(['red', 'lime', 'black'])
# Plot Sepal
plt.subplot(1, 2, 1)
plt.scatter(X.Sepal_Length,X.Sepal_Width, c=colormap[y.Targets], s=40)
plt.title('Sepal')
plt.subplot(1, 2, 2)
plt.scatter(X.Petal_Length,X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Petal')
# GMM
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns = X.columns)
xs.sample(5)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm = gmm.predict(xs)
y_cluster_gmm
plt.subplot(1, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')
# Accuracy
print("Accuracy:", sm.accuracy_score(y, y_cluster_gmm))
# Confusion Matrix
sm.confusion_matrix(y, y_cluster_gmm)
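Because GMM cluster labels are arbitrary (the confusion matrix below shows clusters 1 and 2 swapped relative to the targets), raw accuracy can understate the fit. A minimal sketch that remaps each cluster to its majority class before scoring:
#remap each GMM cluster to its majority true class, then recompute accuracy
y_true = y['Targets'].values
mapped = np.zeros_like(y_cluster_gmm)
for c in np.unique(y_cluster_gmm):
    mask = (y_cluster_gmm == c)
    mapped[mask] = np.bincount(y_true[mask]).argmax()
print("Remapped accuracy:", sm.accuracy_score(y_true, mapped))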
Output:
array([[50, 0, 0],
[ 0, 5, 45],
[ 0, 50, 0]], dtype=int64)
Result: