In [1]:
# Loading the dataset
# Importing the required libraries: scikit-learn's datasets package and pandas
from sklearn import datasets
import pandas as pd
# importing random forest classifier from ensemble module
from sklearn.ensemble import RandomForestClassifier
# Loading the iris plants dataset (classification)
iris = datasets.load_iris()
In [2]:
print(iris.target_names) # Dependent Variable
['setosa' 'versicolor' 'virginica']
In [3]:
print(iris.feature_names) # Independent features or columns
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
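The load_iris() call returns a Bunch object that also exposes the raw NumPy arrays. A quick optional check (not in the original notebook) confirms 150 samples with 4 features each:

In [ ]:
# Optional sketch: inspect the shapes of the raw arrays
print(iris.data.shape)    # (150, 4) - feature matrix
print(iris.target.shape)  # (150,)   - integer class labels 0/1/2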
In [4]:
# dataset holds all the independent columns from iris.data, converted to a pandas DataFrame
dataset = pd.DataFrame(iris.data)
In [5]:
# printing the top 5 rows of the iris dataset
print(dataset.head())
0 1 2 3
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
In [6]:
# Create a new column named 'species' in the DataFrame. Its values come from iris.target,
# which encodes setosa, versicolor and virginica as 0, 1 and 2 respectively.
dataset['species'] = iris.target
In [7]:
# Assigning names to the respective columns
dataset.columns =['sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species']
# displaying the DataFrame
print(dataset)
sepallength sepalwidth petallength petalwidth species
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
.. ... ... ... ... ...
145 6.7 3.0 5.2 2.3 2
146 6.3 2.5 5.0 1.9 2
147 6.5 3.0 5.2 2.0 2
148 6.2 3.4 5.4 2.3 2
149 5.9 3.0 5.1 1.8 2
[150 rows x 5 columns]
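Before modelling, it is worth confirming the class balance. This optional check (not in the original notebook) shows the dataset contains 50 samples of each species:

In [ ]:
# Optional sketch: class distribution and the code-to-name mapping
print(dataset['species'].value_counts())
print(dict(enumerate(iris.target_names)))  # {0: 'setosa', 1: 'versicolor', 2: 'virginica'}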
In [8]:
# Splitting arrays or matrices into random train and test subsets
from sklearn.model_selection import train_test_split
X = dataset.iloc[:, : -1]
y = dataset.iloc[:, -1]
# i.e. 70% training data and 30% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 8)
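Because the split is random, the class proportions in the two subsets can drift. An optional variant (assuming you want the 50/50/50 class ratio preserved) passes stratify=y:

In [ ]:
# Optional sketch: a stratified split keeps class proportions equal in both subsets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.30, random_state=8, stratify=y)
print(y_test_s.value_counts())  # roughly 15 samples per class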
In [9]:
from sklearn.ensemble import RandomForestClassifier
# Create a Random Forest Classifier
# n_estimators : int, default=100 : The number of trees in the forest.
# criterion : {"gini", "entropy"}, default="gini" : the function to measure split quality
clf = RandomForestClassifier(n_estimators=100) # 100 trees
# Train the model using the training set
clf.fit(X_train,y_train)
# Prediction on test set
y_pred=clf.predict(X_test)
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model accuracy: how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.9111111111111111
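Accuracy alone hides per-class behaviour. An optional follow-up (not in the original notebook) prints the confusion matrix and per-class precision and recall for the same predictions:

In [ ]:
# Optional sketch: per-class evaluation metrics
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))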
In [10]:
# Predicting the flower type for a new, unseen sample
clf.predict([[3, 3, 2, 2]])
Out[10]:
array([0])
In [11]:
# This implies the flower is setosa, the first of the three species in our dataset:
# setosa, versicolor, and virginica.
In [12]:
clf.predict([[3, 5, 5, 2]])
# Here, array([2]) indicates the flower type Virginica.
Out[12]:
array([2])
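The integer code can be mapped back to a species name using iris.target_names. This optional one-liner is not in the original notebook:

In [ ]:
# Optional sketch: turn the predicted integer code back into a species name
pred = clf.predict([[3, 5, 5, 2]])
print(iris.target_names[pred])  # e.g. ['virginica']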
In [13]:
# Next, we identify the most important features in the iris dataset for feature selection.
In [14]:
# importing random forest classifier from ensemble module
from sklearn.ensemble import RandomForestClassifier
# Create a Random forest Classifier
clf = RandomForestClassifier(n_estimators = 100)
# Train the model using the training sets
clf.fit(X_train, y_train)
Out[14]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
In [15]:
# Ranking the features with the feature_importances_ attribute
feature_imp = pd.Series(clf.feature_importances_, index = iris.feature_names).sort_values(
ascending = False)
feature_imp
Out[15]:
petal width (cm) 0.519967
petal length (cm) 0.349479
sepal length (cm) 0.103166
sepal width (cm) 0.027388
dtype: float64
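The scores are easier to compare visually. A minimal matplotlib sketch (matplotlib is also imported later in the notebook):

In [ ]:
# Optional sketch: visualize the importance scores as a horizontal bar chart
import matplotlib.pyplot as plt
feature_imp.sort_values().plot(kind='barh')
plt.xlabel('Importance score')
plt.title('Random forest feature importances')
plt.show()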
In [16]:
# Generating the Model on Selected Features
# Here, we can remove the "sepal width" feature because it has very low importance,
# and select the 3 remaining features.
# Import train_test_split function
from sklearn.model_selection import train_test_split
# Split dataset into features and labels
X=dataset[['petallength', 'petalwidth','sepallength']]
# Removed feature "sepal width"
y=dataset['species']
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)
In [17]:
from sklearn.ensemble import RandomForestClassifier
# Create Random Forest Classifier
# n_estimators : int, default=100 : the number of trees in the forest
clf=RandomForestClassifier(n_estimators=100)
# Train the model using the training set
clf.fit(X_train,y_train)
# Prediction on test set
y_pred=clf.predict(X_test)
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model accuracy: how often is the classifier correct?
print("Accuracy: ",metrics.accuracy_score(y_test, y_pred))
Accuracy: 0.9333333333333333
In [18]:
# After removing the least important feature (sepal width), the accuracy increased on this split,
# since discarding noisy or misleading features can help the model generalize.
# Fewer features also reduce the training time.
# (See the cross-validation sketch below for a more robust comparison.)
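Note that the two accuracy figures come from different random splits (random_state=8 versus random_state=5), so a single holdout comparison is noisy. An optional sanity check (not in the original notebook) averages accuracy over several folds:

In [ ]:
# Optional sketch: 5-fold cross-validation gives a more stable accuracy estimate
from sklearn.model_selection import cross_val_score
scores = cross_val_score(RandomForestClassifier(n_estimators=100), X, y, cv=5)
print(scores.mean(), scores.std())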
In [19]:
# The individual trees are stored in clf.estimators_, indexed 0 to 99;
# the first tree is estimators_[0]
clf.estimators_[0]
Out[19]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=1364519456, splitter='best')
In [20]:
# Plot first decision tree
from sklearn import tree
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(25,20))
a = tree.plot_tree(clf.estimators_[0], feature_names = X.columns, filled=True)
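If the rendered figure is too dense to read, an optional text dump of the same tree (not in the original notebook) can be handy:

In [ ]:
# Optional sketch: print the first tree's split rules as plain text
from sklearn.tree import export_text
print(export_text(clf.estimators_[0], feature_names=list(X.columns)))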
In [21]:
clf.estimators_[1]
Out[21]:
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=9010534, splitter='best')
In [22]:
# Plot second decision tree
from sklearn import tree
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(25,20))
a = tree.plot_tree(clf.estimators_[1], feature_names = X.columns, filled=True)