# # Data Preprocessing
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import files
uploaded = files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving Salary_Data.csv to Salary_Data.csv
# Importing the dataset
# Read the uploaded salary CSV from the working directory into a DataFrame.
dataset = pd.read_csv('Salary_Data.csv')
# Bare expression: shown as the cell's output when run in a notebook.
dataset
YearsExperience Salary
0 1.1 39343.0
1 1.3 46205.0
2 1.5 37731.0
3 2.0 43525.0
4 2.2 39891.0
5 2.9 56642.0
6 3.0 60150.0
7 3.2 54445.0
8 3.2 64445.0
9 3.7 57189.0
10 3.9 63218.0
11 4.0 55794.0
12 4.0 56957.0
13 4.1 57081.0
14 4.5 61111.0
15 4.9 67938.0
16 5.1 66029.0
17 5.3 83088.0
18 5.9 81363.0
19 6.0 93940.0
20 6.8 91738.0
21 7.1 98273.0
22 7.9 101302.0
23 8.2 113812.0
24 8.7 109431.0
25 9.0 105582.0
26 9.5 116969.0
27 9.6 112635.0
28 10.3 122391.0
29 10.5 121872.0
dataset.describe()
YearsExperience Salary
count 30.000000 30.000000
mean 5.313333 76003.000000
std 2.837888 27414.429785
min 1.100000 37731.000000
25% 3.200000 56720.750000
50% 4.700000 65237.000000
75% 7.700000 100544.750000
max 10.500000 122391.000000
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# Importing the dataset
# dataset = pd.read_csv('/content/drive/My Drive/ATAL/Salary_Data.csv')
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-6-242e04d314aa> in <module>()
1 # Importing the dataset
----> 2 dataset = pd.read_csv('/content/drive/My Drive/ATAL/Salary_Data.csv')
4 frames
/usr/local/lib/python3.6/dist-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
2008 kwds["usecols"] = self.usecols
2009
-> 2010 self._reader = parsers.TextReader(src, **kwds)
2011 self.unnamed_cols = self._reader.unnamed_cols
2012
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()
FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/ATAL/Salary_Data.csv'
SEARCH STACK OVERFLOW
print(dataset)
YearsExperience Salary
0 1.1 39343.0
1 1.3 46205.0
2 1.5 37731.0
3 2.0 43525.0
4 2.2 39891.0
5 2.9 56642.0
6 3.0 60150.0
7 3.2 54445.0
8 3.2 64445.0
9 3.7 57189.0
10 3.9 63218.0
11 4.0 55794.0
12 4.0 56957.0
13 4.1 57081.0
14 4.5 61111.0
15 4.9 67938.0
16 5.1 66029.0
17 5.3 83088.0
18 5.9 81363.0
19 6.0 93940.0
20 6.8 91738.0
21 7.1 98273.0
22 7.9 101302.0
23 8.2 113812.0
24 8.7 109431.0
25 9.0 105582.0
26 9.5 116969.0
27 9.6 112635.0
28 10.3 122391.0
29 10.5 121872.0
dataset.shape
(30, 2)
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 YearsExperience 30 non-null float64
1 Salary 30 non-null float64
dtypes: float64(2)
memory usage: 608.0 bytes
# Separate the model inputs from the prediction target.
# Independent variable(s): every column except the last, kept two-dimensional.
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.values
# Dependent variable: the column at position 1, as a one-dimensional array.
target_column = dataset.iloc[:, 1]
y = target_column.values
print(X)
[[ 1.1]
[ 1.3]
[ 1.5]
[ 2. ]
[ 2.2]
[ 2.9]
[ 3. ]
[ 3.2]
[ 3.2]
[ 3.7]
[ 3.9]
[ 4. ]
[ 4. ]
[ 4.1]
[ 4.5]
[ 4.9]
[ 5.1]
[ 5.3]
[ 5.9]
[ 6. ]
[ 6.8]
[ 7.1]
[ 7.9]
[ 8.2]
[ 8.7]
[ 9. ]
[ 9.5]
[ 9.6]
[10.3]
[10.5]]
print(y)
[ 39343 46205 37731 43525 39891 56642 60150 54445 64445 57189
63218 55794 56957 57081 61111 67938 66029 83088 81363 93940
91738 98273 101302 113812 109431 105582 116969 112635 122391 121872]
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed random_state keeps
# the shuffle reproducible between runs.
split_parts = train_test_split(X, y, test_size=1/3, random_state=0)
X_train, X_test, y_train, y_test = split_parts
print(X_train)
[[ 2.9]
[ 5.1]
[ 3.2]
[ 4.5]
[ 8.2]
[ 6.8]
[ 1.3]
[10.5]
[ 3. ]
[ 2.2]
[ 5.9]
[ 6. ]
[ 3.7]
[ 3.2]
[ 9. ]
[ 2. ]
[ 1.1]
[ 7.1]
[ 4.9]
[ 4. ]]
print(X_test)
[[ 1.5]
[10.3]
[ 4.1]
[ 3.9]
[ 9.5]
[ 8.7]
[ 9.6]
[ 4. ]
[ 5.3]
[ 7.9]]
print(y_test)
[ 37731 122391 57081 63218 116969 109431 112635 55794 83088 101302]
print(y_train)
[ 56642. 66029. 64445. 61111. 113812. 91738. 46205. 121872. 60150.
39891. 81363. 93940. 57189. 54445. 105582. 43525. 39343. 98273.
67938. 56957.]
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
# Create the estimator with its default settings.
regressor = LinearRegression()
# Fit on the training split only; X_test / y_test are not used here.
regressor.fit(X_train, y_train)
▾ LinearRegression
LinearRegression()
y_pred = regressor.predict(X_test)
#print("%2.f"%(y_pred))
print(y_pred)
[ 40835.10590871 123079.39940819 65134.55626083 63265.36777221
115602.64545369 108125.8914992 116537.23969801 64199.96201652
76349.68719258 100649.1375447 ]
# Visualising the Training set results
# Fitted salaries for the training inputs; these points define the drawn line.
train_fit = regressor.predict(X_train)
plt.scatter(X_train, y_train, color='red')   # observed training points
plt.plot(X_train, train_fit, color='blue')   # fitted regression line
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.title('Salary vs Experience (Training set)')
plt.show()
# Visualising the Test set results
# The scatter shows the held-out points; the line is drawn from the model's
# predictions on the training inputs.
line_x = X_train
line_y = regressor.predict(line_x)
plt.scatter(X_test, y_test, color='red')
plt.plot(line_x, line_y, color='blue')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.title('Salary vs Experience (Test set)')
plt.show()
# Visualising the Test set results
# Same scatter of held-out points, with the line drawn through the
# predictions already computed for the test inputs (y_pred).
observed_x, observed_y = X_test, y_test
plt.scatter(observed_x, observed_y, color='red')
plt.plot(observed_x, y_pred, color='blue')
plt.xlabel('Years of Experience')
plt.ylabel('Salary')
plt.title('Salary vs Experience (Test set)')
plt.show()
# Report the fitted line's parameters, rounded to whole numbers.
slope = regressor.coef_[0]        # single feature, so one coefficient
intercept = regressor.intercept_
print("Regressor slope: %2.f " % slope)
print("Regressor intercept:%2.f " % intercept)
Regressor slope: 9346
Regressor intercept:26816
# Predict the salary for one hand-picked experience value.
YearsExperience = 10
# The input is passed as a nested list (one row, one feature). predict()
# returns an array with one entry per row, so take element 0 to format a
# plain scalar. %-formatting the one-element array directly relies on
# implicit array-to-scalar conversion, which NumPy has deprecated.
predicted_salary = regressor.predict([[YearsExperience]])[0]
print("Salary for given Years of Experience is : %.f" % predicted_salary)
Salary for given Years of Experience is : 120276
from sklearn import metrics
# Mean absolute error between the held-out salaries and the predictions.
mae = metrics.mean_absolute_error(y_test, y_pred)
print("MAE %2.f" % mae)
MAE 3426
from sklearn import metrics
# RMSE is the square root of the mean *squared* error. The square root of
# the mean absolute error is not an RMSE, so mean_squared_error must be used.
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RMSE %2.f" % rmse)
RMSE 59
# Model score on each split, as returned by the estimator's score() method.
train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)
print('Train Score: %f' % train_score)
print('Test Score: %f' % test_score)
Train Score: 0.938190
Test Score: 0.974915
Colab paid products - Cancel contracts here