-
-
Notifications
You must be signed in to change notification settings - Fork 26.3k
Closed
Description
Hi, im trying to calibrate logistic regression classifier and i get the error ValueError: could not convert string to float: 'OLIFE',
I did onehotencode my categorical values using pipeline, it works fine when i test my model but when i calibrate it doesnt work even if im passing the pipeline model in to CalibratedClassifierCV, iget an error when i execute the calib_clf.fit(Valid, labelValid) kindly assist please as im new to machine learning
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
#Importing Visualization module
import matplotlib.pyplot as plt
# Model Building and Evaluation modules
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score
# Importing calibration modules
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
# Importing the train, Valid(Calibration) & test data
trainData = pd.read_csv("./trainNew.csv")
ValidData = pd.read_csv("./ValidData.csv")
testData = pd.read_csv("./test.csv")
labelTrain = trainData['Status']
labelTest = testData['Status']
labelValid = ValidData['Status']
test = testData.drop(['POLICY_NO','CANCEL_CODE','Status','HIV_TEST_DATE','POLICY_HOLDER_ID'], axis = 1)
Valid = ValidData.drop(['POLICY_NO','CANCEL_CODE','Status','HIV_TEST_DATE','POLICY_HOLDER_ID'], axis = 1)
train = trainData.drop(['POLICY_NO','CANCEL_CODE','Status','HIV_TEST_DATE','POLICY_HOLDER_ID'], axis = 1)
# check missing values in train data
train.isnull().sum()
prof = ProfileReport(data)
prof.to_file(output_file='output.html')
# check column names
train.columns
# Checking data dimension'
train.shape
# Imputing missing values, Encoding Categorical Variables & Standardizing the data
column_trans = make_column_transformer(
(OneHotEncoder(), ['PRODUCT_LINE_ID','SMOKING_STATUS','gender','Cover_Type']),
remainder = StandardScaler()
)
column_trans.fit_transform(train)
# Create a pipeline that scales the data then trains a classifier
logreg = LogisticRegression()
model_pipeline = make_pipeline(column_trans, logreg)
# KFold/StratifiedKFold cross validation with 5 folds (the default)
# applying the classifier pipeline to the features and target data
scores = cross_val_score(model_pipeline, train, labelTrain, cv=5)
scores.mean()
# Fitting the model pipeline
model_pipeline.fit(train,labelTrain)
# Testing the model pipeline on new data/test data
predictions = model_pipeline.predict_proba(test)[:,1]
calib_clf = CalibratedClassifierCV(model_pipeline, method="sigmoid", cv="prefit")
calib_clf.fit(Valid, labelValid)
Metadata
Metadata
Assignees
Labels
No labels