Fix cross validation evaluation #337

Closed · wants to merge 5 commits
74 changes: 45 additions & 29 deletions learning.py
@@ -293,7 +293,7 @@ def sample(self):
# ______________________________________________________________________________


def PluralityLearner(dataset):
def PluralityLearner(dataset, size=None):
"""A very dumb algorithm: always pick the result that was most popular
in the training data. Makes a baseline for comparison."""
most_popular = mode([e[dataset.target] for e in dataset.examples])
@@ -306,14 +306,14 @@ def predict(example):
# ______________________________________________________________________________


def NaiveBayesLearner(dataset, continuous=True):
def NaiveBayesLearner(dataset, size=None, continuous=True):
if(continuous):
return NaiveBayesContinuous(dataset)
return NaiveBayesContinuous(dataset, size)
else:
return NaiveBayesDiscrete(dataset)
return NaiveBayesDiscrete(dataset, size)
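
For illustration (an editor's sketch, not part of this patch): threading size through means every learner factory now shares the calling convention learner(dataset, size, ...) that cross_validation relies on. Assuming the module-level iris dataset is loaded as elsewhere in learning.py:

    # Sketch: both learners are now constructed the same way.
    h1 = NaiveBayesLearner(iris, size=None, continuous=True)
    h2 = DecisionTreeLearner(iris, size=None)
    # Each hypothesis is callable on an example, as err_ratio assumes.
    print(h1(iris.examples[0]), h2(iris.examples[0]))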


def NaiveBayesDiscrete(dataset):
def NaiveBayesDiscrete(dataset, size):
"""Just count how many times each value of each input attribute
occurs, conditional on the target value. Count the different
target values too."""
@@ -341,7 +341,7 @@ def class_probability(targetval):
return predict


def NaiveBayesContinuous(dataset):
def NaiveBayesContinuous(dataset, size):
"""Count how many times each target value occurs.
Also, find the means and deviations of input attribute values for each target value."""
means, deviations = dataset.find_means_and_deviations()
@@ -426,7 +426,7 @@ def __repr__(self):
# ______________________________________________________________________________


def DecisionTreeLearner(dataset):
def DecisionTreeLearner(dataset, size=None):
"""[Figure 18.5]"""

target, values = dataset.target, dataset.values
@@ -905,49 +905,65 @@ def train_and_test(dataset, start, end):
return train, val


def partition(dataset, fold, k):
    num_examples = len(dataset.examples)
    # Integer division keeps the fold boundaries valid as slice indices.
    return train_and_test(dataset, fold * (num_examples // k), (fold + 1) * (num_examples // k))
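
For intuition (an editor's sketch, not part of this patch): partition converts a fold index into the start/end bounds that train_and_test uses to carve out the validation slice. Assuming train_and_test reserves dataset.examples[start:end] for validation and trains on the remainder, a toy stand-in behaves like this:

    class _ToyDataset:
        # Hypothetical stand-in exposing the one attribute partition touches.
        def __init__(self, examples):
            self.examples = examples

    data = _ToyDataset(list(range(10)))       # ten toy "examples"
    train, val = partition(data, fold=2, k=5)
    assert val == [4, 5]                      # fold 2 of 5 holds out examples 4..5
    assert train == [0, 1, 2, 3, 6, 7, 8, 9]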


def cross_validation(learner, size, dataset, k=10, trials=1):
    """Do k-fold cross-validation and return the mean errors.
    That is, hold out 1/k of the examples for validation on each of k runs.
    Shuffle the examples first; if trials > 1, average over several shuffles.
    Returns training error, validation error."""
if k is None:
k = len(dataset.examples)
if trials > 1:
trial_errT = 0
trial_errV = 0

for t in range(trials):
errT, errV = cross_validation(learner, size, dataset,
k=10, trials=1)
errT, errV = cross_validation(learner, size, dataset, k)
trial_errT += errT
trial_errV += errV

return trial_errT / trials, trial_errV / trials
else:
fold_errT = 0
fold_errV = 0
n = len(dataset.examples)

examples = dataset.examples
for fold in range(k):
random.shuffle(dataset.examples)
train_data, val_data = train_and_test(dataset, fold * (n / k),
(fold + 1) * (n / k))
    training_set, validation_set = partition(dataset, fold, k)
    dataset.examples = training_set
    h = learner(dataset, size)

    fold_errT += err_ratio(h, dataset, training_set)
    fold_errV += err_ratio(h, dataset, validation_set)

# Restore the original examples once evaluation is complete
dataset.examples = examples

return fold_errT / k, fold_errV / k
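
A usage sketch (editor's illustration, not part of this patch), assuming the iris dataset ships with the repo as elsewhere in this module:

    # Ten-fold cross-validation of a decision tree; errors are fractions in [0, 1].
    iris = DataSet(name='iris')
    errT, errV = cross_validation(DecisionTreeLearner, None, iris, k=10)
    print('training error {:.3f}, validation error {:.3f}'.format(errT, errV))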


def leave_one_out(learner, dataset, size=None):
"""Leave one out cross-validation over the dataset."""
return cross_validation(learner, size, dataset, k=len(dataset.examples))


def converges(err_val):
    """Check for convergence: err_val must hold at least two values,
    and its last two values must (almost) agree."""
    return len(err_val) >= 2 and isclose(err_val[-2], err_val[-1], rel_tol=1e-6)
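
A quick sanity check of the intended behavior (editor's sketch; the error values are illustrative):

    assert not converges([0.25])            # fewer than two values recorded
    assert not converges([0.25, 0.20])      # validation error still changing
    assert converges([0.20, 0.15, 0.15])    # last two values agree within rel_tol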


def cross_validation_wrapper(learner, dataset, k=10, trials=1):
"""[Fig 18.8]
Return the optimal value of size having minimum error
on validataion set.
on validation set.
err_train: A training error array, indexed by size
err_val: A validation error array, indexed by size
"""
err_val = []
err_train = []
err_val = []

size = 1

while True:
Expand All @@ -963,15 +979,15 @@ def cross_validation_wrapper(learner, dataset, k=10, trials=1):
min_val = err_val[i]
best_size = i
i += 1

err_val.append(errV)
err_train.append(errT)
print(err_val)
size += 1

if converges(err_val):
best_size = size
return learner(dataset, best_size)

def leave_one_out(learner, dataset, size=None):
"""Leave one out cross-validation over the dataset."""
return cross_validation(learner, size, dataset, k=len(dataset.examples))
size += 1
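
A sketch of driving the wrapper (editor's illustration, not part of this patch), assuming a learner whose capacity grows with size:

    # Model selection: grow size until validation error plateaus, then return
    # the hypothesis trained with the best size found along the way.
    iris = DataSet(name='iris')
    best_h = cross_validation_wrapper(DecisionTreeLearner, iris, k=10, trials=1)
    print(best_h(iris.examples[0]))  # the returned hypothesis is callable on an example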


def learningcurve(learner, dataset, trials=10, sizes=None):
Expand Down Expand Up @@ -1096,14 +1112,14 @@ def ContinuousXor(n):
# ______________________________________________________________________________


def compare(algorithms=[PluralityLearner, NaiveBayesLearner,
NearestNeighborLearner, DecisionTreeLearner],
datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20),
Majority(7, 100), Parity(7, 100), Xor(100)],
k=10, trials=1):
def compare(algorithms=[PluralityLearner, NaiveBayesLearner, NearestNeighborLearner,
DecisionTreeLearner],
datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20), Majority(7, 100),
Parity(7, 100), Xor(100)],
k=10, size=3, trials=1):
"""Compare various learners on various datasets using cross-validation.
Print results as a table."""
print_table([[a.__name__.replace('Learner', '')] +
[cross_validation(a, d, k, trials) for d in datasets]
[cross_validation(a, size, d, k, trials) for d in datasets]
for a in algorithms],
header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')
header=[''] + [d.name[0:7] for d in datasets], numfmt='{:.2f}')
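
With size threaded through, a sketch invocation (editor's illustration; the datasets load from aima-data and the printed numbers will vary):

    compare(algorithms=[NaiveBayesLearner, DecisionTreeLearner],
            datasets=[iris, zoo],
            k=10, size=3, trials=1)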
13 changes: 10 additions & 3 deletions tests/test_learning.py
@@ -1,4 +1,3 @@

import pytest
import math
from utils import DataFile
@@ -7,7 +6,6 @@
rms_error, manhattan_distance, mean_boolean_error, mean_error)



def test_euclidean():
distance = euclidean_distance([1, 2], [3, 4])
assert round(distance, 2) == 2.83
@@ -149,6 +147,16 @@ def test_perceptron():
assert err_ratio(perceptron, iris) < 0.4


def test_train_and_test():
dataset = DataSet(name="iris")
start = 50
end = 100

train_set, validation_set = train_and_test(dataset, start, end)

assert len(train_set) == 100
assert len(validation_set) == 50

def test_random_weights():
min_value = -0.5
max_value = 0.5
@@ -157,4 +165,3 @@ def test_random_weights():
assert len(test_weights) == num_weights
for weight in test_weights:
assert weight >= min_value and weight <= max_value

14 changes: 13 additions & 1 deletion utils.py
@@ -317,6 +317,18 @@ def issequence(x):
return isinstance(x, collections.abc.Sequence)


def format_table_value(value, numfmt):
    """Apply numfmt to a number; recurse into tuples; pass anything else through."""
    if isnumber(value):
        return numfmt.format(value)
    if isinstance(value, tuple):
        return tuple(format_table_value(v, numfmt) for v in value)
    return value
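
This helper exists because cross_validation now returns (training error, validation error) pairs, which the bare numfmt.format call in print_table could not handle. An illustration (editor's sketch, hypothetical values):

    format_table_value(0.12345, '{:.2f}')             # -> '0.12'
    format_table_value((0.12345, 0.54321), '{:.2f}')  # -> ('0.12', '0.54')
    format_table_value('iris', '{:.2f}')              # -> 'iris' (non-numbers pass through)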


def print_table(table, header=None, sep=' ', numfmt='{}'):
"""Print a list of lists as a table, so that columns line up nicely.
header, if specified, will be printed as the first row.
@@ -328,7 +340,7 @@ def print_table(table, header=None, sep=' ', numfmt='{}'):
if header:
table.insert(0, header)

table = [[numfmt.format(x) if isnumber(x) else x for x in row]
table = [[format_table_value(x, numfmt) for x in row]
for row in table]

sizes = list(