from utils4e import (
    removeall, unique, mode, argmax_random_tie, isclose, dotproduct, weighted_sample_with_replacement,
-    num_or_str, normalize, clip, print_table, open_data, probability, random_weights
+    num_or_str, normalize, clip, print_table, open_data, probability, random_weights, euclidean_distance
)

import copy
@@ -382,8 +382,8 @@ def cross_validation(learner, size, dataset, k=10, trials=1):
        examples = dataset.examples
        random.shuffle(dataset.examples)
        for fold in range(k):
-            train_data, val_data = train_test_split(dataset, fold * (n / k),
-                                                    (fold + 1) * (n / k))
+            train_data, val_data = train_test_split(dataset, fold * (n // k),
+                                                    (fold + 1) * (n // k))
            dataset.examples = train_data
            h = learner(dataset, size)
            fold_errs += err_ratio(h, dataset, train_data)
@@ -393,6 +393,37 @@ def cross_validation(learner, size, dataset, k=10, trials=1):
        return fold_errs / k


+def cross_validation_nosize(learner, dataset, k=10, trials=1):
+    """Do k-fold cross-validation and return the mean error.
+    That is, keep out 1/k of the examples for testing on each of k runs.
+    Shuffle the examples first; if trials > 1, average over several shuffles.
+    Returns the mean error ratio over the folds."""
+    k = k or len(dataset.examples)
+    if trials > 1:
+        trial_errs = 0
+        for t in range(trials):
+            errs = cross_validation_nosize(learner, dataset,
+                                           k=k, trials=1)
+            trial_errs += errs
+        return trial_errs / trials
+    else:
+        fold_errs = 0
+        n = len(dataset.examples)
+        examples = dataset.examples
+        random.shuffle(dataset.examples)
+        for fold in range(k):
+            train_data, val_data = train_test_split(dataset, fold * (n // k),
+                                                    (fold + 1) * (n // k))
+            dataset.examples = train_data
+            h = learner(dataset)
+            fold_errs += err_ratio(h, dataset, train_data)
+
+            # Reverting back to original once test is completed
+            dataset.examples = examples
+        return fold_errs / k
+
+
+
def err_ratio(predict, dataset, examples=None, verbose=0):
    """Return the proportion of the examples that are NOT correctly predicted.
    verbose - 0: No output; 1: Output wrong; 2 (or greater): Output correct"""
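
For context, a minimal usage sketch of the new helper (not part of the commit). It assumes this file is importable as learning4e and that the repository's DataSet loader and NearestNeighborLearner are available; adjust the import to wherever the module actually lives.

# Hypothetical usage sketch; names assume the aima-python layout.
from learning4e import DataSet, NearestNeighborLearner, cross_validation_nosize

iris = DataSet(name='iris')  # reads aima-data/iris.csv
mean_err = cross_validation_nosize(NearestNeighborLearner, iris, k=10, trials=1)
print('mean 10-fold error ratio: {0:.2f}'.format(mean_err))
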
@@ -521,6 +552,8 @@ def LinearLearner(dataset, learning_rate=0.01, epochs=100):
        for example in examples:
            x = [1] + example
            y = dotproduct(w, x)
+            # if threshold:
+            #     y = threshold(y)
            t = example[idx_t]
            err.append(t - y)

@@ -554,17 +587,20 @@ def LogisticLinearLeaner(dataset, learning_rate=0.01, epochs=100):

    for epoch in range(epochs):
        err = []
+        h = []
        # Pass over all examples
        for example in examples:
            x = [1] + example
            y = 1 / (1 + math.exp(-dotproduct(w, x)))
-            h = [y * (1 - y)]
+            h.append(y * (1 - y))
            t = example[idx_t]
            err.append(t - y)

        # update weights
        for i in range(len(w)):
-            w[i] = w[i] + learning_rate * (dotproduct(dotproduct(err, h), X_col[i]) / num_examples)
+            buffer = [x * y for x, y in zip(err, h)]
+            # w[i] = w[i] + learning_rate * (dotproduct(err, X_col[i]) / num_examples)
+            w[i] = w[i] + learning_rate * (dotproduct(buffer, X_col[i]) / num_examples)

    def predict(example):
        x = [1] + example
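
The rewritten update is a batch gradient step for a sigmoid unit under squared error: each example's error (t - y) is first scaled by the sigmoid derivative y * (1 - y), and the result is projected onto each feature column. A toy, self-contained illustration of that step; the names err, h, X_col, w mirror the diff, but the numbers and the local dotproduct are invented for the example.

# Toy illustration of the corrected weight update (invented values).
def dotproduct(xs, ys):
    return sum(x * y for x, y in zip(xs, ys))

w = [0.1, -0.2]                          # bias weight and one feature weight
err = [0.3, -0.1, 0.4]                   # t - y for three examples
h = [0.21, 0.25, 0.19]                   # y * (1 - y) for the same examples
X_col = [[1, 1, 1], [0.5, 1.2, -0.7]]    # feature columns, bias column first
learning_rate, num_examples = 0.01, 3

for i in range(len(w)):
    buffer = [e * d for e, d in zip(err, h)]   # elementwise (t - y) * y * (1 - y)
    w[i] += learning_rate * (dotproduct(buffer, X_col[i]) / num_examples)

print(w)
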
@@ -580,6 +616,7 @@ def NearestNeighborLearner(dataset, k=1):
    """k-NearestNeighbor: the k nearest neighbors vote."""
    def predict(example):
        """Find the k closest items, and have them vote for the best."""
+        example.pop(dataset.target)
        best = heapq.nsmallest(k, ((dataset.distance(e, example), e)
                                   for e in dataset.examples))
        return mode(e[dataset.target] for (d, e) in best)
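
The added example.pop(dataset.target) appears intended to drop the target value from the query example before distances are computed, so only feature values enter the comparison (note that list.pop also mutates the caller's example). A self-contained toy version of the k-nearest-neighbor vote, using plain lists rather than the DataSet API; the names and data below are illustrative only.

# Toy k-NN vote: each example is [feature1, feature2, target], target at index 2.
import heapq
from statistics import mode

TARGET = 2   # index of the class label, standing in for dataset.target
examples = [[1.0, 1.0, 'a'], [1.1, 0.9, 'a'], [5.0, 5.2, 'b'], [5.1, 4.9, 'b']]

def distance(e, query):
    # Euclidean distance over the feature columns only
    return sum((x - q) ** 2 for x, q in zip(e[:TARGET], query)) ** 0.5

def predict(query, k=3):
    best = heapq.nsmallest(k, ((distance(e, query), e) for e in examples))
    return mode(e[TARGET] for (d, e) in best)

print(predict([1.05, 0.95]))   # expected: 'a'
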
@@ -829,6 +866,6 @@ def compare(algorithms=None, datasets=None, k=10, trials=1):
                            Majority(7, 100), Parity(7, 100), Xor(100)]  # of datasets

    print_table([[a.__name__.replace('Learner', '')] +
-                 [cross_validation(a, d, k, trials) for d in datasets]
+                 [cross_validation_nosize(a, d, k, trials) for d in datasets]
                  for a in algorithms],
-                header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')
+                header=[''] + [d.name[0:7] for d in datasets], numfmt='{0:.2f}')