diff --git a/learning.ipynb b/learning.ipynb index 78ff4f0e3..d31a708ef 100644 --- a/learning.ipynb +++ b/learning.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": { "collapsed": true, "deletable": true, @@ -36,8 +36,10 @@ "\n", "* Machine Learning Overview\n", "* Datasets\n", + "* Distance Functions\n", "* Plurality Learner\n", "* k-Nearest Neighbours\n", + "* Naive Bayes Learner\n", "* Perceptron\n", "* MNIST Handwritten Digits\n", " * Loading and Visualising\n", @@ -578,6 +580,241 @@ "As you can see \"setosa\" was mapped to 0." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distance Functions\n", + "\n", + "Many algorithms (like the *k-Nearest Neighbors* algorithm) need to compare items and measure how *similar* or *close* they are. For that we have several different functions at our disposal. Below are the functions implemented in the module:\n", + "\n", + "### Manhattan Distance (`manhattan_distance`)\n", + "\n", + "One of the simplest distance functions. It sums the absolute differences between the coordinates/features of two items. To understand how it works, imagine a 2D grid with coordinates *x* and *y*. In that grid we have two items, at the squares positioned at `(1,2)` and `(3,4)`. The differences between their coordinates are `3-1=2` and `4-2=2`. If we sum these up we get `4`. That means to get from `(1,2)` to `(3,4)` we need four moves: two to the right and two up. The function works similarly for n-dimensional grids." ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Manhattan Distance between (1,2) and (3,4) is 4\n" + ] + } + ], + "source": [ + "def manhattan_distance(X, Y):\n", + " return sum([abs(x - y) for x, y in zip(X, Y)])\n", + "\n", + "\n", + "distance = manhattan_distance([1,2], [3,4])\n", + "print(\"Manhattan Distance between (1,2) and (3,4) is\", distance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Euclidean Distance (`euclidean_distance`)\n", + "\n", + "Probably the most popular distance function. It returns the square root of the sum of the squared differences between individual elements of two items." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Euclidean Distance between (1,2) and (3,4) is 2.8284271247461903\n" + ] + } + ], + "source": [ + "def euclidean_distance(X, Y):\n", + " return math.sqrt(sum([(x - y)**2 for x, y in zip(X, Y)]))\n", + "\n", + "\n", + "distance = euclidean_distance([1,2], [3,4])\n", + "print(\"Euclidean Distance between (1,2) and (3,4) is\", distance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hamming Distance (`hamming_distance`)\n", + "\n", + "This function counts the number of positions at which the corresponding elements of two items differ. For example, if we have two binary strings \"111\" and \"011\" the function will return 1, since the two strings only differ at the first element. The function works the same way for non-binary strings too."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hamming Distance between 'abc' and 'abb' is 1\n" + ] + } + ], + "source": [ + "def hamming_distance(X, Y):\n", + " return sum(x != y for x, y in zip(X, Y))\n", + "\n", + "\n", + "distance = hamming_distance(['a','b','c'], ['a','b','b'])\n", + "print(\"Hamming Distance between 'abc' and 'abb' is\", distance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mean Boolean Error (`mean_boolean_error`)\n", + "\n", + "To calculate this distance, we find the ratio of different elements over all elements of two items. For example, if the two items are `(1,2,3)` and `(1,4,5)`, the ratio of different to total elements is 2/3, since they differ in two out of three elements." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Boolean Error Distance between (1,2,3) and (1,4,5) is 0.6666666666666666\n" + ] + } + ], + "source": [ + "def mean_boolean_error(X, Y):\n", + " return mean(int(x != y) for x, y in zip(X, Y))\n", + "\n", + "\n", + "distance = mean_boolean_error([1,2,3], [1,4,5])\n", + "print(\"Mean Boolean Error Distance between (1,2,3) and (1,4,5) is\", distance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mean Error (`mean_error`)\n", + "\n", + "This function finds the mean of the absolute differences between the corresponding elements of two items. For example, if the two items are `(1,0,5)` and `(3,10,5)`, the sum of the absolute differences is `|3-1| + |10-0| + |5-5| = 2 + 10 + 0 = 12`. The mean error distance therefore is `12/3=4`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Error Distance between (1,0,5) and (3,10,5) is 4\n" + ] + } + ], + "source": [ + "def mean_error(X, Y):\n", + " return mean([abs(x - y) for x, y in zip(X, Y)])\n", + "\n", + "\n", + "distance = mean_error([1,0,5], [3,10,5])\n", + "print(\"Mean Error Distance between (1,0,5) and (3,10,5) is\", distance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mean Square Error (`ms_error`)\n", + "\n", + "This is very similar to `Mean Error`, but instead of averaging the absolute differences between elements, we average the *squares* of the differences." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Square Distance between (1,0,5) and (3,10,5) is 34.666666666666664\n" + ] + } + ], + "source": [ + "def ms_error(X, Y):\n", + " return mean([(x - y)**2 for x, y in zip(X, Y)])\n", + "\n", + "\n", + "distance = ms_error([1,0,5], [3,10,5])\n", + "print(\"Mean Square Distance between (1,0,5) and (3,10,5) is\", distance)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Root of Mean Square Error (`rms_error`)\n", + "\n", + "This is the square root of `Mean Square Error`."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Root of Mean Square Error Distance between (1,0,5) and (3,10,5) is 5.887840577551898\n" + ] + } + ], + "source": [ + "def rms_error(X, Y):\n", + " return math.sqrt(ms_error(X, Y))\n", + "\n", + "\n", + "distance = rms_error([1,0,5], [3,10,5])\n", + "print(\"Root of Mean Square Error Distance between (1,0,5) and (3,10,5) is\", distance)" + ] + }, { "cell_type": "markdown", "metadata": { diff --git a/learning.py b/learning.py index ec685131d..99185dc54 100644 --- a/learning.py +++ b/learning.py @@ -17,28 +17,32 @@ # ______________________________________________________________________________ -def rms_error(predictions, targets): - return math.sqrt(ms_error(predictions, targets)) +def euclidean_distance(X, Y): + return math.sqrt(sum([(x - y)**2 for x, y in zip(X, Y)])) -def ms_error(predictions, targets): - return mean([(p - t)**2 for p, t in zip(predictions, targets)]) +def rms_error(X, Y): + return math.sqrt(ms_error(X, Y)) -def mean_error(predictions, targets): - return mean([abs(p - t) for p, t in zip(predictions, targets)]) +def ms_error(X, Y): + return mean([(x - y)**2 for x, y in zip(X, Y)]) -def manhattan_distance(predictions, targets): - return sum([abs(p - t) for p, t in zip(predictions, targets)]) +def mean_error(X, Y): + return mean([abs(x - y) for x, y in zip(X, Y)]) -def mean_boolean_error(predictions, targets): - return mean(int(p != t) for p, t in zip(predictions, targets)) +def manhattan_distance(X, Y): + return sum([abs(x - y) for x, y in zip(X, Y)]) -def hamming_distance(predictions, targets): - return sum(p != t for p, t in zip(predictions, targets)) +def mean_boolean_error(X, Y): + return mean(int(x != y) for x, y in zip(X, Y)) + + +def hamming_distance(X, Y): + return sum(x != y for x, y in zip(X, Y)) # ______________________________________________________________________________ diff --git a/tests/test_learning.py b/tests/test_learning.py index 4f618f7c1..ecba5e0d4 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -1,9 +1,22 @@ from learning import parse_csv, weighted_mode, weighted_replicate, DataSet, \ PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, \ - NeuralNetLearner, PerceptronLearner, DecisionTreeLearner + NeuralNetLearner, PerceptronLearner, DecisionTreeLearner, \ + euclidean_distance from utils import DataFile + +def test_euclidean(): + distance = euclidean_distance([1,2], [3,4]) + assert round(distance, 2) == 2.83 + + distance = euclidean_distance([1,2,3], [4,5,6]) + assert round(distance, 2) == 5.2 + + distance = euclidean_distance([0,0,0], [0,0,0]) + assert distance == 0 + + def test_exclude(): iris = DataSet(name='iris', exclude=[3]) assert iris.inputs == [0, 1, 2] @@ -22,6 +35,20 @@ def test_weighted_replicate(): assert weighted_replicate('ABC', [1, 2, 1], 4) == ['A', 'B', 'B', 'C'] +def test_means_and_deviation(): + iris = DataSet(name="iris") + + means, deviations = iris.find_means_and_deviations() + + assert means["setosa"] == [5.006, 3.418, 1.464, 0.244] + assert means["versicolor"] == [5.936, 2.77, 4.26, 1.326] + assert means["virginica"] == [6.588, 2.974, 5.552, 2.026] + + assert round(deviations["setosa"][0],3) == 0.352 + assert round(deviations["versicolor"][0],3) == 0.516 + assert round(deviations["virginica"][0],3) == 0.636 + + def test_plurality_learner(): zoo = DataSet(name="zoo") @@ -32,8 +59,14 @@ def
test_naive_bayes(): iris = DataSet(name="iris") - nB = NaiveBayesLearner(iris) - assert nB([5,3,1,0.1]) == "setosa" + # Discrete + nBD = NaiveBayesLearner(iris) + assert nBD([5,3,1,0.1]) == "setosa" + + # Continuous + nBC = NaiveBayesLearner(iris, continuous=True) + assert nBC([5,3,1,0.1]) == "setosa" + assert nBC([7,3,6.5,2]) == "virginica" def test_k_nearest_neighbors():
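Reviewer's note on the continuous case tested above: `NaiveBayesLearner(iris, continuous=True)` models each feature with a per-class Gaussian, presumably built from the `means` and `deviations` that `DataSet.find_means_and_deviations()` returns (see `test_means_and_deviation`). Below is a minimal sketch of that idea, assuming uniform class priors; the `gaussian` and `predict` helpers are hypothetical names for illustration, not the repository's `NaiveBayesLearner` implementation.

```python
import math

def gaussian(mean, st_dev, x):
    # Probability density of the normal distribution N(mean, st_dev) at x.
    return (1.0 / (math.sqrt(2 * math.pi) * st_dev)) * \
        math.exp(-0.5 * ((x - mean) / st_dev) ** 2)

def predict(example, means, deviations, classes):
    # Continuous Naive Bayes: score each class by the product of per-feature
    # Gaussian likelihoods (uniform class priors assumed for simplicity),
    # then return the highest-scoring class.
    def likelihood(c):
        p = 1.0
        for m, sd, x in zip(means[c], deviations[c], example):
            p *= gaussian(m, sd, x)
        return p
    return max(classes, key=likelihood)

# e.g. predict([5, 3, 1, 0.1], means, deviations,
#              ["setosa", "versicolor", "virginica"])
# should agree with the test's expectation of "setosa".
```

Multiplying many densities can underflow for high-dimensional data; a production version would sum log-likelihoods instead, but the product form keeps the sketch close to the textbook formula.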