|
31 | 31 | " - State dependent reward function\n",
|
32 | 32 | " - State and action dependent reward function\n",
|
33 | 33 | " - State, action and next state dependent reward function\n",
|
34 |
| - "\n", |
| 34 | + "- Grid MDP\n", |
| 35 | + " - Pathfinding problem\n", |
35 | 36 | "\n",
|
36 | 37 | "## SIMPLE MDP\n",
|
37 | 38 | "---\n",
|
|
221 | 222 | "name": "stdout",
|
222 | 223 | "output_type": "stream",
|
223 | 224 | "text": [
|
224 |
| - "['study', 'pub', 'sleep', 'facebook', 'quit']\n" |
| 225 | + "['quit', 'sleep', 'study', 'pub', 'facebook']\n" |
225 | 226 | ]
|
226 | 227 | }
|
227 | 228 | ],
|
|
294 | 295 | "name": "stdout",
|
295 | 296 | "output_type": "stream",
|
296 | 297 | "text": [
|
297 |
| - "{'class3': 'pub', 'leisure': 'quit', 'class2': 'study', 'class1': 'study', 'end': None}\n" |
| 298 | + "{'class2': 'sleep', 'class3': 'pub', 'end': None, 'class1': 'study', 'leisure': 'quit'}\n" |
298 | 299 | ]
|
299 | 300 | }
|
300 | 301 | ],
|
|
318 | 319 | "data": {
|
319 | 320 | "text/plain": [
|
320 | 321 | "{'class1': 'study',\n",
|
321 |
| - " 'class2': 'study',\n", |
| 322 | + " 'class2': 'sleep',\n", |
322 | 323 | " 'class3': 'pub',\n",
|
323 | 324 | " 'end': None,\n",
|
324 | 325 | " 'leisure': 'quit'}"
|
|
668 | 669 | "name": "stdout",
|
669 | 670 | "output_type": "stream",
|
670 | 671 | "text": [
|
671 |
| - "['study', 'pub', 'sleep', 'facebook', 'quit']\n" |
| 672 | + "['quit', 'sleep', 'study', 'pub', 'facebook']\n" |
672 | 673 | ]
|
673 | 674 | }
|
674 | 675 | ],
|
|
769 | 770 | "name": "stdout",
|
770 | 771 | "output_type": "stream",
|
771 | 772 | "text": [
|
772 |
| - "{'class3': 'study', 'leisure': 'quit', 'class2': 'sleep', 'class1': 'facebook', 'end': None}\n" |
| 773 | + "{'class2': 'sleep', 'class3': 'study', 'end': None, 'class1': 'facebook', 'leisure': 'quit'}\n" |
773 | 774 | ]
|
774 | 775 | }
|
775 | 776 | ],
|
|
832 | 833 | "We have the following transition probability matrices:\n",
|
833 | 834 | "<br>\n",
|
834 | 835 | "<br>\n",
|
835 |
| - "Action 1: Cruising streets\n", |
836 |
| - "<br>\n", |
| 836 | + "Action 1: Cruising streets \n", |
837 | 837 | "<br>\n",
|
838 |
| - "$$\\\\\n", |
| 838 | + "$\\\\\n", |
839 | 839 | " P^{1} = \n",
|
840 | 840 | " \\left[ {\\begin{array}{ccc}\n",
|
841 | 841 | " \\frac{1}{2} & \\frac{1}{4} & \\frac{1}{4} \\\\\n",
|
842 | 842 | " \\frac{1}{2} & 0 & \\frac{1}{2} \\\\\n",
|
843 | 843 | " \\frac{1}{4} & \\frac{1}{4} & \\frac{1}{2} \\\\\n",
|
844 | 844 | " \\end{array}}\\right] \\\\\n",
|
845 | 845 | " \\\\\n",
|
846 |
| - "$$\n", |
| 846 | + " $\n", |
847 | 847 | "<br>\n",
|
848 | 848 | "<br>\n",
|
849 |
| - "Action 2: Waiting at the taxi stand \n", |
| 849 | + "Action 2: Waiting at the taxi stand \n", |
850 | 850 | "<br>\n",
|
851 |
| - "<br>\n", |
852 |
| - "$$\\\\\n", |
| 851 | + "$\\\\\n", |
853 | 852 | " P^{2} = \n",
|
854 | 853 | " \\left[ {\\begin{array}{ccc}\n",
|
855 | 854 | " \\frac{1}{16} & \\frac{3}{4} & \\frac{3}{16} \\\\\n",
|
856 | 855 | " \\frac{1}{16} & \\frac{7}{8} & \\frac{1}{16} \\\\\n",
|
857 | 856 | " \\frac{1}{8} & \\frac{3}{4} & \\frac{1}{8} \\\\\n",
|
858 | 857 | " \\end{array}}\\right] \\\\\n",
|
859 | 858 | " \\\\\n",
|
860 |
| - "$$\n", |
| 859 | + " $\n", |
861 | 860 | "<br>\n",
|
862 | 861 | "<br>\n",
|
863 | 862 | "Action 3: Waiting for dispatch \n",
|
864 | 863 | "<br>\n",
|
865 |
| - "<br>\n", |
866 |
| - "$$\\\\\n", |
| 864 | + "$\\\\\n", |
867 | 865 | " P^{3} =\n",
|
868 | 866 | " \\left[ {\\begin{array}{ccc}\n",
|
869 | 867 | " \\frac{1}{4} & \\frac{1}{8} & \\frac{5}{8} \\\\\n",
|
870 | 868 | " 0 & 1 & 0 \\\\\n",
|
871 | 869 | " \\frac{3}{4} & \\frac{1}{16} & \\frac{3}{16} \\\\\n",
|
872 | 870 | " \\end{array}}\\right] \\\\\n",
|
873 | 871 | " \\\\\n",
|
874 |
| - "$$\n", |
| 872 | + " $\n", |
875 | 873 | "<br>\n",
|
876 | 874 | "<br>\n",
|
877 | 875 | "For the sake of readability, we will call the states A, B and C and the actions 'cruise', 'stand' and 'dispatch'.\n",
|
|
914 | 912 | "<br>\n",
|
915 | 913 | "Action 1: Cruising streets \n",
|
916 | 914 | "<br>\n",
|
917 |
| - "<br>\n", |
918 |
| - "$$\\\\\n", |
| 915 | + "$\\\\\n", |
919 | 916 | " R^{1} = \n",
|
920 | 917 | " \\left[ {\\begin{array}{ccc}\n",
|
921 | 918 | " 10 & 4 & 8 \\\\\n",
|
922 | 919 | " 14 & 0 & 18 \\\\\n",
|
923 | 920 | " 10 & 2 & 8 \\\\\n",
|
924 | 921 | " \\end{array}}\\right] \\\\\n",
|
925 | 922 | " \\\\\n",
|
926 |
| - "$$\n", |
| 923 | + " $\n", |
927 | 924 | "<br>\n",
|
928 | 925 | "<br>\n",
|
929 | 926 | "Action 2: Waiting at the taxi stand \n",
|
930 | 927 | "<br>\n",
|
931 |
| - "<br>\n", |
932 |
| - "$$\\\\\n", |
| 928 | + "$\\\\\n", |
933 | 929 | " R^{2} = \n",
|
934 | 930 | " \\left[ {\\begin{array}{ccc}\n",
|
935 | 931 | " 8 & 2 & 4 \\\\\n",
|
936 | 932 | " 8 & 16 & 8 \\\\\n",
|
937 | 933 | " 6 & 4 & 2\\\\\n",
|
938 | 934 | " \\end{array}}\\right] \\\\\n",
|
939 | 935 | " \\\\\n",
|
940 |
| - "$$\n", |
| 936 | + " $\n", |
941 | 937 | "<br>\n",
|
942 | 938 | "<br>\n",
|
943 | 939 | "Action 3: Waiting for dispatch \n",
|
944 | 940 | "<br>\n",
|
945 |
| - "<br>\n", |
946 |
| - "$$\\\\\n", |
| 941 | + "$\\\\\n", |
947 | 942 | " R^{3} = \n",
|
948 | 943 | " \\left[ {\\begin{array}{ccc}\n",
|
949 | 944 | " 4 & 6 & 4 \\\\\n",
|
950 | 945 | " 0 & 0 & 0 \\\\\n",
|
951 | 946 | " 4 & 0 & 8\\\\\n",
|
952 | 947 | " \\end{array}}\\right] \\\\\n",
|
953 | 948 | " \\\\\n",
|
954 |
| - "$$\n", |
| 949 | + " $\n", |
955 | 950 | "<br>\n",
|
956 | 951 | "<br>\n",
|
957 | 952 | "We now build the reward model as a dictionary using these matrices."
|
|
1194 | 1189 | "name": "stdout",
|
1195 | 1190 | "output_type": "stream",
|
1196 | 1191 | "text": [
|
1197 |
| - "['cruise', 'dispatch', 'stand']\n" |
| 1192 | + "['stand', 'dispatch', 'cruise']\n" |
1198 | 1193 | ]
|
1199 | 1194 | }
|
1200 | 1195 | ],
|
|
1290 | 1285 | "We have successfully adapted the existing code to a different scenario yet again.\n",
|
1291 | 1286 | "The takeaway from this section is that you can convert the vast majority of reinforcement learning problems into MDPs and solve for the best policy using simple yet efficient tools."
|
1292 | 1287 | ]
|
| 1288 | + }, |
| 1289 | + { |
| 1290 | + "cell_type": "markdown", |
| 1291 | + "metadata": {}, |
| 1292 | + "source": [ |
| 1293 | + "## GRID MDP\n", |
| 1294 | + "---\n", |
| 1295 | + "### Pathfinding Problem\n", |
| 1296 | + "Markov Decision Processes can be used to find the best path through a maze. Let us consider this simple maze.\n", |
| 1297 | + "\n", |
| 1298 | + "\n", |
| 1299 | + "This environment can be formulated as a GridMDP.\n", |
| 1300 | + "<br>\n", |
| 1301 | + "To make the grid matrix, we will consider the state-reward to be -0.1 for every state.\n", |
| 1302 | + "<br>\n", |
| 1303 | + "State (1, 1) will have a reward of -5 to signify that this state is to be prohibited.\n", |
| 1304 | + "<br>\n", |
| 1305 | + "State (9, 9) will have a reward of +5.\n", |
| 1306 | + "This will be the terminal state.\n", |
| 1307 | + "<br>\n", |
| 1308 | + "The matrix can be generated using the GridMDP editor or we can write it ourselves." |
| 1309 | + ] |
| 1310 | + }, |
| 1311 | + { |
| 1312 | + "cell_type": "code", |
| 1313 | + "execution_count": 35, |
| 1314 | + "metadata": { |
| 1315 | + "collapsed": true |
| 1316 | + }, |
| 1317 | + "outputs": [], |
| 1318 | + "source": [ |
| 1319 | + "grid = [\n", |
| 1320 | + " [None, None, None, None, None, None, None, None, None, None, None], \n", |
| 1321 | + " [None, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, None, +5.0, None], \n", |
| 1322 | + " [None, -0.1, None, None, None, None, None, None, None, -0.1, None], \n", |
| 1323 | + " [None, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, None], \n", |
| 1324 | + " [None, -0.1, None, None, None, None, None, None, None, None, None], \n", |
| 1325 | + " [None, -0.1, None, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, None], \n", |
| 1326 | + " [None, -0.1, None, None, None, None, None, -0.1, None, -0.1, None], \n", |
| 1327 | + " [None, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, None, -0.1, None], \n", |
| 1328 | + " [None, None, None, None, None, -0.1, None, -0.1, None, -0.1, None], \n", |
| 1329 | + " [None, -5.0, -0.1, -0.1, -0.1, -0.1, None, -0.1, None, -0.1, None], \n", |
| 1330 | + " [None, None, None, None, None, None, None, None, None, None, None]\n", |
| 1331 | + "]" |
| 1332 | + ] |
| 1333 | + }, |
| 1334 | + { |
| 1335 | + "cell_type": "markdown", |
| 1336 | + "metadata": {}, |
| 1337 | + "source": [ |
| 1338 | + "We have only one terminal state, (9, 9)" |
| 1339 | + ] |
| 1340 | + }, |
| 1341 | + { |
| 1342 | + "cell_type": "code", |
| 1343 | + "execution_count": 36, |
| 1344 | + "metadata": { |
| 1345 | + "collapsed": true |
| 1346 | + }, |
| 1347 | + "outputs": [], |
| 1348 | + "source": [ |
| 1349 | + "terminals = [(9, 9)]" |
| 1350 | + ] |
| 1351 | + }, |
| 1352 | + { |
| 1353 | + "cell_type": "markdown", |
| 1354 | + "metadata": {}, |
| 1355 | + "source": [ |
| 1356 | + "We define our maze environment below" |
| 1357 | + ] |
| 1358 | + }, |
| 1359 | + { |
| 1360 | + "cell_type": "code", |
| 1361 | + "execution_count": 37, |
| 1362 | + "metadata": {}, |
| 1363 | + "outputs": [], |
| 1364 | + "source": [ |
| 1365 | + "maze = GridMDP(grid, terminals)" |
| 1366 | + ] |
| 1367 | + }, |
| 1368 | + { |
| 1369 | + "cell_type": "markdown", |
| 1370 | + "metadata": {}, |
| 1371 | + "source": [ |
| 1372 | + "To solve the maze, we can use the `best_policy` function along with `value_iteration`." |
| 1373 | + ] |
| 1374 | + }, |
| 1375 | + { |
| 1376 | + "cell_type": "code", |
| 1377 | + "execution_count": 38, |
| 1378 | + "metadata": { |
| 1379 | + "collapsed": true |
| 1380 | + }, |
| 1381 | + "outputs": [], |
| 1382 | + "source": [ |
| 1383 | + "pi = best_policy(maze, value_iteration(maze))" |
| 1384 | + ] |
| 1385 | + }, |
| 1386 | + { |
| 1387 | + "cell_type": "markdown", |
| 1388 | + "metadata": {}, |
| 1389 | + "source": [ |
| 1390 | + "This is the heatmap generated by the GridMDP editor using `value_iteration` on this environment\n", |
| 1391 | + "<br>\n", |
| 1392 | + "\n", |
| 1393 | + "<br>\n", |
| 1394 | + "Let's print out the best policy" |
| 1395 | + ] |
| 1396 | + }, |
| 1397 | + { |
| 1398 | + "cell_type": "code", |
| 1399 | + "execution_count": 39, |
| 1400 | + "metadata": {}, |
| 1401 | + "outputs": [ |
| 1402 | + { |
| 1403 | + "name": "stdout", |
| 1404 | + "output_type": "stream", |
| 1405 | + "text": [ |
| 1406 | + "None None None None None None None None None None None\n", |
| 1407 | + "None v < < < < < < None . None\n", |
| 1408 | + "None v None None None None None None None ^ None\n", |
| 1409 | + "None > > > > > > > > ^ None\n", |
| 1410 | + "None ^ None None None None None None None None None\n", |
| 1411 | + "None ^ None > > > > v < < None\n", |
| 1412 | + "None ^ None None None None None v None ^ None\n", |
| 1413 | + "None ^ < < < < < < None ^ None\n", |
| 1414 | + "None None None None None ^ None ^ None ^ None\n", |
| 1415 | + "None > > > > ^ None ^ None ^ None\n", |
| 1416 | + "None None None None None None None None None None None\n" |
| 1417 | + ] |
| 1418 | + } |
| 1419 | + ], |
| 1420 | + "source": [ |
| 1421 | + "from utils import print_table\n", |
| 1422 | + "print_table(maze.to_arrows(pi))" |
| 1423 | + ] |
| 1424 | + }, |
| 1425 | + { |
| 1426 | + "cell_type": "markdown", |
| 1427 | + "metadata": {}, |
| 1428 | + "source": [ |
| 1429 | + "As you can infer, we can find the path to the terminal state starting from any given state using this policy.\n", |
| 1430 | + "All maze problems can be solved by formulating it as a MDP." |
| 1431 | + ] |
1293 | 1432 | }
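To make the policy-walk idea concrete, here is a small illustrative sketch. It relies on `GridMDP.go(state, direction)`, which returns the square reached by moving deterministically in a given direction; the `trace_path` helper and the chosen start square are hypothetical additions, not part of the notebook:

```python
# Hypothetical helper: follow the policy's intended moves (ignoring the
# environment's stochasticity) until a terminal state is reached.
def trace_path(mdp, policy, start, max_steps=200):
    path = [start]
    state = start
    for _ in range(max_steps):
        if state in mdp.terminals:
            break
        state = mdp.go(state, policy[state])  # deterministic intended move
        path.append(state)
    return path

# e.g. starting from (1, 1), the -5 square, the arrows above lead to (9, 9)
print(trace_path(maze, pi, (1, 1)))
```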
|
1294 | 1433 | ],
|
1295 | 1434 | "metadata": {
|
|