-
Notifications
You must be signed in to change notification settings - Fork 0
/
maze_solver_qlearning.c
95 lines (81 loc) · 2.09 KB
/
maze_solver_qlearning.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#include "maze_solver_qlearning.h"
#include "maze_markov.h"
#include "maze_markov_bellman.h"
#include "ratmaze.h"
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>
void maze_solver_qlearning_perform(
struct maze_markov_decision_process *mdp,
unsigned int limit
)
{
struct maze_markov_state *curs, *nexts;
struct maze_markov_bellman_list *qlist, *tmplist;
struct maze_markov_bellman_policy *policy;
struct maze_markov_bellman_node *tmpnode;
enum maze_markov_action cura;
float alpha, reward, maxq;
unsigned int i;
curs = mdp->init;
alpha = MAZE_SOLVER_QLEARNING_DEFAULT_ALPHA;
qlist = maze_markov_bellman_default_qlist_create(
mdp,MAZE_SOLVER_QLEARNING_DEFAULT_Q0
);
i=0;
srand(time(0));
while (i < limit)
{
alpha = MAZE_SOLVER_QLEARNING_DEFAULT_ALPHA * ((limit - i) / (float)limit);
cura = maze_markov_get_action_random();
nexts = maze_markov_state_action_perform(
mdp->states,curs,cura,&reward
);
/* get max(a')(Q(s',a')) */
tmplist = qlist;
while ((tmplist) && (tmplist->state != nexts))
tmplist = tmplist->next;
if (tmplist)
{
tmpnode = tmplist->list;
maxq = FLT_MIN;
while (tmpnode)
{
if (tmpnode->cost > maxq)
maxq = tmpnode->cost;
tmpnode = tmpnode->next;
}
}
else
{
printf("\nERROR\n");
maxq = MAZE_SOLVER_QLEARNING_DEFAULT_Q0;
}
maze_markov_bellman_qlist_set_cost(qlist,curs,cura,
((1.0f - alpha)
* maze_markov_bellman_qlist_get_cost(
qlist,curs,cura
)
)
+ (alpha * (reward + MAZE_MARKOV_BELLMAN_GAMMA * maxq))
);
#ifdef MAZE_DEBUG
printf("Step %d:\n\ts:%d, s':%d, a:%s\n\tQ(s,a)=%g\n\talpha=%g\n",
i,curs->id,nexts->id,MAZE_MARKOV_ACTION_NAMES(cura),
maze_markov_bellman_qlist_get_cost(qlist,curs,cura),
alpha
);
#endif
curs = nexts;
i++;
}
policy = maze_markov_bellman_optimal_policy_create(qlist);
printf("\nQ List:\n");
maze_markov_bellman_list_display(qlist);
printf("\nOptimal policy:\n");
maze_markov_bellman_policy_display(policy);
printf("---\n");
maze_markov_bellman_list_destroy(qlist);
maze_markov_bellman_policy_destroy(policy);
}