forked from MAPIRlab/rlrobot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaction_selection.py
More file actions
executable file
·137 lines (107 loc) · 3.98 KB
/
action_selection.py
File metadata and controls
executable file
·137 lines (107 loc) · 3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-
# +-----------------------------------------------+
# | RL-ROBOT. Reinforcement Learning for Robotics |
# | Angel Martinez-Tenor |
# | MAPIR. University of Malaga. 2016 |
# +-----------------------------------------------+
""" Exploration-exploitation strategy """
import math
import random
import sys
import numpy as np
import exp
import lp
import task
if exp.ACTION_STRATEGY == "QBIASR":
import action_qbiasr
EPSILON = 0.3 # e-greedy
LEAST_EXPLORED = 0.3 # Probability of choose the least explored action
TEMPERATURE = exp.TEMPERATURE # Temperature Parameter for Softmax Boltzmann
initiated = False
def setup():
    """ Prepare the action-selection module (QBIASR needs its own setup) """
    global initiated
    qbiasr_active = exp.ACTION_STRATEGY == "QBIASR"
    if qbiasr_active:
        action_qbiasr.setup()
    initiated = True
def execute(s):
    """ Pick the action to perform in state s.

    Teaching overrides are handled first; otherwise the action is chosen
    by the strategy named in exp.ACTION_STRATEGY ("exploit", "random",
    "eGreedy", "softmax" or "QBIASR"). An unknown strategy aborts the run.
    """
    if exp.TEACH_THE_ROBOT:
        # TODO: re-implement manual teaching (task.key2action()); for now
        # this only warns and falls through to the configured strategy.
        print("Warning: Controlling the robot for teaching not implemented")
    elif exp.TEACHING_PROCESS:
        # Replay the taught action while the teaching budget lasts,
        # then switch teaching off and fall through to the strategy.
        if lp.step < exp.TEACHING_STEPS:
            return exp.TAUGHT_SASR[lp.step, 1]
        exp.TEACHING_PROCESS = False

    strategy = exp.ACTION_STRATEGY
    if strategy == "exploit":
        return exploit_policy(s)
    if strategy == "random":
        return random_action()
    if strategy == "eGreedy":
        return egreedy(s, EPSILON)
    if strategy == "softmax":
        return softmax(s)
    if strategy == "QBIASR":  # novel technique
        return action_qbiasr.select_biased_action(s)
    sys.exit("ERROR: WRONG ACTION STRATEGY: " + exp.ACTION_STRATEGY)
def exploit_policy(s):
    """ Return the greedy action for state s from the learned policy """
    return lp.policy[s]
def random_action():
    """ Draw one action uniformly at random from the task's action set """
    return random.randrange(task.n_actions)
def egreedy(s, e):
    """ e-greedy selection for state s: explore with probability e
    (uniform random action), otherwise exploit the current policy """
    explore = random.random() < e
    return random_action() if explore else exploit_policy(s)
def egreedy_least_explored(s, e, least):
    """ e-greedy selection that boosts the least explored action.

    With probability e we explore; within that, with probability `least`
    the action tried fewest times in state s is chosen, otherwise a
    uniform random one. Otherwise the policy is exploited.
    """
    if random.random() >= e:
        return exploit_policy(s)
    if random.random() < least:
        # lp.q_count[s, a] counts how often Q[s, a] has been explored;
        # min() returns the first index attaining the smallest count,
        # matching the original arg-min loop's tie-breaking.
        return min(range(task.n_actions), key=lambda a: lp.q_count[s, a])
    return random_action()
def softmax(s):
    """ Select an action for state s via Boltzmann (softmax) exploration.

    Each action i is drawn with probability proportional to
    exp(Q[s, i] / TEMPERATURE): higher temperature -> more uniform
    (exploratory), lower -> greedier.

    Fixes over the previous version:
    - Q values are shifted by their maximum before exponentiating.
      Softmax is invariant to a constant shift, so the probabilities
      are identical, but this prevents an OverflowError from
      math.exp() when |Q| / TEMPERATURE is large.
    - A valid action is always returned: floating-point rounding could
      leave the cumulative probability fractionally below the random
      draw, in which case the old code returned the invalid action -1;
      we now fall back to the last action.
    """
    # 1: Boltzmann probabilities, computed in a numerically stable way
    q_values = np.asarray([lp.q[s, i] for i in range(task.n_actions)],
                          dtype=float)
    pa = np.exp((q_values - q_values.max()) / TEMPERATURE)
    pa /= pa.sum()
    # 2: Sample from the categorical distribution by inverse CDF
    ran = random.random()
    accum = 0.0
    for i in range(task.n_actions):
        accum += pa[i]
        if ran < accum:
            return i
    # Rounding fallback: cumulative sum ended just below `ran`
    return task.n_actions - 1