forked from MAPIRlab/rlrobot
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaction_selection.py
More file actions
executable file
·137 lines (107 loc) · 3.98 KB
/
action_selection.py
File metadata and controls
executable file
·137 lines (107 loc) · 3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-
# +-----------------------------------------------+
# | RL-ROBOT. Reinforcement Learning for Robotics |
# | Angel Martinez-Tenor |
# | MAPIR. University of Malaga. 2016 |
# +-----------------------------------------------+
""" Exploration-exploitation strategy """
import math
import random
import sys
import numpy as np
import exp
import lp
import task
if exp.ACTION_STRATEGY == "QBIASR":
import action_qbiasr
EPSILON = 0.3 # e-greedy
LEAST_EXPLORED = 0.3 # Probability of choose the least explored action
TEMPERATURE = exp.TEMPERATURE # Temperature Parameter for Softmax Boltzmann
initiated = False
def setup():
    """ Prepare the action-selection module (QBIASR needs its own setup) """
    global initiated
    qbiasr_active = exp.ACTION_STRATEGY == "QBIASR"
    if qbiasr_active:
        action_qbiasr.setup()
    initiated = True
def execute(s):
    """ Pick the action to perform in state s.

    Teaching overrides are handled first; otherwise the action is chosen
    by the strategy named in exp.ACTION_STRATEGY ("exploit", "random",
    "eGreedy", "softmax" or "QBIASR"). An unknown strategy aborts the run.
    """
    if exp.TEACH_THE_ROBOT:
        # TODO: re-implement manual teaching (task.key2action()); for now
        # this only warns and falls through to the configured strategy.
        print("Warning: Controlling the robot for teaching not implemented")
    elif exp.TEACHING_PROCESS:
        # Replay the taught action while the teaching budget lasts,
        # then switch teaching off and fall through to the strategy.
        if lp.step < exp.TEACHING_STEPS:
            return exp.TAUGHT_SASR[lp.step, 1]
        exp.TEACHING_PROCESS = False

    strategy = exp.ACTION_STRATEGY
    if strategy == "exploit":
        return exploit_policy(s)
    if strategy == "random":
        return random_action()
    if strategy == "eGreedy":
        return egreedy(s, EPSILON)
    if strategy == "softmax":
        return softmax(s)
    if strategy == "QBIASR":  # novel technique
        return action_qbiasr.select_biased_action(s)
    sys.exit("ERROR: WRONG ACTION STRATEGY: " + exp.ACTION_STRATEGY)
def exploit_policy(s):
    """ Return the greedy action for state s from the learned policy """
    return lp.policy[s]
def random_action():
    """ Draw one action uniformly at random from the task's action set """
    return random.randrange(task.n_actions)
def egreedy(s, e):
    """ e-greedy selection for state s: explore with probability e
    (uniform random action), otherwise exploit the current policy """
    explore = random.random() < e
    return random_action() if explore else exploit_policy(s)
def egreedy_least_explored(s, e, least):
    """ e-greedy selection that boosts the least explored action.

    With probability e we explore; within that, with probability `least`
    the action tried fewest times in state s is chosen, otherwise a
    uniform random one. Otherwise the policy is exploited.
    """
    if random.random() >= e:
        return exploit_policy(s)
    if random.random() < least:
        # lp.q_count[s, a] counts how often Q[s, a] has been explored;
        # min() returns the first index attaining the smallest count,
        # matching the original arg-min loop's tie-breaking.
        return min(range(task.n_actions), key=lambda a: lp.q_count[s, a])
    return random_action()
def softmax(s):
    """ Select an action for state s via Boltzmann (softmax) exploration.

    Each action i is drawn with probability proportional to
    exp(Q[s, i] / TEMPERATURE): higher temperature -> more uniform
    (exploratory), lower -> greedier.

    Fixes over the previous version:
    - Q values are shifted by their maximum before exponentiating.
      Softmax is invariant to a constant shift, so the probabilities
      are identical, but this prevents an OverflowError from
      math.exp() when |Q| / TEMPERATURE is large.
    - A valid action is always returned: floating-point rounding could
      leave the cumulative probability fractionally below the random
      draw, in which case the old code returned the invalid action -1;
      we now fall back to the last action.
    """
    # 1: Boltzmann probabilities, computed in a numerically stable way
    q_values = np.asarray([lp.q[s, i] for i in range(task.n_actions)],
                          dtype=float)
    pa = np.exp((q_values - q_values.max()) / TEMPERATURE)
    pa /= pa.sum()
    # 2: Sample from the categorical distribution by inverse CDF
    ran = random.random()
    accum = 0.0
    for i in range(task.n_actions):
        accum += pa[i]
        if ran < accum:
            return i
    # Rounding fallback: cumulative sum ended just below `ran`
    return task.n_actions - 1