-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathutils.py
More file actions
96 lines (67 loc) · 3.01 KB
/
utils.py
File metadata and controls
96 lines (67 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from sklearn.kernel_approximation import RBFSampler
import numpy as np
# Module-level random Fourier feature (RFF) map shared by extract_features.
# Fixed random_state makes the sampled random weights deterministic, so
# repeated fit_transform calls produce identical feature mappings.
rbf_feature = RBFSampler(gamma=1, random_state=12345)
def extract_features(state, num_actions):
    """ This function computes the RFF features for a state for all the discrete actions

    :param state: column vector of the state we want to compute phi(s,a) of (shape |S|x1)
    :param num_actions: number of discrete actions you want to compute the RFF features for
    :return: phi(s,a) for all the actions (shape 100x|num_actions|)
    """
    # One (state, action) row per discrete action: stack the state num_actions
    # times and append the action index as an extra column.
    states = np.tile(state.reshape(1, -1), (num_actions, 1))
    actions = np.arange(num_actions).reshape(-1, 1)
    sa_pairs = np.concatenate([states, actions], axis=-1)
    # fit_transform is deterministic here because rbf_feature has a fixed
    # random_state; transpose so features are columns (one per action).
    return rbf_feature.fit_transform(sa_pairs).T
def compute_softmax(logits, axis):
    """ computes the softmax of the logits

    :param logits: the vector to compute the softmax over
    :param axis: the axis we are summing over
    :return: the softmax of the vector

    Hint: to make the softmax more stable, subtract the max from the vector before applying softmax
    """
    # Subtract the per-axis max before exponentiating: softmax is invariant to
    # a constant shift, and this prevents overflow for large logits.
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)
def compute_action_distribution(theta, phis):
    """ compute probability distrubtion over actions

    :param theta: model parameter (shape d x 1)
    :param phis: RFF features of the state and actions (shape d x |A|)
    :return: softmax probability distribution over actions (shape 1 x |A|)
    """
    # Linear scores per action: theta^T phi(s,a), shape 1 x |A|.
    logits = theta.T @ phis
    # Numerically-stable softmax over the action axis (subtracting the max
    # leaves the distribution unchanged but avoids overflow).
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
def compute_log_softmax_grad(theta, phis, action_idx):
    """ computes the log softmax gradient for the action with index action_idx

    :param theta: model parameter (shape d x 1)
    :param phis: RFF features of the state and actions (shape d x |A|)
    :param action_idx: The index of the action you want to compute the gradient of theta with respect to
    :return: log softmax gradient (shape d x 1)
    """
    # Current policy distribution over actions (1 x |A|), via stable softmax.
    logits = theta.T @ phis
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    # grad log pi(a|s) = phi(s,a) - E_{a'~pi}[phi(s,a')], shape d x 1.
    expected_phi = phis @ probs.T
    return phis[:, [action_idx]] - expected_phi
def compute_fisher_matrix(grads):
    """ computes the fisher information matrix using the sampled trajectories gradients

    :param grads: list of list of gradients, where each sublist represents a trajectory (each gradient has shape d x 1)
    :return: fisher information matrix (shape d x d)

    Note: don't forget to take into account that trajectories might have different lengths
    """
    d = grads[0][0].shape[0]
    fisher = np.zeros((d, d))
    for traj_grads in grads:
        # Average the outer products within each trajectory first, so a long
        # trajectory does not get more weight than a short one.
        traj_sum = np.zeros((d, d))
        for g in traj_grads:
            traj_sum += g @ g.T
        fisher += traj_sum / len(traj_grads)
    # Average over trajectories.
    return fisher / len(grads)
def compute_value_gradient(grads, rewards):
    """ computes the value function gradient with respect to the sampled gradients and rewards

    :param grads: ist of list of gradients, where each sublist represents a trajectory
    :param rewards: list of list of rewards, where each sublist represents a trajectory
    :return: value function gradient with respect to theta (shape d x 1)
    """
    # Baseline: mean total reward across trajectories (variance reduction;
    # does not bias the gradient estimate).
    baseline = np.mean([np.sum(traj_rewards) for traj_rewards in rewards])
    total = np.zeros_like(grads[0][0], dtype=float)
    for traj_grads, traj_rewards in zip(grads, rewards):
        for t, g in enumerate(traj_grads):
            # Reward-to-go from step t, minus the baseline.
            advantage = np.sum(traj_rewards[t:]) - baseline
            total = total + advantage * g
    # Monte-Carlo average over the sampled trajectories.
    return total / len(grads)
def compute_eta(delta, fisher, v_grad, eps=1e-6):
    """ computes the learning rate for gradient descent

    :param delta: trust region size
    :param fisher: fisher information matrix (shape d x d)
    :param v_grad: value function gradient with respect to theta (shape d x 1)
    :param eps: small constant added to the denominator for numerical stability
        (avoids division by zero when the gradient vanishes)
    :return: the maximum learning rate that respects the trust region size delta
    """
    # eta = sqrt(delta / (g^T F^-1 g + eps)); np.linalg.solve(F, g) computes
    # F^-1 g without forming the explicit inverse.
    quad_form = (v_grad.T @ np.linalg.solve(fisher, v_grad)).item()
    return np.sqrt(delta / (quad_form + eps))