-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplayer.go
More file actions
300 lines (279 loc) · 8.23 KB
/
player.go
File metadata and controls
300 lines (279 loc) · 8.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
package main
import (
"fmt"
"math/rand"
"os"
"strconv"
)
// stateCounts maps an encoded board state to the number of times a robot
// has encountered it.
type stateCounts map[int64]uint

// stateValues maps an encoded board state to its current learned value.
type stateValues map[int64]float64

// stateValueHistory maps selected "demo" states to the sequence of values
// they held over training (one entry appended per episode).
type stateValueHistory map[int64][]float64

// robotSpecs holds the learning hyperparameters of a robot player.
type robotSpecs struct {
	alp float64 // learning rate; if zero, use weighted average to update the value
	eps float64 // epsilon-greedy search
	gam float64 // discount factor
}

// mind is a robot player's learning state; left zero-valued for humans.
type mind struct {
	specs    robotSpecs
	counts   stateCounts       // count number of times each state has appeared
	demohist stateValueHistory // historic values of demo states in the robot's record
	values   stateValues       // most updated values of the robot's known states
	verb     bool              // verbose
}

// player is one participant in the game, human or robot.
type player struct {
	name    string  // name of the player
	symbol  string  // "x" plays first, "o" plays second. Each episode assigns symbols randomly.
	being   string  // human or robot
	history []int64 // history of states played in the episode
	wins    int     // number of wins
	mind    mind    // empty if human
}

// playerPair is the two players of a single episode.
type playerPair [2]player
// createPlayers interactively builds the roster from stdin: it prompts for a
// player count, then for each player a name and whether it is a robot.
// Robots additionally take (alp, eps, gam) specs; an unparseable specs line
// selects the package-level defaults alpha/epsilon/gamma. Returns the fully
// initialized players.
func createPlayers() []player {
	// number of players
	// NOTE(review): when Scanf fails, the offending token may remain in the
	// input buffer, so this retry loop can spin on the same bad input —
	// verify interactively.
	var N uint
	for {
		fmt.Print("Enter number of players: ")
		_, err := fmt.Scanf("%d", &N)
		if err == nil {
			break
		}
	}
	// define each player
	players := make([]player, N)
	for i := range players {
		var name string
		var isRobot bool
		// name (retry until a token parses)
		for {
			fmt.Printf("Enter name of player #%v: ", i)
			_, err := fmt.Scanf("%s", &name)
			if err == nil {
				break
			}
		}
		// being: human or robot
		for {
			fmt.Printf("robot? (t/f): ")
			_, err := fmt.Scanf("%t", &isRobot)
			if err == nil {
				break
			}
		}
		if isRobot {
			// specs; any parse failure falls back to the defaults
			var a, e, g float64
			fmt.Printf("specs (alp eps gam) / click enter to use default values (%v %v %v): ", alpha, epsilon, gamma)
			_, err := fmt.Scanf("%f%f%f", &a, &e, &g)
			if err != nil {
				a, e, g = alpha, epsilon, gamma
				fmt.Printf("use default specs \n")
			}
			players[i].initializeRobot(name, robotSpecs{alp: a, eps: e, gam: g}, false)
		} else {
			players[i].initializeHuman(name)
		}
	}
	fmt.Print("*** Done creating players *** \n\n")
	return players
}
// initializeRobot resets p to a fresh robot player with the given name and
// learning specs. The symbol is left empty (assigned per episode), all
// learning maps start empty, and verb controls per-move debug printing.
// (Removed the redundant bare return at the end of the original.)
func (p *player) initializeRobot(name string, rs robotSpecs, verb bool) {
	p.name = name
	p.symbol = ""
	p.being = "robot"
	p.history = []int64{}
	p.wins = 0
	p.mind.specs = rs
	p.mind.counts = stateCounts{}
	p.mind.demohist = stateValueHistory{}
	p.mind.values = stateValues{}
	p.mind.verb = verb
}
// initializeHuman resets p to a fresh human player with the given name.
// The symbol is assigned later per episode; the mind stays zero-valued.
func (p *player) initializeHuman(name string) {
	*p = player{
		name:    name,
		symbol:  "",
		being:   "human",
		history: []int64{},
		wins:    0,
		mind:    mind{},
	}
}
// resetHistory discards the player's per-episode state history, leaving an
// empty (non-nil) slice ready for the next episode.
func (p *player) resetHistory() {
	p.history = make([]int64, 0)
}
// updateStateSequence records the newly reached state at the end of the
// player's episode history.
func (p *player) updateStateSequence(state int64) {
	extended := append(p.history, state)
	p.history = extended
}
// getDemoStates registers the last nDemoStates states of the episode history
// as "demo" states whose values will be tracked over training. No-op for
// humans.
//
// BUGFIX: the original loop ran i down to len(p.history)-nDemoStates with no
// lower bound, panicking with an index out of range whenever the episode
// history was shorter than nDemoStates; the bound is now clamped at 0.
func (p *player) getDemoStates() {
	if p.being != "robot" {
		return
	}
	stop := len(p.history) - nDemoStates
	if stop < 0 {
		stop = 0
	}
	for i := len(p.history) - 1; i >= stop; i-- {
		state := p.history[i]
		p.mind.demohist[state] = []float64{}
	}
}
// playerActs dispatches to the human or robot move routine based on the
// player's being; an unknown being aborts the program.
func (p *player) playerActs(env environment) (actionLocation location) {
	switch p.being {
	case "robot":
		return p.robotActs(env)
	case "human":
		return p.humanActs(env)
	}
	fmt.Printf("player %v is an unknown creature; the game board explodes \n", p.name)
	os.Exit(1)
	return
}
// humanActs prints the board and prompts on stdin for a move until the user
// supplies coordinates of an empty cell, which are then returned.
//
// BUGFIX: the original indexed env.board with the raw user input, so any
// coordinate outside the board (e.g. "9 9" or negatives) panicked with an
// index out of range; out-of-board input is now treated as an invalid move.
func (p *player) humanActs(env environment) (actionLocation location) {
	printBoard(&env.board, true)
	for {
		var x, y int
		fmt.Print("Enter location (x y): ")
		_, err := fmt.Scanf("%d%d", &x, &y)
		if err == nil {
			// bounds-check before indexing the board
			if x >= 0 && x < len(env.board) && y >= 0 && y < len(env.board[x]) {
				l := location{x, y}
				if env.board[l[0]][l[1]] == "" {
					fmt.Printf("you are making a move to %v \n", l)
					return l
				}
			}
		}
		// invalid move, re-enter location
		fmt.Print("invalid move \n")
	}
}
// robotActs determines what location the robot moves to. With probability
// eps it explores (a uniformly random empty cell); otherwise it greedily
// picks the empty cell whose resulting state has the highest gain — the
// final reward for terminal states, or gam * V(state) otherwise.
//
// BUGFIX: the original seeded bestGain with -1.0, so when every candidate
// gain was <= -1 (e.g. every move leads to a loss with reward -1), no
// location was ever selected and the zero-value location {0,0} was returned
// even if that cell was occupied. Selection is now tracked explicitly so the
// first candidate always becomes the initial best.
func (p *player) robotActs(env environment) (actionLocation location) {
	if rand.Float64() < p.mind.specs.eps {
		// explore: take a random action among the empty cells
		possibleLocations := []location{}
		for irow, row := range env.board {
			for ielement, element := range row {
				if element == "" {
					possibleLocations = append(possibleLocations, location{irow, ielement})
				}
			}
		}
		pickedIndex := rand.Intn(len(possibleLocations))
		actionLocation = possibleLocations[pickedIndex]
		if p.mind.verb || printSteps {
			fmt.Printf("player %v(%v)'s takes action randomly at %v \n", p.name, p.symbol, actionLocation)
		}
	} else {
		plan := make(board, boardSize) // only useful for printing out the plan
		// exploit: choose the best action based on current values of states
		bestGain := 0.0
		haveBest := false
		for irow, row := range env.board {
			plan[irow] = make([]string, boardSize)
			for ielement, element := range row {
				plan[irow][ielement] = element
				if element == "" { // location is empty; find value if player moves here
					env.board[irow][ielement] = p.symbol            // board after this move
					testState := boardToState(&env.board, p.symbol) // state after this move
					testWinner := getWinner(env.board)              // winner after this move
					testEmpties := getEmpties(env.board)            // empty spots after this move
					env.board[irow][ielement] = ""                  // revert this action
					// get gain of the test state
					var testGain float64
					if testWinner != "" || testEmpties == 0 {
						// test state is final state, reward is non-zero, value is zero
						testGain = getReward(testWinner, p.symbol)
					} else {
						testValue, ok := p.mind.values[testState]
						if !ok { // there's no record of this state, use default value
							testValue = defaultValue()
						}
						testGain = p.mind.specs.gam * testValue
					}
					plan[irow][ielement] = strconv.FormatFloat(testGain, 'f', 2, 64)
					if !haveBest || testGain > bestGain {
						bestGain = testGain
						haveBest = true
						actionLocation = location{irow, ielement}
					}
				}
			}
		}
		if p.mind.verb || printSteps {
			fmt.Printf("player %v(%v)'s plan board: \n", p.name, p.symbol)
			printBoard(&plan, true)
			fmt.Printf("player %v(%v) takes action at %v \n", p.name, p.symbol, actionLocation)
		}
	}
	return actionLocation
}
// updatePlayerRecord finalizes an episode for this player: tallies a win if
// the player's symbol matches the winner, lets robots fold the episode into
// their learned values (values, demo history, counts — in that order, since
// counting consumes the history), and finally clears the episode history.
func (p *player) updatePlayerRecord(env environment) {
	if env.winner == p.symbol {
		p.wins++
	}
	if p.being == "robot" {
		p.updateStateValues(env)
		p.updateStateValueHistory(env)
		p.updateStateCounts()
	}
	p.resetHistory()
}
// updateStateValues folds the episode outcome into the robot's value table;
// should only be run at the end of an episode. It computes the discounted
// return for every state visited this episode, then updates each state's
// value either by running weighted average (alp == 0) or by a learning-rate
// correction.
func (p *player) updateStateValues(env environment) {
	gains := make(map[int64]float64, len(p.history)) // values learned through this episode
	finalReward := getReward(env.winner, p.symbol)
	// loop backward from the last state to the first along history of this episode
	// i is the index of history array
	gain := 0.0
	for i := len(p.history) - 1; i >= 0; i-- {
		state := p.history[i]
		var reward float64
		// NOTE(review): the final reward is attached at i == len-2 (the
		// second-to-last recorded state), not the last one — confirm this
		// matches how updateStateSequence records states; an off-by-one here
		// would silently misattribute the reward.
		if i == len(p.history)-2 {
			reward = finalReward
		} else {
			reward = 0.0
		}
		// discounted return: G_i = r + gam * G_{i+1}
		gain = reward + p.mind.specs.gam*gain
		// if a state repeats within the episode, the earliest occurrence's
		// gain wins (it is written last in this backward loop)
		gains[state] = gain
	}
	// update the state values
	for state, gain := range gains {
		if p.mind.specs.alp == 0.0 {
			// update V by weighted average between new and existing values
			count, ok := p.mind.counts[state]
			if !ok {
				count = 0
			}
			p.mind.values[state] = (float64(count)*p.mind.values[state] + gain) / float64(count+1)
		} else {
			// update V by correction to the new value with learning rate
			oldValue, ok := p.mind.values[state]
			if !ok {
				oldValue = defaultValue()
			}
			p.mind.values[state] = oldValue + p.mind.specs.alp*(gain-oldValue)
		}
	}
	return
}
// defaultValue generates an initial state value centered on initialValue,
// perturbed by uniform noise of total width fluctuation.
func defaultValue() float64 {
	noise := rand.Float64() - 0.5
	return initialValue + fluctuation*noise
}
// updateStateValueHistory appends the current value of every tracked demo
// state to that state's history; should be run right after
// updateStateValues(). The env parameter is unused but kept for signature
// parity with the other per-episode updates.
func (p *player) updateStateValueHistory(env environment) {
	for state, hist := range p.mind.demohist {
		p.mind.demohist[state] = append(hist, p.mind.values[state])
	}
}
// updateStateCounts increments the visit count of every state recorded in
// this episode's history. Indexing a map with a missing key yields the zero
// value in Go, so the original existence check was redundant and has been
// folded into a plain increment.
func (p *player) updateStateCounts() {
	for _, state := range p.history {
		p.mind.counts[state]++
	}
}