-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplayer.go
More file actions
300 lines (279 loc) · 8.23 KB
/
player.go
File metadata and controls
300 lines (279 loc) · 8.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
package main
import (
"fmt"
"math/rand"
"os"
"strconv"
)
// stateCounts maps an encoded board state to the number of times a robot
// has encountered it.
type stateCounts map[int64]uint

// stateValues maps an encoded board state to its current learned value.
type stateValues map[int64]float64

// stateValueHistory maps selected "demo" states to the sequence of values
// they held over training (one entry appended per episode).
type stateValueHistory map[int64][]float64

// robotSpecs holds the learning hyperparameters of a robot player.
type robotSpecs struct {
	alp float64 // learning rate; if zero, use weighted average to update the value
	eps float64 // epsilon-greedy search
	gam float64 // discount factor
}

// mind is a robot player's learning state; left zero-valued for humans.
type mind struct {
	specs    robotSpecs
	counts   stateCounts       // count number of times each state has appeared
	demohist stateValueHistory // historic values of demo states in the robot's record
	values   stateValues       // most updated values of the robot's known states
	verb     bool              // verbose
}

// player is one participant in the game, human or robot.
type player struct {
	name    string  // name of the player
	symbol  string  // "x" plays first, "o" plays second. Each episode assigns symbols randomly.
	being   string  // human or robot
	history []int64 // history of states played in the episode
	wins    int     // number of wins
	mind    mind    // empty if human
}

// playerPair is the two players of a single episode.
type playerPair [2]player
// createPlayers interactively builds the roster from stdin: it prompts for a
// player count, then for each player a name and whether it is a robot.
// Robots additionally take (alp, eps, gam) specs; an unparseable specs line
// selects the package-level defaults alpha/epsilon/gamma. Returns the fully
// initialized players.
func createPlayers() []player {
	// number of players
	// NOTE(review): when Scanf fails, the offending token may remain in the
	// input buffer, so this retry loop can spin on the same bad input —
	// verify interactively.
	var N uint
	for {
		fmt.Print("Enter number of players: ")
		_, err := fmt.Scanf("%d", &N)
		if err == nil {
			break
		}
	}
	// define each player
	players := make([]player, N)
	for i := range players {
		var name string
		var isRobot bool
		// name (retry until a token parses)
		for {
			fmt.Printf("Enter name of player #%v: ", i)
			_, err := fmt.Scanf("%s", &name)
			if err == nil {
				break
			}
		}
		// being: human or robot
		for {
			fmt.Printf("robot? (t/f): ")
			_, err := fmt.Scanf("%t", &isRobot)
			if err == nil {
				break
			}
		}
		if isRobot {
			// specs; any parse failure falls back to the defaults
			var a, e, g float64
			fmt.Printf("specs (alp eps gam) / click enter to use default values (%v %v %v): ", alpha, epsilon, gamma)
			_, err := fmt.Scanf("%f%f%f", &a, &e, &g)
			if err != nil {
				a, e, g = alpha, epsilon, gamma
				fmt.Printf("use default specs \n")
			}
			players[i].initializeRobot(name, robotSpecs{alp: a, eps: e, gam: g}, false)
		} else {
			players[i].initializeHuman(name)
		}
	}
	fmt.Print("*** Done creating players *** \n\n")
	return players
}
// initializeRobot resets p to a fresh robot player with the given name and
// learning specs. The symbol is left empty (assigned per episode), all
// learning maps start empty, and verb controls per-move debug printing.
// (Removed the redundant bare return at the end of the original.)
func (p *player) initializeRobot(name string, rs robotSpecs, verb bool) {
	p.name = name
	p.symbol = ""
	p.being = "robot"
	p.history = []int64{}
	p.wins = 0
	p.mind.specs = rs
	p.mind.counts = stateCounts{}
	p.mind.demohist = stateValueHistory{}
	p.mind.values = stateValues{}
	p.mind.verb = verb
}
// initializeHuman resets p to a fresh human player with the given name.
// The symbol is assigned later per episode; the mind stays zero-valued.
func (p *player) initializeHuman(name string) {
	*p = player{
		name:    name,
		symbol:  "",
		being:   "human",
		history: []int64{},
		wins:    0,
		mind:    mind{},
	}
}
// resetHistory discards the player's per-episode state history, leaving an
// empty (non-nil) slice ready for the next episode.
func (p *player) resetHistory() {
	p.history = make([]int64, 0)
}
// updateStateSequence records the newly reached state at the end of the
// player's episode history.
func (p *player) updateStateSequence(state int64) {
	extended := append(p.history, state)
	p.history = extended
}
// getDemoStates registers the last nDemoStates states of the episode history
// as "demo" states whose values will be tracked over training. No-op for
// humans.
//
// BUGFIX: the original loop ran i down to len(p.history)-nDemoStates with no
// lower bound, panicking with an index out of range whenever the episode
// history was shorter than nDemoStates; the bound is now clamped at 0.
func (p *player) getDemoStates() {
	if p.being != "robot" {
		return
	}
	stop := len(p.history) - nDemoStates
	if stop < 0 {
		stop = 0
	}
	for i := len(p.history) - 1; i >= stop; i-- {
		state := p.history[i]
		p.mind.demohist[state] = []float64{}
	}
}
// playerActs dispatches to the human or robot move routine based on the
// player's being; an unknown being aborts the program.
func (p *player) playerActs(env environment) (actionLocation location) {
	switch p.being {
	case "robot":
		return p.robotActs(env)
	case "human":
		return p.humanActs(env)
	}
	fmt.Printf("player %v is an unknown creature; the game board explodes \n", p.name)
	os.Exit(1)
	return
}
// humanActs prints the board and prompts on stdin for a move until the user
// supplies coordinates of an empty cell, which are then returned.
//
// BUGFIX: the original indexed env.board with the raw user input, so any
// coordinate outside the board (e.g. "9 9" or negatives) panicked with an
// index out of range; out-of-board input is now treated as an invalid move.
func (p *player) humanActs(env environment) (actionLocation location) {
	printBoard(&env.board, true)
	for {
		var x, y int
		fmt.Print("Enter location (x y): ")
		_, err := fmt.Scanf("%d%d", &x, &y)
		if err == nil {
			// bounds-check before indexing the board
			if x >= 0 && x < len(env.board) && y >= 0 && y < len(env.board[x]) {
				l := location{x, y}
				if env.board[l[0]][l[1]] == "" {
					fmt.Printf("you are making a move to %v \n", l)
					return l
				}
			}
		}
		// invalid move, re-enter location
		fmt.Print("invalid move \n")
	}
}
// robotActs determines what location the robot moves to. With probability
// eps it explores (a uniformly random empty cell); otherwise it greedily
// picks the empty cell whose resulting state has the highest gain — the
// final reward for terminal states, or gam * V(state) otherwise.
//
// BUGFIX: the original seeded bestGain with -1.0, so when every candidate
// gain was <= -1 (e.g. every move leads to a loss with reward -1), no
// location was ever selected and the zero-value location {0,0} was returned
// even if that cell was occupied. Selection is now tracked explicitly so the
// first candidate always becomes the initial best.
func (p *player) robotActs(env environment) (actionLocation location) {
	if rand.Float64() < p.mind.specs.eps {
		// explore: take a random action among the empty cells
		possibleLocations := []location{}
		for irow, row := range env.board {
			for ielement, element := range row {
				if element == "" {
					possibleLocations = append(possibleLocations, location{irow, ielement})
				}
			}
		}
		pickedIndex := rand.Intn(len(possibleLocations))
		actionLocation = possibleLocations[pickedIndex]
		if p.mind.verb || printSteps {
			fmt.Printf("player %v(%v)'s takes action randomly at %v \n", p.name, p.symbol, actionLocation)
		}
	} else {
		plan := make(board, boardSize) // only useful for printing out the plan
		// exploit: choose the best action based on current values of states
		bestGain := 0.0
		haveBest := false
		for irow, row := range env.board {
			plan[irow] = make([]string, boardSize)
			for ielement, element := range row {
				plan[irow][ielement] = element
				if element == "" { // location is empty; find value if player moves here
					env.board[irow][ielement] = p.symbol            // board after this move
					testState := boardToState(&env.board, p.symbol) // state after this move
					testWinner := getWinner(env.board)              // winner after this move
					testEmpties := getEmpties(env.board)            // empty spots after this move
					env.board[irow][ielement] = ""                  // revert this action
					// get gain of the test state
					var testGain float64
					if testWinner != "" || testEmpties == 0 {
						// test state is final state, reward is non-zero, value is zero
						testGain = getReward(testWinner, p.symbol)
					} else {
						testValue, ok := p.mind.values[testState]
						if !ok { // there's no record of this state, use default value
							testValue = defaultValue()
						}
						testGain = p.mind.specs.gam * testValue
					}
					plan[irow][ielement] = strconv.FormatFloat(testGain, 'f', 2, 64)
					if !haveBest || testGain > bestGain {
						bestGain = testGain
						haveBest = true
						actionLocation = location{irow, ielement}
					}
				}
			}
		}
		if p.mind.verb || printSteps {
			fmt.Printf("player %v(%v)'s plan board: \n", p.name, p.symbol)
			printBoard(&plan, true)
			fmt.Printf("player %v(%v) takes action at %v \n", p.name, p.symbol, actionLocation)
		}
	}
	return actionLocation
}
// updatePlayerRecord finalizes an episode for this player: tallies a win if
// the player's symbol matches the winner, lets robots fold the episode into
// their learned values (values, demo history, counts — in that order, since
// counting consumes the history), and finally clears the episode history.
func (p *player) updatePlayerRecord(env environment) {
	if env.winner == p.symbol {
		p.wins++
	}
	if p.being == "robot" {
		p.updateStateValues(env)
		p.updateStateValueHistory(env)
		p.updateStateCounts()
	}
	p.resetHistory()
}
// updateStateValues folds the episode outcome into the robot's value table;
// should only be run at the end of an episode. It computes the discounted
// return for every state visited this episode, then updates each state's
// value either by running weighted average (alp == 0) or by a learning-rate
// correction.
func (p *player) updateStateValues(env environment) {
	gains := make(map[int64]float64, len(p.history)) // values learned through this episode
	finalReward := getReward(env.winner, p.symbol)
	// loop backward from the last state to the first along history of this episode
	// i is the index of history array
	gain := 0.0
	for i := len(p.history) - 1; i >= 0; i-- {
		state := p.history[i]
		var reward float64
		// NOTE(review): the final reward is attached at i == len-2 (the
		// second-to-last recorded state), not the last one — confirm this
		// matches how updateStateSequence records states; an off-by-one here
		// would silently misattribute the reward.
		if i == len(p.history)-2 {
			reward = finalReward
		} else {
			reward = 0.0
		}
		// discounted return: G_i = r + gam * G_{i+1}
		gain = reward + p.mind.specs.gam*gain
		// if a state repeats within the episode, the earliest occurrence's
		// gain wins (it is written last in this backward loop)
		gains[state] = gain
	}
	// update the state values
	for state, gain := range gains {
		if p.mind.specs.alp == 0.0 {
			// update V by weighted average between new and existing values
			count, ok := p.mind.counts[state]
			if !ok {
				count = 0
			}
			p.mind.values[state] = (float64(count)*p.mind.values[state] + gain) / float64(count+1)
		} else {
			// update V by correction to the new value with learning rate
			oldValue, ok := p.mind.values[state]
			if !ok {
				oldValue = defaultValue()
			}
			p.mind.values[state] = oldValue + p.mind.specs.alp*(gain-oldValue)
		}
	}
	return
}
// defaultValue generates an initial state value centered on initialValue,
// perturbed by uniform noise of total width fluctuation.
func defaultValue() float64 {
	noise := rand.Float64() - 0.5
	return initialValue + fluctuation*noise
}
// updateStateValueHistory appends the current value of every tracked demo
// state to that state's history; should be run right after
// updateStateValues(). The env parameter is unused but kept for signature
// parity with the other per-episode updates.
func (p *player) updateStateValueHistory(env environment) {
	for state, hist := range p.mind.demohist {
		p.mind.demohist[state] = append(hist, p.mind.values[state])
	}
}
// updateStateCounts increments the visit count of every state recorded in
// this episode's history. Indexing a map with a missing key yields the zero
// value in Go, so the original existence check was redundant and has been
// folded into a plain increment.
func (p *player) updateStateCounts() {
	for _, state := range p.history {
		p.mind.counts[state]++
	}
}