mdp_rl.py
# ------------------------------------------------------------------------------------------------
# Copyright (c) 2016 Microsoft Corporation
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute,
# sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or
# substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
# NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ------------------------------------------------------------------------------------------------
# Structure borrowed from Malmo's Python Tutorial sample #6:
# "Discrete movement, rewards, and learning"
# Implements TD Q-learning (pages 843-844 of
# Artificial Intelligence: A Modern Approach, 3rd Edition,
# Stuart J. Russell and Peter Norvig)
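#
# For reference, the tabular TD Q-learning update that TabQAgent (imported below)
# is expected to perform -- a sketch only; the actual update lives in QAgent.py,
# and the names alpha/gamma/actions here are illustrative assumptions:
#
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
#
# e.g. with q as a dict mapping (state, action) pairs to floats:
#
#     old_q = q.get((s, a), 0.0)
#     best_next = max(q.get((s_next, a_next), 0.0) for a_next in actions)
#     q[(s, a)] = old_q + alpha * (reward + gamma * best_next - old_q)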
import MalmoPython
import os
import sys
import time
import matplotlib as mlib
mlib.use('Agg') # non-interactive backend so figures can be saved without a display
from QAgent import TabQAgent
import matplotlib.pyplot as plt
# store reward_list, num_moves_per_episode, avg_q_value_per_episode
reward_list = []
move_list = []
avg_q_list = []
sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # flush print output immediately
agent = TabQAgent()
agent_host = MalmoPython.AgentHost()
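# toggle: test = True gives 5 short missions under manual (human) control,
# test = False runs the Q-learning agent for 1000 missions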
test = False
try:
    agent_host.parse( sys.argv )
except RuntimeError as e:
    print 'ERROR:',e
    print agent_host.getUsage()
    exit(1)
if agent_host.receivedArgument("help"):
    print agent_host.getUsage()
    exit(0)
# -- set up the mission -- #
mission_file = 'mdp_version/wall_room.xml'
with open(mission_file, 'r') as f:
print "Loading mission from %s" % mission_file
mission_xml = f.read()
my_mission = MalmoPython.MissionSpec(mission_xml, True)
max_retries = 3
if test:
    num_repeats = 5
else:
    num_repeats = 1000
cumulative_rewards = []
with open('wall_room.csv','a+') as stat_file:
    for i in range(num_repeats):
        print
        print 'Repeat %d of %d' % ( i+1, num_repeats )
        my_mission_record = MalmoPython.MissionRecordSpec()
        for retry in range(max_retries):
            try:
                agent_host.startMission( my_mission, my_mission_record )
                break
            except RuntimeError as e:
                if retry == max_retries - 1:
                    print "Error starting mission:",e
                    exit(1)
                else:
                    time.sleep(2.5)
        print "Waiting for the mission to start",
        world_state = agent_host.getWorldState()
        while not world_state.has_mission_begun:
            sys.stdout.write(".")
            time.sleep(1.0)
            world_state = agent_host.getWorldState()
            for error in world_state.errors:
                print "Error:",error.text
        print
        # -- run the agent in the world -- #
        if not test: # use to toggle between test and RL execution
            cumulative_reward, avg_q, num_moves = agent.run(agent_host, i)
            reward_list.append(cumulative_reward)
            move_list.append(num_moves)
            avg_q_list.append(avg_q)
            # print on the terminal and save to the file
            print 'Cumulative reward: {0}, Number of Moves: {1}, Average Q-value: {2}'.format(
                cumulative_reward,
                num_moves,
                avg_q,
            )
            stat_file.write("{0},{1},{2}\n".format(cumulative_reward,
                                                   num_moves,
                                                   avg_q))
            cumulative_rewards += [ cumulative_reward ]
            # -- clean up -- #
            time.sleep(2.0) # (let the Mod reset)
        else:
            time.sleep(30) # let the human do the thang
print "Done."
print
print "Cumulative rewards for all %d runs:" % num_repeats
print cumulative_rewards
plt.plot(reward_list)
plt.savefig('reward_wall.png')
plt.close()
plt.plot(move_list)
plt.savefig('move_wall.png')
plt.close()
plt.plot(avg_q_list)
plt.savefig('avg_q_wall.png')
plt.close()