
Commit 0688cc5

Authored Mar 8, 2021
Create multi_armed_bandits
1 parent f8ed1ec commit 0688cc5

File tree

1 file changed: +262 -0 lines changed


multi_armed_bandits (+262 lines)

@@ -0,0 +1,262 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 20 16:24:38 2020

@author: iball
"""

print('imports start')

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tqdm import trange

matplotlib.use('Agg')

print('imports done')


class Bandit:
    # @k_arm: # of arms
    # @epsilon: probability for exploration in epsilon-greedy algorithm
    # @initial: initial estimation for each action
    # @step_size: constant step size for updating estimations
    # @sample_averages: if True, use sample averages to update estimations instead of constant step size
    # @UCB_param: if not None, use the UCB algorithm to select actions
    # @gradient: if True, use the gradient-based bandit algorithm
    # @gradient_baseline: if True, use the average reward as the baseline for the gradient-based bandit algorithm
    # @true_reward: offset added to the true action values drawn in reset()
    def __init__(self, k_arm=10, epsilon=0., initial=0., step_size=0.1, sample_averages=False, UCB_param=None,
                 gradient=False, gradient_baseline=False, true_reward=0.):
        self.k = k_arm
        self.step_size = step_size
        self.sample_averages = sample_averages
        self.indices = np.arange(self.k)
        self.time = 0
        self.UCB_param = UCB_param
        self.gradient = gradient
        self.gradient_baseline = gradient_baseline
        self.average_reward = 0
        self.true_reward = true_reward
        self.epsilon = epsilon
        self.initial = initial
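
    # The three action-selection rules applied in act() below, restated here as
    # formulas for readability (they describe exactly what the code does):
    #   epsilon-greedy: with probability epsilon pick a random arm, otherwise the
    #     arm with the highest estimated value.
    #   UCB: pick argmax_a [ Q(a) + UCB_param * sqrt(ln(t + 1) / N(a)) ], where
    #     N(a) is the number of times arm a has been chosen (smoothed by 1e-5).
    #   gradient bandit: sample an arm from the softmax of the preferences
    #     (stored in q_estimation when gradient=True),
    #     pi(a) = exp(H(a)) / sum_b exp(H(b)).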

    def reset(self):
        # real reward for each action
        self.q_true = np.random.randn(self.k) + self.true_reward

        # estimation for each action
        self.q_estimation = np.zeros(self.k) + self.initial

        # number of times each action has been chosen
        self.action_count = np.zeros(self.k)

        self.best_action = np.argmax(self.q_true)

        self.time = 0

    # get an action for this bandit
    def act(self):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.indices)

        if self.UCB_param is not None:
            UCB_estimation = self.q_estimation + \
                self.UCB_param * np.sqrt(np.log(self.time + 1) / (self.action_count + 1e-5))
            q_best = np.max(UCB_estimation)
            return np.random.choice(np.where(UCB_estimation == q_best)[0])

        if self.gradient:
            exp_est = np.exp(self.q_estimation)
            self.action_prob = exp_est / np.sum(exp_est)
            return np.random.choice(self.indices, p=self.action_prob)

        # greedy choice, breaking ties at random
        q_best = np.max(self.q_estimation)
        return np.random.choice(np.where(self.q_estimation == q_best)[0])

    # take an action, update estimation for this action
    def step(self, action):
        # generate the reward under N(real reward, 1)
        reward = np.random.randn() + self.q_true[action]
        self.time += 1
        self.action_count[action] += 1
        self.average_reward += (reward - self.average_reward) / self.time

        if self.sample_averages:
            # update estimation using sample averages
            self.q_estimation[action] += (reward - self.q_estimation[action]) / self.action_count[action]
        elif self.gradient:
            one_hot = np.zeros(self.k)
            one_hot[action] = 1
            if self.gradient_baseline:
                baseline = self.average_reward
            else:
                baseline = 0
            self.q_estimation += self.step_size * (reward - baseline) * (one_hot - self.action_prob)
        else:
            # update estimation with constant step size
            self.q_estimation[action] += self.step_size * (reward - self.q_estimation[action])
        return reward


print('Done')
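

# A minimal usage sketch (illustrative only; the function name and parameter values
# are not part of the figures below): drive a single Bandit by hand with reset() to
# draw new true action values, act() to pick an arm, and step() to collect a reward
# and update the estimate for that arm.
def example_single_bandit(steps=1000):
    bandit = Bandit(epsilon=0.1, sample_averages=True)
    bandit.reset()
    total_reward = 0.0
    for _ in range(steps):
        action = bandit.act()
        total_reward += bandit.step(action)
    # average reward collected over the run
    return total_reward / steps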


def simulate(runs, time, bandits):
    print('here 1')
    rewards = np.zeros((len(bandits), runs, time))
    best_action_counts = np.zeros(rewards.shape)
    print('here 2')
    for i, bandit in enumerate(bandits):
        for r in trange(runs):
            bandit.reset()
            for t in range(time):
                action = bandit.act()
                reward = bandit.step(action)
                rewards[i, r, t] = reward
                if action == bandit.best_action:
                    best_action_counts[i, r, t] = 1
    mean_best_action_counts = best_action_counts.mean(axis=1)
    mean_rewards = rewards.mean(axis=1)
    print('here 3')
    print('mean_best_action_counts, mean_rewards =', mean_best_action_counts, mean_rewards)
    return mean_best_action_counts, mean_rewards
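

# A minimal sketch of calling simulate() directly (illustrative only; the small
# runs/time values are just for a quick check, not the settings used in the figures):
def example_simulate():
    bandits = [Bandit(epsilon=0.1, sample_averages=True)]
    mean_best_action_counts, mean_rewards = simulate(runs=10, time=100, bandits=bandits)
    # both returned arrays have shape (len(bandits), time)
    return mean_best_action_counts, mean_rewards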


def figure_2_1():
    plt.violinplot(dataset=np.random.randn(200, 10) + np.random.randn(10))
    plt.xlabel("Action")
    plt.ylabel("Reward distribution")
    # plt.savefig('../images/figure_2_1.png')
    plt.close()


def figure_2_2(runs=2000, time=1000):
    epsilons = [0, 0.1, 0.01]
    bandits = [Bandit(epsilon=eps, sample_averages=True) for eps in epsilons]
    best_action_counts, rewards = simulate(runs, time, bandits)

    plt.figure(figsize=(10, 20))

    plt.subplot(2, 1, 1)
    for eps, reward in zip(epsilons, rewards):
        plt.plot(reward, label='epsilon = %.02f' % eps)
    plt.xlabel('steps')
    plt.ylabel('average reward')
    plt.legend()

    plt.subplot(2, 1, 2)
    for eps, counts in zip(epsilons, best_action_counts):
        plt.plot(counts, label='epsilon = %.02f' % eps)
    plt.xlabel('steps')
    plt.ylabel('% optimal action')
    plt.legend()

    # plt.savefig('../images/figure_2_2.png')
    plt.close()


def figure_2_3(runs=2000, time=1000):
    bandits = []
    bandits.append(Bandit(epsilon=0, initial=5, step_size=0.1))
    bandits.append(Bandit(epsilon=0.1, initial=0, step_size=0.1))
    best_action_counts, _ = simulate(runs, time, bandits)

    plt.plot(best_action_counts[0], label='epsilon = 0, q = 5')
    plt.plot(best_action_counts[1], label='epsilon = 0.1, q = 0')
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()

    # plt.savefig('../images/figure_2_3.png')
    plt.close()


def figure_2_4(runs=2000, time=1000):
    bandits = []
    bandits.append(Bandit(epsilon=0, UCB_param=2, sample_averages=True))
    bandits.append(Bandit(epsilon=0.1, sample_averages=True))
    _, average_rewards = simulate(runs, time, bandits)

    plt.plot(average_rewards[0], label='UCB c = 2')
    plt.plot(average_rewards[1], label='epsilon greedy epsilon = 0.1')
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.legend()

    # plt.savefig('../images/figure_2_4.png')
    plt.close()


def figure_2_5(runs=2000, time=1000):
    bandits = []
    bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.1, gradient_baseline=False, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4, gradient_baseline=True, true_reward=4))
    bandits.append(Bandit(gradient=True, step_size=0.4, gradient_baseline=False, true_reward=4))
    best_action_counts, _ = simulate(runs, time, bandits)
    labels = ['alpha = 0.1, with baseline',
              'alpha = 0.1, without baseline',
              'alpha = 0.4, with baseline',
              'alpha = 0.4, without baseline']

    for i in range(len(bandits)):
        plt.plot(best_action_counts[i], label=labels[i])
    plt.xlabel('Steps')
    plt.ylabel('% Optimal action')
    plt.legend()

    # plt.savefig('../images/figure_2_5.png')
    plt.close()


def figure_2_6(runs=2000, time=1000):
    labels = ['epsilon-greedy', 'gradient bandit',
              'UCB', 'optimistic initialization']
    generators = [lambda epsilon: Bandit(epsilon=epsilon, sample_averages=True),
                  lambda alpha: Bandit(gradient=True, step_size=alpha, gradient_baseline=True),
                  lambda coef: Bandit(epsilon=0, UCB_param=coef, sample_averages=True),
                  lambda initial: Bandit(epsilon=0, initial=initial, step_size=0.1)]
    # each sweep lists exponents; every bandit gets 2**param as its parameter value
    parameters = [np.arange(-7, -1, dtype=float),
                  np.arange(-5, 2, dtype=float),
                  np.arange(-4, 3, dtype=float),
                  np.arange(-2, 3, dtype=float)]

    bandits = []
    for generator, parameter in zip(generators, parameters):
        for param in parameter:
            bandits.append(generator(pow(2, param)))

    _, average_rewards = simulate(runs, time, bandits)
    rewards = np.mean(average_rewards, axis=1)

    i = 0
    for label, parameter in zip(labels, parameters):
        l = len(parameter)
        plt.plot(parameter, rewards[i:i + l], label=label)
        i += l
    plt.xlabel('Parameter (2^x)')
    plt.ylabel('Average reward')
    plt.legend()

    # plt.savefig('../images/figure_2_6.png')
    plt.close()


if __name__ == '__main__':
    figure_2_1()
    figure_2_2()
    figure_2_3()
    figure_2_4()
    figure_2_5()

    plt.violinplot(dataset=np.random.randn(200, 10) + np.random.randn(10))
    plt.xlabel("Action")
    plt.ylabel("Reward distribution")

    # bandits = []
    # bandits.append(Bandit(epsilon=0, initial=5, step_size=0.1))
    # bandits.append(Bandit(epsilon=0.1, initial=0, step_size=0.1))
    # simulate(runs=2000, time=100, bandits=bandits)
