import numpy as np
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod


class BanditAlgorithm(ABC):
67 """Base class for bandit algorithms"""
7-
8+
89 def __init__ (self , n_arms ):
910 self .n_arms = n_arms
1011 self .reset ()
11-
12+
1213 def reset (self ):
1314 self .counts = np .zeros (self .n_arms )
1415 self .rewards = np .zeros (self .n_arms )
1516 self .t = 0
16-
17+
1718 @abstractmethod
1819 def select_arm (self ):
1920 pass
20-
21+
2122 def update (self , arm , reward ):
2223 self .t += 1
2324 self .counts [arm ] += 1
2425 self .rewards [arm ] += reward
2526
27+
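
# The concrete algorithms below build on this bookkeeping: per-arm pull counts
# and summed rewards give the sample-average estimate
# Q(a) = rewards[a] / counts[a].

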
class EpsilonGreedy(BanditAlgorithm):
    """Epsilon-Greedy Algorithm"""

    def __init__(self, n_arms, epsilon=0.1):
        super().__init__(n_arms)
        self.epsilon = epsilon

    def select_arm(self):
        if np.random.random() < self.epsilon:
            # Explore: random arm
            return np.random.randint(self.n_arms)
        else:
            # Exploit: best arm so far
            avg_rewards = np.divide(
                self.rewards,
                self.counts,
                out=np.zeros_like(self.rewards),
                where=self.counts != 0,
            )
            return np.argmax(avg_rewards)
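
# Minimal usage sketch (illustrative only, not part of the experiment below):
#
#     agent = EpsilonGreedy(n_arms=3, epsilon=0.1)
#     arm = agent.select_arm()
#     agent.update(arm, reward=1.0)
#
# With epsilon = 0.1 the agent picks a uniformly random arm on roughly 10% of
# steps and otherwise exploits the arm with the best sample average so far.

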
class UCB(BanditAlgorithm):
    """Upper Confidence Bound Algorithm"""

    def __init__(self, n_arms, c=2.0):
        super().__init__(n_arms)
        self.c = c

    def select_arm(self):
        # If any arm hasn't been tried, try it
        if 0 in self.counts:
            return np.where(self.counts == 0)[0][0]

        # Calculate UCB values
        avg_rewards = self.rewards / self.counts
        confidence = self.c * np.sqrt(np.log(self.t) / self.counts)
        ucb_values = avg_rewards + confidence

        return np.argmax(ucb_values)
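
# UCB selects argmax_a [Q(a) + c * sqrt(ln t / N(a))]: Q(a) is the sample
# average, N(a) the pull count, and the bonus is an optimism term that shrinks
# as an arm is sampled, so rarely tried arms are periodically revisited.

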
class ThompsonSampling(BanditAlgorithm):
    """Thompson Sampling (Beta-Bernoulli)"""

    def __init__(self, n_arms):
        super().__init__(n_arms)
        self.alpha = np.ones(n_arms)  # Prior successes
        self.beta = np.ones(n_arms)  # Prior failures

    def select_arm(self):
        # Sample from Beta distribution for each arm
        samples = np.random.beta(self.alpha, self.beta)
        return np.argmax(samples)

    def update(self, arm, reward):
        super().update(arm, reward)
        # Update Beta parameters: count a positive reward as a success
        if reward > 0:
            self.alpha[arm] += 1
        else:
            self.beta[arm] += 1
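
# Beta-Bernoulli posterior update: a success turns Beta(a, b) into
# Beta(a + 1, b), a failure into Beta(a, b + 1). Counting any positive reward
# as a success is a pragmatic simplification, since this testbed actually
# produces Gaussian rather than Bernoulli rewards.

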
class GradientBandit(BanditAlgorithm):
    """Gradient Bandit Algorithm"""

    def __init__(self, n_arms, alpha=0.1):
        super().__init__(n_arms)
        self.alpha = alpha
        self.preferences = np.zeros(n_arms)
        self.avg_reward = 0

    def select_arm(self):
        # Softmax to get probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)
        return np.random.choice(self.n_arms, p=probs)

    def update(self, arm, reward):
        super().update(arm, reward)

        # Update average reward
        self.avg_reward += (reward - self.avg_reward) / self.t

        # Get action probabilities
        exp_prefs = np.exp(self.preferences - np.max(self.preferences))
        probs = exp_prefs / np.sum(exp_prefs)

        # Update preferences
        for a in range(self.n_arms):
            if a == arm:
                self.preferences[a] += (
                    self.alpha * (reward - self.avg_reward) * (1 - probs[a])
                )
            else:
                self.preferences[a] -= (
                    self.alpha * (reward - self.avg_reward) * probs[a]
                )
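
# This is the gradient bandit preference update (Sutton & Barto, ch. 2):
# H(a) += alpha * (R - baseline) * (1 - pi(a)) for the chosen arm, and
# H(a) -= alpha * (R - baseline) * pi(a) for the others, where the baseline
# is the running average reward and pi is the softmax over preferences.

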
# Testbed for comparing algorithms
class BanditTestbed:
    """Environment for testing bandit algorithms"""

    def __init__(self, n_arms=10, true_rewards=None):
        self.n_arms = n_arms
        if true_rewards is None:
            self.true_rewards = np.random.normal(0, 1, n_arms)
        else:
            self.true_rewards = true_rewards
        self.optimal_arm = np.argmax(self.true_rewards)

    def get_reward(self, arm):
        """Get noisy reward for pulling an arm"""
        return np.random.normal(self.true_rewards[arm], 1)

    def run_experiment(self, algorithm, n_steps=1000):
        """Run bandit algorithm for n_steps"""
        algorithm.reset()
        rewards = []
        optimal_actions = []

        for _ in range(n_steps):
            arm = algorithm.select_arm()
            reward = self.get_reward(arm)
            algorithm.update(arm, reward)

            rewards.append(reward)
            optimal_actions.append(1 if arm == self.optimal_arm else 0)

        return np.array(rewards), np.array(optimal_actions)
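
# Single-run sketch (illustrative): arm means are drawn once from N(0, 1) and
# each pull returns that mean plus unit Gaussian noise.
#
#     testbed = BanditTestbed(n_arms=5)
#     rewards, optimal = testbed.run_experiment(UCB(5), n_steps=500)
#     print(rewards.mean(), optimal.mean())

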
# Example usage and comparison
def compare_algorithms():
    """Compare different bandit algorithms"""

    # Create testbed
    testbed = BanditTestbed(n_arms=10)

    # Initialize algorithms
    algorithms = {
        "ε-greedy (0.1)": EpsilonGreedy(10, epsilon=0.1),
        "ε-greedy (0.01)": EpsilonGreedy(10, epsilon=0.01),
        "UCB (c=2)": UCB(10, c=2),
        "Thompson Sampling": ThompsonSampling(10),
        "Gradient Bandit": GradientBandit(10, alpha=0.1),
    }

    n_steps = 2000
    n_runs = 100
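
    # Every algorithm is evaluated on the same testbed instance, so all
    # methods face identical true arm means; run_experiment() resets the
    # algorithm, and only noise and each method's choices vary across runs.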

    results = {}

    for name, algorithm in algorithms.items():
        print(f"Running {name}...")
        avg_rewards = np.zeros(n_steps)
        optimal_actions = np.zeros(n_steps)

        for run in range(n_runs):
            rewards, optimal = testbed.run_experiment(algorithm, n_steps)
            avg_rewards += rewards
            optimal_actions += optimal

        avg_rewards /= n_runs
        optimal_actions /= n_runs

        results[name] = {"rewards": avg_rewards, "optimal_actions": optimal_actions}

    # Plot results
    plt.figure(figsize=(15, 5))

    # Average reward over time
    plt.subplot(1, 2, 1)
    for name, result in results.items():
        plt.plot(np.cumsum(result["rewards"]) / np.arange(1, n_steps + 1), label=name)
    plt.xlabel("Steps")
    plt.ylabel("Average Reward")
    plt.title("Average Reward vs Steps")
    plt.legend()
    plt.grid(True)

    # Percentage of optimal actions
    plt.subplot(1, 2, 2)
    for name, result in results.items():
        plt.plot(
            np.cumsum(result["optimal_actions"]) / np.arange(1, n_steps + 1) * 100,
            label=name,
        )
    plt.xlabel("Steps")
    plt.ylabel("% Optimal Action")
    plt.title("Optimal Action Selection vs Steps")
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return results


# Run the comparison
if __name__ == "__main__":
    results = compare_algorithms()

    # Print final performance
    print("\nFinal Performance (last 100 steps):")
    for name, result in results.items():
        avg_reward = np.mean(result["rewards"][-100:])
        optimal_pct = np.mean(result["optimal_actions"][-100:]) * 100
        print(
            f"{name:20s}: Avg Reward = {avg_reward:.3f}, Optimal = {optimal_pct:.1f}%"
        )