Hi all, I recently started learning reinforcement learning, and I tried to implement a simple Monte Carlo control algorithm for the Gambler's Problem. I wrote some code in Python, but the solution that came out after 100,000,000 iterations is this:

Optimal bet size for each level of capital from 0 to 99

My code is this:

import random

import numpy as np

# Number of training episodes and constant step size for the Q update.
M = 100000000
alpha = 0.1


# Assume H = 1 (heads), T = 0 (tails)
def flip_coin() -> int:
    """Flip a biased coin: return 1 (heads) with probability 0.4, else 0."""
    return random.choices([0, 1], weights=[0.6, 0.4], k=1)[0]


def policy(Q, epsilon, capital: int) -> int:
    """Epsilon-greedy bet selection.

    :param Q: action-value table indexed as Q[capital, bet]
    :param epsilon: probability of choosing a uniformly random bet
    :param capital: level of capital the player currently has
    :return: size of bet, in 0..capital
    """
    greedy_action = Q[capital, :].argmax()
    # With probability 1 - epsilon take the greedy bet, otherwise explore.
    do_greedy = random.choices([0, 1], weights=[epsilon, 1 - epsilon], k=1)[0]
    if do_greedy:
        return greedy_action
    # NOTE(review): randint(0, capital) allows a bet of 0, which leaves the
    # state unchanged; a greedy policy stuck on bet 0 would loop forever.
    # Consider randint(1, capital) — confirm intended action space first.
    return random.randint(0, capital)


def play_game(Q, epsilon):
    """Play one episode of the Gambler's Problem starting from capital 50.

    :param Q: action-value table consulted by the policy
    :param epsilon: exploration probability passed to the policy
    :return: [history, reward] where history is a list of [capital, bet]
             pairs and reward is the terminal payoff (100 on reaching the
             goal, 0 on going broke)
    """
    history = []
    capital = 50
    while 0 < capital < 100:
        bet = policy(Q, epsilon, capital)
        coin = flip_coin()
        history.append([capital, bet])
        if coin == 1:
            capital += bet
        else:
            capital -= bet
    # Loop exits exactly when the game is over: ruin (0) or goal (>= 100).
    return [history, 100 if capital >= 100 else 0]


def mc_control(track_history=True, num_episodes=M):
    """Every-visit constant-alpha Monte Carlo control for the Gambler's Problem.

    :param track_history: when True, record a snapshot of Q after each episode
    :param num_episodes: number of episodes to run (defaults to M); added as a
        backward-compatible parameter so small runs/tests don't need 1e8 episodes
    :return: (Q, Q_hist) — Q has shape (100, 100), rows = capital, cols = bet
    """
    Q = np.zeros((100, 100))  # index = capital, columns = bet size
    Q_hist = []
    t = 0
    # tqdm removed: it is a non-stdlib dependency and only draws a progress bar.
    for _ in range(num_episodes + 1):
        t += 1
        epsilon = max(0.1, 1 - 1e-5 * t)  # linearly decaying epsilon
        history, reward = play_game(Q, epsilon)
        if track_history:
            # BUG FIX: append a copy. Appending Q itself stores the same
            # mutable array every episode, so every "snapshot" ends up
            # identical to the final Q. (Beware: with the full M episodes
            # this history is ~100M arrays — enormous memory.)
            Q_hist.append(Q.copy())
        for capital, bet in history:
            # Constant-alpha every-visit MC update toward the episode return.
            Q[capital, bet] += alpha * (reward - Q[capital, bet])
    return Q, Q_hist

submitted by /u/yuriIsLifeFuckYou

[link] [comments]