KDD Cup | Humanities Track Tutorial: Bandit Randomized Probability Matching

Date Created: June 12, 2019

State

$S \in \{1,2,3,4,5\}$

Action

$A_S = [a_{\text{ITN}}, a_{\text{IRS}}]$

where $a_{\text{ITN}} \in [0,1]$ is the coverage of insecticide-treated nets (ITN) and $a_{\text{IRS}} \in [0,1]$ is the coverage of indoor residual spraying (IRS)

Reward

$R_{\pi} \in (-\infty, \infty)$
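
Concretely, a full policy for this environment maps each of the five states (the five simulated years of the challenge) to one $[a_{\text{ITN}}, a_{\text{IRS}}]$ pair, which is also the format the agent's generate() method returns below. A minimal illustration with made-up coverage values:

example_policy = {
    1: [0.9, 0.1],
    2: [0.8, 0.2],
    3: [0.7, 0.3],
    4: [0.6, 0.4],
    5: [0.5, 0.5],
}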


In [1]:
import numpy as np
from collections import defaultdict
import random
# !pip3 install git+https://github.com/slremy/netsapi --user --upgrade
from netsapi.challenge import * 

Bandit Randomized Probability Matching Agent Code:

The agent below treats the problem as a multi-armed bandit. The action space is discretized into pairs $(x, 1-x)$ so that ITN and IRS coverage sum to one, each arm keeps a Beta posterior over its rescaled reward, and actions are selected by randomized probability matching (Thompson sampling): draw one sample from every posterior and play the arm with the largest sample.

In [2]:
class BanditRPM(object):
    def __init__(self, env):
        self.env = env
        self.action_resolution = 0.1
        self.actions = self.actionSpace()

        # Each discretized action ("arm") starts with the same Beta(2, 5)
        # prior over its rescaled reward.
        self.ActionValue = {}
        self.init = (2, 5)
        for key in self.actions:
            self.ActionValue[key] = self.init
        
    def actionSpace(self):
        # Discretize the action space at the chosen resolution into pairs
        # (x, 1 - x), so ITN and IRS coverage always sum to one.
        x = np.arange(0, 1 + self.action_resolution, self.action_resolution)
        y = 1 - x
        x = x.reshape(len(x), 1)
        y = y.reshape(len(y), 1)
        xy = np.concatenate((x, y), axis=1)
        xy = xy.round(2)
        xy = [tuple(row) for row in xy]

        return xy
        
    
    def choose_action(self):
        """
        Use Thompson sampling to choose an action: draw one sample from each
        arm's Beta posterior and play the arm with the largest sample.
        """
        samples = {}
        for key in self.ActionValue:
            samples[key] = np.random.beta(self.ActionValue[key][0], self.ActionValue[key][1])
        return max(samples, key=samples.get)

    def update(self, action, reward):
        """
        Update the parameters of the chosen arm's posterior, a Beta distribution.
        """
        a, b = self.ActionValue[action]
        # Treat reward/100 as a fractional success: it is added to the success
        # count a and its complement to the failure count b, so every update
        # contributes exactly one pseudo-observation.
        a = a + reward / 100
        b = b + 1 - reward / 100
        # Beta parameters must stay positive; clamp them in case the
        # environment returns rewards outside [0, 100].
        a = 0.001 if a <= 0 else a
        b = 0.001 if b <= 0 else b

        self.ActionValue[action] = (a, b)
        
    def train(self):
        # Run the allotted training episodes, updating the posterior of the
        # chosen arm after every environment step.
        for _ in range(20):  # Do not change
            self.env.reset()
            while True:
                action = self.choose_action()
                nextstate, reward, done, _ = self.env.evaluateAction(list(action))
                self.update(action, reward)
                if done:
                    break


    def generate(self):
        self.train()
        # The bandit is stateless, so sample one action per state from the
        # trained posteriors to assemble a full five-state policy.
        best_policy = {state: list(self.choose_action()) for state in range(1, 6)}
        best_reward = self.env.evaluatePolicy(best_policy)

        print(best_policy, best_reward)

        return best_policy, best_reward
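
As a sanity check on the update rule, here is a small standalone sketch (with made-up rewards, independent of the challenge environment) showing how an arm's Beta posterior drifts toward "success" and how Thompson sampling then picks between two arms:

import numpy as np

# Made-up rewards for illustration only.
posterior = (2, 5)  # the agent's Beta(2, 5) prior
for reward in (80, 60, 90):
    a, b = posterior
    posterior = (a + reward / 100, b + 1 - reward / 100)
print(posterior)  # roughly (4.3, 5.7): mass has shifted toward success

# Thompson sampling between two arms: each arm's posterior is sampled once
# and the arm with the larger sample is played, so arms that have looked
# better are played more often without being played exclusively.
arms = {(0.9, 0.1): (4.3, 5.7), (0.1, 0.9): (2, 5)}
samples = {arm: np.random.beta(a, b) for arm, (a, b) in arms.items()}
print(max(samples, key=samples.get))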

Run the EvaluateChallengeSubmission method with your agent class:

In [ ]:
EvaluateChallengeSubmission(ChallengeSeqDecEnvironment, BanditRPM, "BanditRPM_submission.csv")