PenDigits with NeuralTS

In this tutorial, we will be training a NeuralTS agent to solve the PenDigits dataset, converted into a bandit environment. We will also use evolutionary hyperparameter optimization on a population of agents.

To complete the PenDigits environment, the agent must learn to select the best arm, or action, to take in a given context, or state.


Figure 1: Cumulative regret from training on the PenDigits dataset


Figure 2: Reward from training on the PenDigits dataset

NeuralTS (Neural Contextual Bandits with Thompson Sampling) adapts deep neural networks for both exploration and exploitation by using a posterior distribution of the reward with a neural network approximator as its mean, and neural tangent features as its variance.

For this tutorial, we will use the labelled PenDigits dataset from the UCI Machine Learning Repository. These datasets can easily be imported and used for training with the Python package ucimlrepo, and to choose from the hundreds of available datasets it is as simple as changing the id parameter used by fetch_uci_repo. We can convert these labelled datasets into a bandit learning environment easily by using the agilerl.wrappers.learning.BanditEnv class.

"""This tutorial shows how to train an NeuralTS agent on the PenDigits dataset with evolutionary HPO.

Authors: Nick (

import matplotlib.pyplot as plt
import numpy as np
import torch
from scipy.ndimage import gaussian_filter1d
from tqdm import trange
from ucimlrepo import fetch_ucirepo

from agilerl.components.replay_buffer import ReplayBuffer
from agilerl.hpo.mutation import Mutations
from agilerl.hpo.tournament import TournamentSelection
from agilerl.utils.utils import initialPopulation
from agilerl.wrappers.learning import BanditEnv

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    NET_CONFIG = {
        "arch": "mlp",  # Network architecture
        "hidden_size": [128],  # Actor hidden size

    INIT_HP = {
        "POPULATION_SIZE": 4,  # Population size
        "BATCH_SIZE": 64,  # Batch size
        "LR": 0.001,  # Learning rate
        "GAMMA": 1.0,  # Scaling factor
        "LAMBDA": 1.0,  # Regularization factor
        "REG": 0.0625,  # Loss regularization factor
        "LEARN_STEP": 1,  # Learning frequency
        # Swap image channels dimension from last to first [H, W, C] -> [C, H, W]
        "CHANNELS_LAST": False,

    # Fetch data
    pendigits = fetch_ucirepo(id=81)
    features =
    targets =

    env = BanditEnv(features, targets)  # Create environment
    context_dim = env.context_dim
    action_dim = env.arms

    # Create a population of NeuralTS agents for evolutionary HPO.
    pop = initialPopulation(
        algo="NeuralTS",  # Algorithm
        state_dim=context_dim,  # State dimension
        action_dim=action_dim,  # Action dimension
        one_hot=None,  # One-hot encoding
        net_config=NET_CONFIG,  # Network configuration
        INIT_HP=INIT_HP,  # Initial hyperparameters
        population_size=INIT_HP["POPULATION_SIZE"],  # Population size
        device=device,  # NOTE(review): device arg per AgileRL tutorials — confirm for installed version
    )

    # Shared replay buffer storing (context, reward) pairs for all agents.
    field_names = ["context", "reward"]
    memory = ReplayBuffer(
        action_dim=action_dim,  # Number of agent actions
        memory_size=10000,  # Max replay buffer size
        field_names=field_names,  # Field names to store in memory
        device=device,  # NOTE(review): device arg per AgileRL tutorials — confirm for installed version
    )

    # Tournament selection picks the parents of the next generation.
    tournament = TournamentSelection(
        tournament_size=2,  # Tournament selection size
        elitism=True,  # Elitism in tournament selection
        population_size=INIT_HP["POPULATION_SIZE"],  # Population size
        evo_step=1,  # Evaluate using last N fitness scores
    )

    # Mutation operator applied to the selected population each evolution step.
    mutations = Mutations(
        algo="NeuralTS",  # Algorithm
        no_mutation=0.4,  # No mutation
        architecture=0.2,  # Architecture mutation
        new_layer_prob=0.2,  # New layer mutation
        parameters=0.2,  # Network parameters mutation
        activation=0.2,  # Activation layer mutation
        rl_hp=0.2,  # Learning HP mutation
        rl_hp_selection=["lr", "batch_size"],  # Learning HPs to choose from
        mutation_sd=0.1,  # Mutation strength
        mutate_elite=False,  # Mutate best agent in population
        arch=NET_CONFIG["arch"],  # Network architecture
        rand_seed=1,  # Random seed
        device=device,  # NOTE(review): device arg per AgileRL tutorials — confirm for installed version
    )

    max_episodes = 50  # Max training episodes
    max_steps = 50  # Max steps per episode

    evo_epochs = 2  # Evolution frequency
    evo_loop = 1  # Number of evaluation episodes


    # Per-agent cumulative regret and per-step reward histories, seeded with 0
    # so regret[i][-1] is always defined on the first step.
    regret = [[0] for _ in pop]
    score = [[0] for _ in pop]
    total_steps = 0

    # TRAINING LOOP
    for idx_epi in trange(max_episodes):
        for i, agent in enumerate(pop):  # Loop through population
            losses = []
            context = env.reset()  # Reset environment at start of episode
            for idx_step in range(max_steps):
                # Get next action from agent
                action = agent.getAction(context)
                next_context, reward = env.step(action)  # Act in environment

                # Save experience to replay buffer
                memory.save2memory(context[action], reward)

                # Learn according to learning frequency
                if (
                    memory.counter % agent.learn_step == 0
                    and len(memory) >= agent.batch_size
                ):
                    for _ in range(2):
                        experiences = memory.sample(
                            agent.batch_size
                        )  # Sample replay buffer
                        # Learn according to agent's RL algorithm
                        loss = agent.learn(experiences)
                        losses.append(loss)

                context = next_context
                # Regret grows by (1 - reward): 0 for a correct arm, 1 otherwise.
                regret[i].append(regret[i][-1] + 1 - reward)
                score[i].append(reward)

            total_steps += max_steps

        # Now evaluate and evolve population if necessary
        if (idx_epi + 1) % evo_epochs == 0:
            # Evaluate population
            fitnesses = [
                agent.test(
                    env,
                    swap_channels=INIT_HP["CHANNELS_LAST"],
                    max_steps=evo_loop,
                )
                for agent in pop
            ]

            print(f"Episode {idx_epi+1}/{max_episodes}")
            print(f"Regret: {[regret[i][-1] for i in range(len(pop))]}")

            # Tournament selection and population mutation
            elite, pop = tournament.select(pop)
            pop = mutations.mutation(pop)

    # Plot the results: cumulative regret per agent over training steps.
    plt.figure()
    for i, agent_regret in enumerate(regret):
        plt.plot(
            np.linspace(0, total_steps, len(agent_regret)),
            agent_regret,
            label=f"NeuralTS: Agent {i}",
        )
    plt.xlabel("Training Step")
    plt.ylabel("Regret")
    plt.legend()
    plt.savefig("NeuralTS-PenDigits-regret.png")

    # Plot smoothed per-step reward per agent (Gaussian filter for readability).
    plt.figure()
    for i, agent_score in enumerate(score):
        smoothed_score = gaussian_filter1d(agent_score, sigma=80)
        plt.plot(
            np.linspace(0, total_steps, len(smoothed_score)),
            smoothed_score,
            label=f"NeuralTS: Agent {i}",
        )
    plt.xlabel("Training Step")
    plt.ylabel("Reward")
    plt.legend()
    plt.savefig("NeuralTS-PenDigits-reward.png")