Large Language Model Finetuning
Reinforcement learning (RL) has emerged as a powerful technique for improving the reasoning capabilities of large language models (LLMs). Models like DeepSeek-R1 and OpenAI's o1 exemplify this approach, demonstrating how RL can be used to develop LLMs with strong reasoning abilities without relying on traditional supervised fine-tuning. Because they are trained with reinforcement learning, these models develop agency and can be described as agents.
Reinforcement learning for reasoning
The standard approach to creating instruction-following LLMs has traditionally relied on supervised fine-tuning (SFT), where models are trained on high-quality, human-generated examples. However, this method has limitations when it comes to complex reasoning tasks. Reinforcement learning is particularly effective for enhancing reasoning because it:
Rewards the process, not just the outcome: reward mechanisms can be designed to value step-by-step thinking and self-correction
Allows for exploration: Models can try different reasoning approaches and learn which ones lead to better outcomes
Enables self-improvement cycles: Creating a virtuous loop where better reasoning leads to better rewards
What makes this approach powerful is that the model discovers effective reasoning strategies on its own. It might learn to:
Break complex problems into manageable steps
Double-check calculations along the way
Backtrack when it encounters contradictions
Generate structural outlines before diving into details
Verify final answers by working backward
Such strategies are known as emergent behaviours.
The agent receives no explicit instructions on which specific reasoning techniques to employ. It learns through trial and error which approaches tend to produce correct answers. This allows sophisticated reasoning patterns to emerge that weren't necessarily anticipated by the model's creators, similar to how AlphaGo discovered novel Go strategies through self-play.
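The rest of this page is a complete example of this approach using AgileRL: the GRPO (Group Relative Policy Optimization) algorithm is used to finetune Qwen2.5-3B on the Countdown task, in which the model must combine a given set of numbers with basic arithmetic operations to reach a target value. Two reward functions mirror the process/outcome distinction above: a format reward checks that the completion follows the <think>...</think><answer>...</answer> structure, and an equation reward checks that the proposed equation uses each number exactly once and evaluates to the target.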
import re
from typing import Tuple
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from agilerl.algorithms import GRPO
from agilerl.training.train_llm import finetune_llm
from agilerl.utils.llm_utils import HuggingFaceGym
MODEL_PATH = "Qwen/Qwen2.5-3B"
DATASET = "Jiayi-Pan/Countdown-Tasks-3to4"

def create_model(pretrained_model_name_or_path):
    # Load the base model in bfloat16 with flash attention to reduce memory usage
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
    )
    # Wrap the model with LoRA adapters so only a small fraction of weights is trained
    peft_config = LoraConfig(
        r=32,
        lora_alpha=32,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "up_proj",
            "down_proj",
            "gate_proj",
        ],
        task_type="CAUSAL_LM",
        lora_dropout=0.05,
    )
    model = get_peft_model(model, peft_config)
    return model

def countdown_chat_template(q, a, tokenizer):
    # Build the Countdown prompt and pre-fill the assistant turn with an opening <think> tag
    conversation = [
        {
            "role": "system",
            "content": "You are a helpful assistant. You first think about the reasoning process in your mind and then provide the user with the answer.",
        },
        {
            "role": "user",
            "content": f"Using each number in this tensor only once {tuple(i.item() for i in q)}, create an equation that equals {a.item()}. You can use basic arithmetic operations (+, -, *, /) and each number can only be used once. Show your work in <think> </think> tags. And return the final equation and answer in <answer> </answer> tags, for example <answer>(1 + 2) / 3</answer>.",
        },
        {"role": "assistant", "content": "Let me solve this step by step.\n<think>"},
    ]
    updated_prompt = tokenizer.apply_chat_template(
        conversation, tokenize=False, continue_final_message=True
    )
    tokenized_prompt = tokenizer(
        [updated_prompt],
        return_tensors="pt",
        padding=True,
        padding_side="left",
        return_attention_mask=True,
    )
    return tokenized_prompt

def make_dataset(dataset_name: str) -> Tuple[Dataset, Dataset]:
    # Load the Countdown dataset, shuffle it, and keep a 50,000-sample subset
    raw_dataset = (
        load_dataset(dataset_name, split="train").shuffle(seed=42).select(range(50000))
    )
    raw_dataset = raw_dataset.rename_column("target", "answer")
    raw_dataset = raw_dataset.rename_column("nums", "question")
    train_test_split = raw_dataset.train_test_split(test_size=0.1)
    train_dataset = train_test_split["train"]
    test_dataset = train_test_split["test"]
    return train_dataset, test_dataset

def format_reward_func(completions, target, **kwargs):
    rewards = []
    for completion, gt in zip(completions, target):
        try:
            # prepend a synthetic <think> tag, as it is already part of the prompt and
            # pre-filled for the assistant, so the completion matches the regex
            completion = "<think>" + completion
            regex = r"^<think>([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>\n<answer>([\s\S]*?)<\/answer>$"
            match = re.search(regex, completion, re.DOTALL)
            if match is None or len(match.groups()) != 2:
                rewards.append(0.0)
            else:
                rewards.append(1.0)
        except Exception:
            rewards.append(0.0)
    return rewards

def equation_reward_func(completions, target, nums, **kwargs):
    rewards = []
    for completion, gt, numbers in zip(completions, target, nums):
        try:
            # prepend a synthetic <think> tag, as it is already part of the prompt and
            # pre-filled for the assistant, so the completion matches the regex
            completion = "<think>" + completion
            answer_tags = re.findall(r"<answer>([\s\S]*?)<\/answer>", completion)
            if len(answer_tags) != 1:
                rewards.append(0.0)
                continue
            equation = answer_tags[0].strip()
            # Every provided number must be used exactly once
            used_numbers = [int(n) for n in re.findall(r"\d+", equation)]
            if sorted(used_numbers) != sorted(numbers):
                print(f"Numbers mismatch: {used_numbers} vs {numbers}")
                rewards.append(0.0)
                continue
            # Only digits, arithmetic operators, parentheses and whitespace are allowed
            allowed_pattern = r"^[\d+\-*/().\s]+$"
            if not re.match(allowed_pattern, equation):
                print(f"Equation format invalid: {equation}")
                rewards.append(0.0)
                continue
            # Evaluate the equation with builtins disabled and compare to the target
            result = eval(equation, {"__builtins__": None}, {})
            if abs(float(result) - float(gt)) < 1e-5:
                rewards.append(1.0)
            else:
                print(f"Result {result} doesn't match target {gt}")
                rewards.append(0.0)
        except Exception as e:
            print(f"Equation error: {e}")
            rewards.append(0.0)
    return rewards

def combined_rewards(completion, solution, prompt):
    # Sum the format reward and the equation reward; a perfect completion scores 2.0
    reward = (
        equation_reward_func([completion], [solution], [prompt])[0]
        + format_reward_func([completion], [solution])[0]
    )
    print(
        f"""
============================================
Completion: {completion}
Numbers: {prompt}
Correct Answer: {solution.item()}
Reward: {reward}
"""
    )
    if reward == 2.0:
        # Save fully correct completions for later inspection
        with open("countdown_completions.txt", "a") as text_file:
            text_file.write(
                f"Prompt {prompt}" + "\n" + completion + "\n" + "=" * 50 + "\n"
            )
    return reward

def custom_collate_fn(batch):
    # Extract answers and questions
    answers = torch.tensor([item["answer"] for item in batch])
    # For questions of variable length, we need to pad them
    # First, find the maximum length
    max_len = max(len(item["question"]) for item in batch)
    # Create padded tensor
    questions = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, item in enumerate(batch):
        q_len = len(item["question"])
        questions[i, :q_len] = torch.tensor(item["question"])
    return {"answer": answers, "question": questions}

def main():
    # Instantiate the model and the associated tokenizer
    model = create_model(MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    tokenizer.pad_token = tokenizer.eos_token
    train_dataset, test_dataset = make_dataset(DATASET)
    # Convert the HuggingFace dataset into a Gymnasium environment
    env = HuggingFaceGym(
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        tokenizer=tokenizer,
        reward_fn=combined_rewards,
        apply_chat_template_fn=countdown_chat_template,
        max_answer_tokens=1024,
        data_batch_size=1,
        custom_collate_fn=custom_collate_fn,
    )
    # Instantiate the GRPO agent
    agent = GRPO(
        env.observation_space,
        env.action_space,
        actor_network=model,
        pad_token_id=tokenizer.eos_token_id,
        batch_size=1,
        group_size=12,
        reduce_memory_peak=True,
        accelerator=Accelerator(),
    )
    # Run the finetuning loop
    finetune_llm(
        agent=agent,
        env=env,
        evaluation_interval=5,
        wb=True,
        checkpoint_interval=100,
        checkpoint_path="saved_llms",
        max_reward=2.0,
    )


if __name__ == "__main__":
    main()
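Because the agent is constructed with an Accelerator, the script is typically launched through HuggingFace Accelerate, e.g. `accelerate launch <your_script>.py` (the script name is a placeholder), after running `accelerate config` to choose the distributed setup.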