<a href="https://colab.research.google.com/github/cmu-llms-class/11-766-hw1-dev/blob/main/src/task2/11_766_HW1_Problem2_food_classifier_merged.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
# Install PyTorch & other libraries (Colab setup cell; versions are pinned where given)
%pip install "torch==2.4.1" tensorboard
# flash-attn may need to be removed if it does not match the installed library versions
# (the model below is loaded with attn_implementation="sdpa", which avoids the flash-attn path)
%pip install flash-attn "setuptools<71.0.0" scikit-learn

# Install Hugging Face libraries
%pip install  --upgrade \
  "datasets==3.1.0" \
  "hf-transfer==0.1.8" \
  "transformers==4.48.1" \
  "accelerate>=0.34.0" \
  "peft==0.13.2" \
  # "accelerate==1.2.1" \
  # "transformers==4.47.1"
# Replace the preinstalled torchvision with the 0.19.1 build from the CUDA 12.1 wheel index
%pip uninstall -y torchvision
%pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu121 torchvision==0.19.1+cu121
# ModernBERT is not yet available in an official release, so we need to install it from github
# %pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade


In [None]:
# @title Imports
import pandas as pd
import gspread
import pandas as pd
from google.colab import userdata
from openai import OpenAI
from tqdm import tqdm

# Authenticate the Colab user so Google credentials are available to this runtime.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
# default() returns a pair; only the credentials object (first element) is used.
creds, _ = default()

# Authorized gspread client, used below to open the annotation spreadsheet.
gc = gspread.authorize(creds)

# Part 1

Make a copy of [this spreadsheet](https://docs.google.com/spreadsheets/d/1rKTpPvXESM5PDOc4clAHiYuowC9zi5P3WbkknWAZJgs/edit?usp=sharing) and annotate the data. Then, update the `SHEET_URL` variable below to link to your copy of the spreadsheet.

In [None]:
# @title Load data

# SHEET_URL = "https://docs.google.com/spreadsheets/d/1Aq9NJT7vjbjI-1NJMk125XV6E6Sf-39jxuawtv3A5S4/edit?gid=0#gid=0" # @param {"type":"string","placeholder":""}
SHEET_URL = "https://docs.google.com/spreadsheets/d/1Lt_SX9QGHB08XaxermeI7q61TRbwwM6B12HzdpqAf_E" # @param {"type":"string","placeholder":""}

worksheet = gc.open_by_url(SHEET_URL).sheet1

# The first row of the sheet is the header; every following row is one annotated food.
rows = worksheet.get_all_values()
if not rows:
  raise ValueError("The spreadsheet is empty: expected a header row followed by data rows.")
df = pd.DataFrame.from_records(rows[1:], columns=rows[0])

# Class names in label order: index 0 <-> label 1, index 1 <-> label 2, index 2 <-> label 3.
LABEL_NAMES = ["wouldn't want to eat", "meh", "sounds tasty"]
# Later cells refer to the lowercase name, so expose it as an alias of the same list.
label_names = LABEL_NAMES
num_labels = len(LABEL_NAMES)

# Split the annotated rows by the value of the sheet's "Split" column.
train_df = df[df["Split"] == "train"]
valid_df = df[df["Split"] == "valid"]
test_df = df[df["Split"] == "test"]

print(df["Label"].value_counts())
valid_df

# Part 2: Use the OpenAI API to build a food tastiness classifier.

To begin, you should create an API key with OpenAI. Then add that key as a secret to Colab by clicking on the key symbol on the left. Give it the name `OPENAI_API_KEY`.

We have provided you a very simple classifier implementation that does in-context learning.
You should try to improve this classifier using any of the following approaches:

- [Finetuning API](https://platform.openai.com/docs/guides/supervised-fine-tuning)
- [Structured model outputs](https://platform.openai.com/docs/guides/structured-outputs?api-mode=chat)
- Better few-shot prompting
- More complex system instruction and prompt format
- [Different models](https://platform.openai.com/docs/models) and [inference parameters](https://platform.openai.com/docs/api-reference/responses/create)
- Performing one annotation per API call versus several annotations per API call

However, please stick with the OpenAI API for this question, and remember to be mindful of how much you are spending. You should try to spend no more than 15 USD during your experimentation.

In [None]:
# @title Functions for calling OpenAI API
# Shared OpenAI client; the API key is read from the Colab secret named OPENAI_API_KEY.
_CLIENT = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))

def create_system_prompt(df):
  """Build the instruction prompt, seeded with few-shot examples taken from ``df``.

  Args:
    df: DataFrame with ``Food`` and ``Label`` columns. Every row becomes one
      tab-separated ``Food<TAB>Label`` example line in the prompt.

  Returns:
    The prompt string: the task description, the example lines, and the
    instruction to answer with just the number.
  """
  few_shot_examples = "\n".join(
      f"{row.Food}\t{row.Label}" for row in df.itertuples())

  # Adjacent string literals are concatenated by the parser; note the space in
  # '"3" if' (it was previously missing, yielding '"3"if you think').
  return (
      "You are an expert predictor of my food preferences. "
      "For each food item I provide, you should output \"1\" if you think "
      "I'll dislike it, \"2\" if you think I'll have no opinion, and \"3\" if "
      "you think I'll really like it.\n\n"
      f"Here are some preferences to start you off: {few_shot_examples}\n\n"
      "Answer with just the number."
  )

def prompt_gpt5(food_name):
  """Ask the OpenAI Responses API to rate a single food item.

  The developer message carries the few-shot prompt built from ``train_df``;
  the user message carries ``food_name``.

  Args:
    food_name: Name of the food to classify.

  Returns:
    The response object returned by ``_CLIENT.responses.create``. Callers in
    this notebook read the answer from ``response.output[-1].content[0].text``.
  """
  response = _CLIENT.responses.create(
    model="gpt-5-nano-2025-08-07",
    input=[
      {
        "role": "developer",
        "content": [
            {"type": "input_text", "text": create_system_prompt(train_df)}]
      },
      {
        "role": "user",
        # Send the requested food; this was previously hard-coded to
        # "Cherry icecream", so every call classified the same item.
        "content": [{"type": "input_text", "text": food_name}]
      },
    ],
    text={
      "format": {
        "type": "text"
      },
      "verbosity": "medium"
    },
    reasoning={
      "effort": "medium",
      "summary": "auto"
    },
    tools=[],
    store=True,
    include=[
      "reasoning.encrypted_content",
    ]
  )
  return response


# Show the exact prompt the model will receive.
system_prompt = create_system_prompt(train_df)
print("===System prompt===", system_prompt, sep="\n")

# One-off sanity check on a single food before running the full validation set.
print("\n===Grilled Cheese Test===")
response = prompt_gpt5("Grilled Cheese Sandwich")
_final_item = response.output[-1]
print(_final_item.content[0].text)

In [None]:
# @title Run inference on full validation set

# One API call per validation row; responses are kept in row order so the
# evaluation cell can zip them back against valid_df.
gpt_responses = [
  prompt_gpt5(row.Food)
  for row in tqdm(valid_df.itertuples(), total=len(valid_df))
]


In [None]:
# @title Evaluation
predicted_labels = []
true_labels = []
correct = []

for response, row in zip(gpt_responses, valid_df.itertuples()):
  # The prompt asks for just the number, so the reply text is parsed as an int.
  predicted_label = int(response.output[-1].content[0].text)
  # Only the first character of the annotated label is used as the class number.
  true_label = int(row.Label[0])

  predicted_labels.append(predicted_label)
  true_labels.append(true_label)
  correct.append(predicted_label == true_label)

# Guard the division so an empty validation split does not raise ZeroDivisionError.
if correct:
  print("Accuracy:", sum(correct) / len(correct))
else:
  print("Accuracy: n/a (no validation examples)")

import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Generate the confusion matrix.
# Pin the class ids explicitly: without `labels`, confusion_matrix only includes
# classes that occur in the data, so a class missing from both lists would shrink
# the matrix and misalign it with the three LABEL_NAMES tick labels.
class_ids = list(range(1, len(LABEL_NAMES) + 1))
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues",
    xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Generate the confusion matrix from the lists filled in by the evaluation cell.
# `labels` pins the matrix to all classes so it always lines up with the tick labels,
# even when a class never appears in true_labels / predicted_labels.
class_ids = list(range(1, len(LABEL_NAMES) + 1))
cm = confusion_matrix(true_labels, predicted_labels, labels=class_ids)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
# Use LABEL_NAMES (the name defined in the load-data cell); the lowercase
# `label_names` used here before was never defined and raised NameError.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# Part 3: Finetune a tiny model on the synthetically labeled data.

**Fine-tune a classifier with ModernBERT:** Large Language Models (LLMs) have become ubiquitous in 2024. However, smaller, specialized models - particularly for classification tasks - remain critical for building efficient and cost-effective AI systems. In this part, we will fine-tune a smaller model (ModernBERT, a new compact encoder model) on your own food classification dataset, and evaluate the fine-tuned model on your own food test set.

ModernBERT is a refreshed version of BERT models, with 8192 token context length, significantly better downstream performance, and much faster processing speeds.

For this part, you will need to:
1. Load and prepare the classification dataset  
2. Fine-tune & evaluate ModernBERT with the Hugging Face `Trainer`
3. Run inference & test model

**Quick intro: ModernBERT**: ModernBERT is a modernization of BERT maintaining full backward compatibility while delivering dramatic improvements through architectural innovations like rotary positional embeddings (RoPE), alternating attention patterns, and hardware-optimized design. The model comes in two sizes:
- ModernBERT Base (139M parameters, we are using it in this part)
- ModernBERT Large (395M parameters)

ModernBERT achieves state-of-the-art performance across classification, retrieval and code understanding tasks while being 2-4x faster than previous encoder models. This makes it ideal for high-throughput production applications like LLM routing, where both accuracy and latency are critical.

ModernBERT was trained on 2 trillion tokens of diverse data including web documents, code, and scientific articles - making it much more robust than traditional BERT models trained primarily on Wikipedia. This broader knowledge helps it better understand the nuances of user prompts across different domains.

If you want to learn more about ModernBERT's architecture and training process, check out the official [blog](https://huggingface.co/blog/modernbert).

---

Now let's get started building our food classifier with ModernBERT! 🚀

*Note: This part was created and tested on a Colab T4 GPU.*

### Setup environment and install libraries

Our first step is to install the Hugging Face libraries and PyTorch, including transformers and datasets.

We will use the [Hugging Face Hub](https://huggingface.co/models) as a remote model versioning service. This means we will automatically push our model, logs and information to the Hub during training. You must register on the [Hugging Face](https://huggingface.co/join) for this, then create an access token at [this link](https://huggingface.co/settings/tokens).

Add your token to the Colab's list of secrets with the name `HF_TOKEN`.

We will use the `login` util from the `huggingface_hub` package to log into our account and store our token (access key) on the disk.

In [None]:
from huggingface_hub import login

# Log in with the token stored in the Colab secret named HF_TOKEN.
login(token=userdata.get('HF_TOKEN'), add_to_git_credential=True)

### 1. Load and prepare the dataset

We will fine-tune ModernBERT on data with the same food-preference structure used in the previous parts (a `Food` name plus a 1/2/3 `Label`).
Concretely, let's use the [🤗 Datasets](https://huggingface.co/docs/datasets/index) library to build a `DatasetDict` from the train, validation, and test DataFrames.

In [None]:
from datasets import Dataset, DatasetDict

# This cell only needs the train/valid/test frames from Part 1. The GPT-labeled
# `synth_df` comes from the optional distillation section at the end of the notebook
# and is not used here, so requiring it made a fresh top-to-bottom run fail.
for _required_name in ("train_df", "valid_df", "test_df"):
    assert _required_name in globals(), f"Run Part 1 to create {_required_name} before this cell."

def _ensure_int_labels(df):
    """Extract numeric labels from string format like '1 - wouldn't want to eat'."""
    df = df[["Food", "Label"]].dropna().reset_index(drop=True).copy()

    # Handle both string format ("1 - ...") and integer format
    def extract_label(label):
        if isinstance(label, str):
            # Extract the first character (the number)
            return int(label.split()[0])
        else:
            return int(label)

    df["Label"] = df["Label"].apply(extract_label)
    assert set(df["Label"].unique()).issubset({1, 2, 3}), "Labels must be 1/2/3."
    return df

# Normalise every split to (Food, integer Label) before handing it to Datasets.
train_df_synth = _ensure_int_labels(train_df)
valid_df_food = _ensure_int_labels(valid_df)
test_df_food = _ensure_int_labels(test_df)

_split_frames = {
    "train": train_df_synth,
    "validation": valid_df_food,
    "test": test_df_food,
}
raw_dataset = DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in _split_frames.items()}
)

# Report the size of each split, then one example row from each.
for _title, _split in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{_title} dataset size: {len(raw_dataset[_split])}")
print(f"Train data sample: {raw_dataset['train'][0]}")
print(f"Validation data sample: {raw_dataset['validation'][0]}")
print(f"Test data sample {raw_dataset['test'][0]}")

To train our model, we need to convert our text prompts to token IDs. This is done by a Tokenizer, which tokenizes the inputs (including converting the tokens to their corresponding IDs in the pre-trained vocabulary). If you want to learn more about this, check out **[chapter 6](https://huggingface.co/course/chapter6/1?fw=pt)** of the [Hugging Face Course](https://huggingface.co/course/chapter1/1).

In [None]:
from transformers import AutoTokenizer

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Cap the tokenizer's maximum sequence length at 512 tokens.
tokenizer.model_max_length = 512  # food names are short


# Tokenize helper function
def tokenize(batch):
    """Tokenize the ``Food`` texts of ``batch`` with the module-level ``tokenizer``.

    Padding to the maximum length and truncation are both enabled.
    """
    food_texts = batch["Food"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(food_texts, **encoding_options)


# Tokenize dataset
def _zero_based_labels(batch):
    """Derive the Trainer-compatible ``labels`` column (0/1/2) from ``Label`` (1/2/3)."""
    return {"labels": [int(value) - 1 for value in batch["Label"]]}


# The original Label column is kept for clarity; only a new `labels` column is added.
raw_dataset = raw_dataset.map(_zero_based_labels, batched=True)

# If you want to drop raw text later, pass remove_columns=["Food"]
tokenized_dataset = raw_dataset.map(tokenize, batched=True)

print(tokenized_dataset["train"].features.keys())

### 2. Fine-tune & evaluate ModernBERT with the Hugging Face `Trainer`

After we have processed our dataset, we can start training our model. We will use the [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base) model. The first step is to load our model with `AutoModelForSequenceClassification` class from the [Hugging Face Hub](https://huggingface.co/answerdotai/ModernBERT-base). This will initialize the pre-trained ModernBERT weights with a classification head on top. Here we pass the number of classes (3) from our dataset and the label names to have readable outputs for inference.

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Model id to load the pre-trained weights from
model_id = "answerdotai/ModernBERT-base"

# Prepare model labels - useful for inference.
# LABEL_NAMES is the list defined in the load-data cell; the lowercase `label_names`
# and `num_labels` used here before were never successfully defined (NameError).
# Ids are plain ints (0..n-1), matching the 0-based `labels` column of the dataset.
num_labels = len(LABEL_NAMES)
label2id = {name: i for i, name in enumerate(LABEL_NAMES)}
id2label = {i: name for i, name in enumerate(LABEL_NAMES)}

# Download the model from huggingface.co/models
# NOTE(review): the weights are loaded in float16 and the TrainingArguments in this
# notebook set no mixed-precision flag - confirm pure-fp16 training is intended.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    attn_implementation="sdpa",  # avoids flash-attn path
    torch_dtype=torch.float16,    # good default on A100/L4/T4? (T4 prefers fp16)
)

We evaluate our model during training. The `Trainer` supports evaluation during training by accepting a `compute_metrics` function. We use scikit-learn to calculate accuracy and the weighted [F1 metric](https://huggingface.co/spaces/evaluate-metric/f1) on our held-out evaluation split during training.

In [None]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# Metric helper method
def compute_metrics(eval_pred):
    """Score one evaluation pass for the ``Trainer``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``"accuracy"`` and weighted ``"f1"``, both as Python floats.
    """
    scores_per_class, references = eval_pred
    predicted_classes = np.argmax(scores_per_class, axis=1)

    metrics = {}
    metrics["accuracy"] = float(accuracy_score(references, predicted_classes))
    metrics["f1"] = float(f1_score(references, predicted_classes, average="weighted"))
    return metrics

The last step is to define the hyperparameters (`TrainingArguments`) we use for our training. Here we use the fused AdamW optimizer (`optim="adamw_torch_fused"`) for faster training; the `torch_compile` option in `TrainingArguments` is another optional speed-up.

We also leverage the [Hugging Face Hub](https://huggingface.co/models) integration of the `Trainer` to push our checkpoints, logs, and metrics during training into a repository.

In [None]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# Define training args
training_args = TrainingArguments(
    output_dir="modernbert-llm-router",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=1,
    optim="adamw_torch_fused", # improved optimizer
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # The best checkpoint (by accuracy on eval_dataset) is restored after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # experiment tracking
    report_to="tensorboard",
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # Evaluate and pick the best checkpoint on the validation split. Using the test
    # split here would leak it into model selection and inflate the final test score.
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

We can start our training by using the **`train`** method of the `Trainer`.

In [None]:
# Run fine-tuning with the Trainer instance created in the previous cell.
trainer.train()

# Optional: Synthetic Dataset for Distillation

Suppose you have a big, highly capable model, but you want to develop a much smaller model that has learned a set of skills from the big model.
One way to do this is through model distillation--finetuning the small model to produce outputs that look like those of the big model.
A very simple way to do model distillation is to generate a large number of examples using the big model, and then finetune the small model on those examples.

Below, we have provided you starter code that uses the OpenAI API to label 1,000 food items. As one approach for improving your BERT model, you may consider
training on a larger dataset of GPT-5-labeled foods.

In [None]:
# @title Load + sample unlabeled recipes
!pip -q install datasets

import json
import re

import pandas as pd
from datasets import load_dataset

DATASET_ID = "Shengtao/recipe"
SEED = 42
N_UNLABELED = 1000  # FOR STUDENT IMPL
MAX_UNLABELED = 10000

dataset = load_dataset(DATASET_ID, split="train")  # FOR STUDENT IMPL
raw_titles = dataset.shuffle(seed=SEED).select(range(min(len(dataset), MAX_UNLABELED * 3)))["title"]

s = (
    pd.Series(raw_titles)
    .astype(str)
    .str.strip()
    .str.replace(r"\s+(?=[IVXLCDM]+$)[IVXLCDM]+$", "", regex=True)
)
clean_titles = s[s.ne("") & ~s.str.lower().duplicated()].head(N_UNLABELED).tolist()

unlabeled_df = pd.DataFrame({"Food": clean_titles})
unlabeled_df.head()


In [None]:
# @title Build Batch input JSONL for OpenAI
from io import BytesIO
from openai import OpenAI
from google.colab import userdata

# Client used for the file-upload and batch calls in the cells that follow.
client = OpenAI(api_key=userdata.get("OPENAI_API_KEY"))  # FOR STUDENT IMPL

# Every batch request reuses the same few-shot prompt built from train_df.
SYSTEM_PROMPT = create_system_prompt(train_df)
MODEL_NAME = "gpt-5.2-2025-12-11"  # FOR STUDENT IMPL


def build_request(food_name, custom_id):
    """Build one batch request (a JSON-serialisable dict) for a single food item.

    Args:
        food_name: Text sent as the user message.
        custom_id: Identifier stored under ``"custom_id"`` so the matching output
            line can be mapped back to this request.

    Returns:
        Dict describing a POST to ``/v1/responses`` whose body uses ``MODEL_NAME``
        and sends ``SYSTEM_PROMPT`` as the developer message.
    """
    def _text_message(role, text):
        # Single-part text message in the shape used by the request body.
        return {"role": role, "content": [{"type": "input_text", "text": text}]}

    body = {
        "model": MODEL_NAME,
        "input": [
            _text_message("developer", SYSTEM_PROMPT),
            _text_message("user", food_name),
        ],
        "text": {"format": {"type": "text"}},
        "reasoning": {"effort": "low", "summary": "auto"},
    }
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/responses",
        "body": body,
    }


# One JSON document per food; the custom_id encodes the row position in unlabeled_df.
batch_lines = []
for i, food in enumerate(unlabeled_df["Food"]):
    request = build_request(food, custom_id=f"food-{i}")
    batch_lines.append(json.dumps(request))

print("Batch input lines:", len(batch_lines))
display(unlabeled_df.head())
display(pd.Series(batch_lines[:3], name="jsonl"))

# Join into JSONL (one request per line) and encode for upload.
batch_input_jsonl = "\n".join(batch_lines)
batch_bytes = batch_input_jsonl.encode("utf-8")

# In-memory file object holding the encoded JSONL.
batch_file = BytesIO(batch_bytes)
batch_file.name = "openai_batch_input.jsonl"  # required by OpenAI file upload

In [None]:
# @title Submit batch + poll status
import time

# Upload the in-memory JSONL built in the previous cell as a batch input file.
batch_input_file = client.files.create(
    file=batch_file,
    purpose="batch",
)  # FOR STUDENT IMPL

print("Uploaded batch file id:", batch_input_file.id)

# Create the batch job over the uploaded file; the endpoint matches the "url"
# field written into every request line.
batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/responses",
    completion_window="24h",
    metadata={"job": "food-preference-synth"},
)  # FOR STUDENT IMPL

# Keep this id: the status cell below accepts it via BATCH_ID after a runtime restart.
print("Batch id:", batch.id)

You can check on the status of your batch at [this link](https://platform.openai.com/batches/batch_696c17cbec308190b210336da56d092a) or using the code block below.

If your Colab runtime disconnects after submitting the batch, you do **not** need to re-submit. Just paste the `batch.id` into the `BATCH_ID` variable below.

When the status below reads ``completed``, you are ready to head on to the next part.

In [None]:
# Batch prediction can take a while. Run this cell to see status at any time.
# If your runtime restarted, paste your saved batch id here.
BATCH_ID = ""  # e.g. "batch_abc123" (leave empty to use current batch variable)

# An empty BATCH_ID is falsy, so this falls back to the id of the in-memory `batch`.
batch_id = BATCH_ID or batch.id
# Re-fetch the batch so the status and request counts printed below are current.
batch = client.batches.retrieve(batch_id)  # FOR STUDENT IMPL
counts = batch.request_counts
print(
    f"Status: {batch.status} | total={counts.total} "
    f"completed={counts.completed} failed={counts.failed}"
)


In [None]:
# Stop here unless the batch has finished; re-run the status cell above to refresh `batch`.
assert batch.status == "completed", f"Batch not completed: {batch.status}"  # FOR STUDENT IMPL
output_file_id = batch.output_file_id  # FOR STUDENT IMPL

# Download the output file and split it into lines; each line is parsed as JSON below.
output_lines = client.files.content(output_file_id).read().decode("utf-8").splitlines()  # FOR STUDENT IMPL
print("Output lines:", len(output_lines))


def extract_label_from_response(resp_obj):
    """Read the predicted label (1, 2 or 3) from one parsed batch output line.

    Args:
        resp_obj: Dict parsed from an output line; the answer text is read from
            ``resp_obj["response"]["body"]["output"][-1]["content"][0]["text"]``.

    Returns:
        The label as an int, taken from the first character of the stripped text.

    Raises:
        ValueError: If the text is empty or does not start with "1", "2" or "3".
            (Previously this surfaced as IndexError, ValueError or AssertionError
            depending on the malformed input.)
    """
    # response.output is a list; last item has content[0].text
    output = resp_obj["response"]['body']["output"]
    text = output[-1]["content"][0]["text"].strip()
    # text[:1] is "" for empty text, so the empty case is rejected here too.
    if text[:1] not in ("1", "2", "3"):
        raise ValueError(f"Unexpected label: {text!r}")
    return int(text[0])


records = []
skipped = []  # (custom_id, reason) for every output line that could not be used
for line in output_lines:
    if not line.strip():
        continue  # ignore blank lines in the output file
    obj = json.loads(line)
    try:
        # custom_id was written as "food-<row position in unlabeled_df>"
        idx = int(obj["custom_id"].split("-")[1])
        food = unlabeled_df.iloc[idx]["Food"]
        label = extract_label_from_response(obj)
    except (AssertionError, IndexError, KeyError, TypeError, ValueError) as err:
        # One failed or malformed response should not discard the rest of the batch.
        skipped.append((obj.get("custom_id"), repr(err)))
        continue
    records.append({"Food": food, "Label": label})

# Explicit columns keep the frame well-formed even if no line could be parsed.
synth_df = pd.DataFrame.from_records(records, columns=["Food", "Label"])
print("Labeled rows:", len(synth_df))
if skipped:
    print(f"Skipped {len(skipped)} unusable responses, e.g. {skipped[:3]}")
display(synth_df["Label"].value_counts())
display(synth_df.head(5))
