# Notebook to Generate Attention History Tensors for Visualization

This notebook is written for a Mac (it uses the Apple Silicon MPS backend when available), but it should be easy to adapt to other platforms.

[Here is the visualization tool.](https://storage.googleapis.com/lvs-usa/attention-viz/visualizer.html)

Here are some pregenerated tensors to analyze:

[Small (400KB, 4B Model)](https://storage.googleapis.com/lvs-usa/attention-viz/GQA_tensor.bin) 

[Medium (5 MB, 0.6B Model)](https://storage.googleapis.com/lvs-usa/attention-viz/dragon_attention_tensor.bin) 

[Large (20MB, 4B Model)](https://storage.googleapis.com/lvs-usa/attention-viz/vending_machine_attention_tensor.bin) 


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Run on the Apple Silicon GPU through the MPS backend when it is available,
# otherwise fall back to the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(
    "Using Apple Silicon MPS (GPU)."
    if device.type == "mps"
    else "MPS not available. Using CPU."
)

# Pick model (only tested on dense Qwen models)
model_name = "Qwen/Qwen3-0.6B"
# model_name="Qwen/Qwen3-4B"

In [None]:
# Load the tokenizer; when it defines no padding token, reuse the EOS token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model, switch it to evaluation mode and move it to the selected device.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally -- confirm it is still needed for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    trust_remote_code=True,  # Qwen models require this
).eval().to(device)

print(f"Model '{model_name}' loaded successfully on {device}.")
print(model.config)


In [None]:
#Set prompt and use chat template

def token_str_arr_from_ids(token_ids):
    """Decode every id in *token_ids* on its own and return the list of strings.

    Each id is passed to the module-level ``tokenizer`` as a one-element list,
    so the result has exactly one string per input id, in the same order.
    """
    decoded = []
    for single_id in token_ids:
        decoded.append(tokenizer.decode([single_id]))
    return decoded

def make_chat_template(messages):
    """Render *messages* with the tokenizer's chat template and tokenize the result.

    Prints the number of prompt tokens and the individually decoded tokens.

    Returns:
        ``(formatted_string, tokens)`` where ``formatted_string`` is the rendered
        prompt text and ``tokens`` is the tokenizer output requested as PyTorch
        tensors. If applying the chat template raises, the error is printed and
        ``(None, 0)`` is returned instead.
    """
    template_options = {
        "tokenize": False,
        "add_generation_prompt": True,
        "enable_thinking": False,  # This one is nice to play with
    }
    try:
        formatted_string = tokenizer.apply_chat_template(messages, **template_options)
    except Exception as e:
        print(f"Error applying chat template: {e}")
        return None, 0

    # return_tensors="pt" makes the output ready to be fed directly into the model.
    tokens = tokenizer(formatted_string, return_tensors="pt")

    # Row 0 of the batch holds the token ids of the rendered prompt.
    prompt_ids = tokens["input_ids"][0]

    # Decode each token individually for analysis.
    token_strings = token_str_arr_from_ids(prompt_ids)

    print(f"\nNumber of prompt tokens: {len(prompt_ids)}")
    print(token_strings)

    return formatted_string, tokens

# Single-turn conversation used as the prompt.
prompt1 = [
    dict(
        role="user",
        content="Give one reason learning about new technology is important?",
    ),
]

# Keep both the rendered prompt text and its tokenized form for the cells below.
input_text, starter_tokens = make_chat_template(prompt1)

In [None]:
#Run Inference

import torch
import torch.nn.functional as F
from transformers.generation.logits_process import TopKLogitsWarper, TopPLogitsWarper, LogitsProcessorList
import tqdm

# `import tqdm` by itself does not guarantee that the `tqdm.notebook` submodule
# has been loaded; import it explicitly because the loop below uses it.
import tqdm.notebook

# Prompt token ids, moved to the device the model lives on
input_ids = starter_tokens['input_ids'].to(device)

# All new and existing tokens go here
generated_ids = input_ids.clone()

# --- Generation Parameters ---
max_new_tokens = 500

# Qwen recommended for (enable_thinking=False)
temperature = 0.7
top_p = 0.8
top_k = 20

# For thinking mode (enable_thinking=True)
# temperature = 0.6
# top_p = 0.95
# top_k = 20


# Handle single or list of EOS token IDs
eos_token_id_list = tokenizer.eos_token_id
if not isinstance(eos_token_id_list, list):
    eos_token_id_list = [eos_token_id_list]

past_key_values = None  # KV cache; stays None until the first forward pass

# This is to be able to apply both top k and top p filtering
logits_processors = LogitsProcessorList()

if top_k is not None and top_k > 0:
    logits_processors.append(TopKLogitsWarper(top_k=top_k))

if top_p is not None and top_p < 1.0:
    # min_tokens_to_keep=1 ensures we don't filter out all tokens
    logits_processors.append(TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=1))

torch.set_printoptions(sci_mode=True, threshold=8, precision=4)

model.set_attn_implementation('eager')  # this is required to get attentions output

print(f"Starting token-by-token generation for '{input_text}'...")

# One attention tensor per forward pass. This starts as a list and is stitched
# into a single array (the export tensor) after the loop.
steps = []

with torch.no_grad():
    for i in tqdm.notebook.tqdm(range(max_new_tokens)):
        # On the first iteration, 'past_key_values' is None, so we use the full 'input_ids'.
        # On subsequent iterations, we use the 'past_key_values' and only pass the
        # last generated token as the 'input_ids'.
        if past_key_values is None:
            current_input_ids = input_ids
        else:
            current_input_ids = generated_ids[:, -1:]  # Get the last token

        # Forward pass through the model
        outputs = model(
            input_ids=current_input_ids,
            past_key_values=past_key_values,
            use_cache=True,  # This is needed to get the 'past_key_values' output
            output_attentions=True  # Enable attentions output
        )

        logits = outputs.logits

        past_key_values = outputs.past_key_values  # update for next step

        # Stack the per-layer attention tensors (size-1 dims squeezed away) along a
        # new leading axis, then combine heads by summing over axis 1 in float16.
        # Assumes each attention tensor is (batch=1, heads, query_len, key_len), so
        # the first pass yields (layers, prompt_len, prompt_len) and later passes
        # yield (layers, key_len) -- TODO confirm against the transformers docs.
        per_layer = torch.stack([attn.squeeze() for attn in outputs.attentions], dim=0)
        step = torch.sum(per_layer, dim=1, dtype=torch.float16)  # optional dtype=torch.float32
        # step = torch.stack([attn.squeeze() for attn in outputs.attentions], dim=0).float() #don't combine heads
        steps.append(step)

        # Get the logits for the next token
        next_token_logits = logits[:, -1, :]

        # Apply temperature scaling
        next_token_logits = next_token_logits / temperature

        # Apply LogitsProcessors (Top-K, Top-P). Always use the returned tensor
        # rather than relying on the processors modifying their input in place.
        filtered_logits = logits_processors(input_ids=generated_ids, scores=next_token_logits)

        # Sample the next token
        # Convert filtered logits to probabilities and sample
        probs = F.softmax(filtered_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        # Append the new token to our full sequence
        generated_ids = torch.cat([generated_ids, next_token], dim=-1)

        # Check for EOS token
        if next_token.item() in eos_token_id_list:
            print(f"\nEOS token ({next_token.item()}) generated at step {i+1}. Stopping.")
            break
    else:
        # Runs only when the loop was not left through `break`, so the message is
        # not printed when EOS happens to be sampled on the very last iteration.
        print(f"\nReached max_new_tokens ({max_new_tokens}). Stopping.")

    # Every pass after the first produced a 2-D tensor; give each a trailing axis
    # of size 1 so it can be concatenated as a single column along dim 2.
    steps[1:] = [later_step.unsqueeze(-1) for later_step in steps[1:]]

    # The last pass has the longest axis 1; every other tensor is padded up to it.
    seq_len = steps[-1].shape[1]

    # Pad tensors that are shorter than seq_len (zeros appended at the end of dim 1)
    padded_list = []
    for t in steps:
        padding_needed = seq_len - t.shape[1]
        if padding_needed > 0:
            t = F.pad(t, (0, 0, 0, padding_needed), mode='constant', value=0)
        padded_list.append(t)

    # NOTE(review): under the shape assumption above, the prompt block from the
    # first pass is laid out [layer, query, key] while every later column is
    # [layer, key, 1]. Confirm the visualizer expects the prompt block in that
    # orientation.
    steps = torch.cat(padded_list, dim=2).cpu().numpy()  # Shape: (num_layers, seq_len, seq_len)

# Decode the final generated sequence
pred_manual = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("\n--- Manual Generation Result ---")
print(pred_manual)

In [None]:
#Save tensor to binary file

filename = "learning_tensor.bin"

import numpy as np
import json
import struct

# One label per token of the full sequence (prompt + generated tokens).
# NOTE(review): the last sampled token is appended to generated_ids but never fed
# back through the model, so there appears to be one more label than the tensor's
# sequence length -- confirm the visualizer tolerates the extra label.
labels = token_str_arr_from_ids(generated_ids[0].cpu().tolist())
print(len(labels),labels)

data = steps

header_info = {
    "shape": list(data.shape),
    # Derived from the array so the header cannot disagree with the bytes written
    # below (e.g. when the float32 "don't combine heads" variant is used).
    "dtype": data.dtype.name,
    "labels": labels
}

json_bytes = json.dumps(header_info).encode('utf-8')

# Pad the JSON header with spaces to a multiple of 4 bytes so that the raw data
# following it starts on a 4-byte boundary.
remainder = len(json_bytes) % 4
padding = (4 - remainder) % 4
json_bytes += b' ' * padding

header_len = len(json_bytes)

# File layout: [header length, little-endian uint32 (4B)] + [aligned JSON] + [raw data]
with open(filename, "wb") as f:
    f.write(struct.pack('<I', header_len))
    f.write(json_bytes)
    data.tofile(f)

print(f"Saved file. JSON is {header_len} bytes (including {padding} padding bytes).")