<a href="https://colab.research.google.com/github/cmu-llms-class/11-766-hw1-dev/blob/main/src/task1/llm_task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install evaluate bert_score unbabel-comet

In [None]:
from datasets import load_dataset, DownloadMode
import random
import sacrebleu
import evaluate
from comet import download_model, load_from_checkpoint
import pandas as pd
import matplotlib.pyplot as plt
import torch
from transformers import pipeline

def _suppress_logs():
    import logging
    import os
    loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
    for logger in loggers:
        logger.setLevel(logging.ERROR)

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    logging.getLogger("transformers").setLevel(logging.ERROR)
    logging.getLogger("accelerate").setLevel(logging.ERROR)
    logging.getLogger("tokenizers").setLevel(logging.ERROR)

# Helper Functions

## Dataset Processing

These functions handle:
- Downloading the OPUS Books EN-FR translation dataset
- Converting examples to chat template format (required by instruction-tuned models)
- Random sampling for testing

Note: we add an `"Input: "` prefix to each sentence so that it matches the chat template format expected by the model.

In [None]:
# Do Not change this
# Dataset name and language-pair configuration handed to `load_dataset`
# when the translation corpus is downloaded.
DATASET_NAME = "opus_books"
LANGUAGE_PAIR = "en-fr"

def download_and_process_translation_dataset():
    """
    Fetch the translation corpus and add the chat-template fields to each row.

    A fresh download is requested on every call
    (``DownloadMode.FORCE_REDOWNLOAD``), only the "train" split is kept, and
    every example is passed through ``convert_english_to_chat_template_format``.

    :return: HuggingFace dataset (train split) with the mapped fields added
    """
    all_splits = load_dataset(
        DATASET_NAME,
        LANGUAGE_PAIR,
        download_mode=DownloadMode.FORCE_REDOWNLOAD,
    )
    return all_splits["train"].map(convert_english_to_chat_template_format)

def convert_english_to_chat_template_format(examples) -> dict:
    """
    Map function that adds chat-template fields to one dataset example.

    Both language sides get the same "Input: " prefix so that either one can
    be used directly as the content of a user message. Given
    {
        'id': '0',
        'translation': {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}
    }
    the returned fields are
    {
        'role': 'user',
        'en_content': 'Input: The Wanderer',
        'fr_content': 'Input: Le grand Meaulnes'
    }
    (``Dataset.map`` merges these into the original example.)

    :param examples: One dataset example with a 'translation' dict holding
        'en' and 'fr' strings
    :return: Dict with 'role', 'en_content' and 'fr_content'
    """
    sentence_pair = examples["translation"]
    # Single quotes inside the f-strings keep this valid on Python < 3.12,
    # where reusing the enclosing quote character is a syntax error.
    return {
        "role": "user",
        "en_content": f"Input: {sentence_pair['en']}",
        "fr_content": f"Input: {sentence_pair['fr']}",
    }

def sample_from_dataset(dataset, num_samples=100):
    """
    Pick a reproducible random subset of the dataset.

    The indices are drawn from a private generator seeded with 42, so the
    same subset is returned on every call and the process-wide ``random``
    state is left untouched.

    :param dataset: HuggingFace dataset (anything with ``len()`` and ``select``)
    :param num_samples: Number of random samples
    :return: Subset of Dataset with samples
    :raises ValueError: if num_samples is negative or larger than the dataset
    """
    # A dedicated Random(42) yields the same indices as seeding the module
    # generator with 42, without reseeding the RNG other code may rely on.
    rng = random.Random(42)
    chosen_indices = rng.sample(range(len(dataset)), k=num_samples)
    return dataset.select(chosen_indices)

def create_english_user_prompt(example) -> dict:
    """
    Build the chat message that carries the English side of an example.

    :param example: One processed dataset example (dict-like) providing the
        'role' and 'en_content' fields
    :return: Dict of the form {'role': <role>, 'content': <en_content>}
    """
    return dict(role=example["role"], content=example["en_content"])

def create_french_user_prompt(example) -> dict:
    """
    Build the chat message that carries the French side of an example.

    :param example: One processed dataset example (dict-like) providing the
        'role' and 'fr_content' fields
    :return: Dict of the form {'role': <role>, 'content': <fr_content>}
    """
    return dict(role=example["role"], content=example["fr_content"])

## Inference helper functions

In [None]:
def batch_translate_using_hf(user_prompts: list[dict[str, str]],
                             system_prompt: dict[str, str],
                             model_name: str,
                             max_new_tokens: int,
                             do_sample: bool = False):
    """
    Generate one completion per user prompt with a HuggingFace pipeline.

    A text-generation pipeline is created on every call. Each user prompt is
    paired with the system prompt, rendered to a string through the
    tokenizer's chat template, and the rendered prompts are generated in
    batches of 32.

    :param user_prompts: List of user messages, one per text to translate
    :param system_prompt: System message placed before every user message
    :param model_name: Name of HuggingFace model
    :param max_new_tokens: Max tokens to decode
    :param do_sample: Forwarded to the pipeline's ``do_sample`` option
    :return: Pipeline outputs, one entry per prompt (only the newly generated
        text, since ``return_full_text`` is False)
    """
    generator = pipeline(task="text-generation",
                         model=model_name,
                         dtype=torch.bfloat16,
                         device_map="auto")
    chat_tokenizer = generator.tokenizer

    # One rendered chat string per user prompt, each preceded by the system
    # message and ending with the generation prompt.
    rendered_prompts = [
        chat_tokenizer.apply_chat_template(
            [system_prompt, user_prompt],
            tokenize=False,
            add_generation_prompt=True,
        )
        for user_prompt in user_prompts
    ]

    generation_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        return_full_text=False,
        batch_size=32,
    )
    return generator(rendered_prompts, **generation_options)


def batch_post_process_output(batch_outputs) -> list[str]:
    """
    Pull the translated text out of each pipeline result.

    For every prompt the first candidate's "generated_text" is used: the text
    after the last "Output:" marker is kept, anything from the first
    "<|im_end|>" onwards is dropped, and surrounding whitespace is stripped.

    :param batch_outputs: Batch outputs from batch_translate_using_hf
    :return: List of translated texts
    """
    def _clean(candidates) -> str:
        generated = candidates[0]["generated_text"]
        after_marker = generated.split("Output:")[-1]
        before_end_token, _, _ = after_marker.partition("<|im_end|>")
        return before_end_token.strip()

    return [_clean(candidates) for candidates in batch_outputs]


---

## Evaluation Metrics

This section contains implementations for the translation quality metrics you'll examine, as well as an evaluation function which calls them.

In [None]:
# You will implement this function for question 1.2.
def ai_as_judge_scorer() -> float:
    """
    Placeholder for the AI-as-judge metric.

    :return: -1.0 until the scorer is implemented
    """
    # TODO: Add your implementation
    placeholder_score = -1.0
    return placeholder_score

In [None]:
# Load BERTScore once
# The loaded metric is kept at module level and reused by every
# `_bertscore.compute(...)` call instead of being loaded per call.
_bertscore = evaluate.load("bertscore")


def compute_bleu_score(
    predicted_translation: str,
    ground_truth: str,
) -> float:
    """
    Score one translation against one reference with sentence-level BLEU.

    Uses ``sacrebleu.sentence_bleu`` with its default settings.

    :param predicted_translation: Model translation
    :param ground_truth: Reference translation
    :return: BLEU score
    """
    # SacreBLEU takes a list of references; only one is available here.
    reference_list = [ground_truth]
    return sacrebleu.sentence_bleu(predicted_translation, reference_list).score


def compute_bert_score(
    predicted_translation: str,
    ground_truth: str,
    lang: str,
) -> float:
    """
    Score one translation against one reference with BERTScore.

    :param predicted_translation: Model translation
    :param ground_truth: Reference translation
    :param lang: Language code (e.g., 'en', 'fr')
    :return: BERTScore F1 for the single prediction/reference pair
    """
    # The metric works on lists, so wrap the pair and unwrap the one F1 value.
    f1_values = _bertscore.compute(
        predictions=[predicted_translation],
        references=[ground_truth],
        lang=lang,
    )["f1"]
    return f1_values[0]


class CometScorer:
    """Wrapper around the COMET checkpoint used to score translations."""

    def __init__(self):
        # Do not change this!
        self.model_path = download_model("Unbabel/wmt22-comet-da")
        self.model = load_from_checkpoint(self.model_path)

    def compute_comet_score(
        self,
        reference: str,
        predicted_translation: str,
        ground_truth: str,
    ) -> float:
        """
        Compute the COMET score of a single translation.

        :param reference: Source sentence
        :param predicted_translation: Model output
        :param ground_truth: Reference translation
        :return: COMET score
        """
        sample = dict(src=reference, mt=predicted_translation, ref=ground_truth)
        prediction = self.model.predict(
            [sample],
            batch_size=1,
            progress_bar=False,
        )
        return prediction["scores"][0]

    def compute_comet_scores_batch(
        self,
        references: list[str],
        predicted_translations: list[str],
        ground_truths: list[str],
    ) -> list[float]:
        """
        Compute COMET scores for several translations in one predict call.

        The three lists are combined position by position.

        :param references: List of source sentences
        :param predicted_translations: List of model outputs
        :param ground_truths: List of reference translations
        :return: List of COMET scores
        """
        aligned = zip(references, predicted_translations, ground_truths)
        samples = [
            dict(src=source, mt=hypothesis, ref=target)
            for source, hypothesis, target in aligned
        ]
        prediction = self.model.predict(
            samples,
            batch_size=32,
            progress_bar=False,
        )
        return prediction["scores"]

def evaluate_translation_batch(
    predictions: list[str],
    references: list[str],
    sources: list[str],
    comet_scorer: CometScorer,
    lang: str,
) -> list[dict]:
    """
    Score a batch of translations with every metric.

    :param predictions: List of predicted texts
    :param references: List of ground truths
    :param sources: List of source texts
    :param comet_scorer: CometScorer instance
    :param lang: Language code
    :return: One dict per example with the keys "bleu", "bert", "comet" and
        "ai_judge"
    """
    # BLEU is computed sentence by sentence (not batched, but fast).
    bleu_scores = [
        compute_bleu_score(hypothesis, target)
        for hypothesis, target in zip(predictions, references)
    ]

    # BERTScore handles the whole batch in one call; keep the F1 values.
    bert_scores = _bertscore.compute(
        predictions=predictions,
        references=references,
        lang=lang,
    )["f1"]

    # COMET also scores the batch in one call and additionally needs the sources.
    comet_scores = comet_scorer.compute_comet_scores_batch(
        references=sources,
        predicted_translations=predictions,
        ground_truths=references,
    )

    # TODO Complete AI as judge scorer
    return [
        {
            "bleu": bleu,
            "bert": bert,
            "comet": comet,
            "ai_judge": ai_as_judge_scorer(),
        }
        for bleu, bert, comet in zip(bleu_scores, bert_scores, comet_scores)
    ]

def batch_translate(
    user_prompts: list[dict[str, str]],
    system_prompt: dict[str, str],
) -> list[str]:
    """
    Translate a batch of user prompts with the notebook's configured model.

    Runs greedy generation (``do_sample=False``) using the module-level
    MODEL_NAME and MAX_NEW_TOKENS, then cleans up the raw generations.

    :param user_prompts: List of user messages, one per text to translate
    :param system_prompt: System message describing the translation task
    :return: List of translated texts
    """
    raw_generations = batch_translate_using_hf(
        user_prompts=user_prompts,
        system_prompt=system_prompt,
        model_name=MODEL_NAME,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,
    )
    translations = batch_post_process_output(raw_generations)
    return translations




---

## Part 1 and 2: Direct Translation (FR→EN)

This section evaluates **direct translation** from French to English.

### Configuration

Define the system prompt and model parameters:
- **System prompt**: Instructions telling the model what task to perform
- **Model**: SmolLM2-1.7B-Instruct (a small instruction-tuned language model)
- **MAX_NEW_TOKENS**: Maximum length of generated translation
- **NUM_SAMPLES**: Number of examples to evaluate

**TODO for students**:

1. Experiment with different system prompts to improve translation quality!
2. Implement `ai_as_judge_scorer` above.

In [None]:
# System message for direct translation; its content asks for French -> English.
FORWARD_TRANSLATION_SYSTEM_PROMPT = {
    "role":"system",
    # TODO Change this prompt to better one
    "content": "Translate this text from French to English."
}

# HuggingFace model used for generation.
MODEL_NAME = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
# Upper bound on the number of tokens generated per translation.
MAX_NEW_TOKENS = 512
# Number of dataset examples that are sampled and evaluated.
NUM_SAMPLES = 100

In [None]:
# Direct translation run: translate the French side of the sampled examples
# into English, score every translation, and print per-example scores and
# the averages.

# Suppress verbose logs (especially COMET)
_suppress_logs()

dataset = download_and_process_translation_dataset()
examples = sample_from_dataset(dataset, num_samples=NUM_SAMPLES)

comet_scorer = CometScorer()

# Running averages; only these three metrics are accumulated below
# (the "ai_judge" score is printed per example but not averaged).
metrics_fr_en = {"bleu": 0.0, "bert": 0.0, "comet": 0.0}

# Prepare batch data
user_prompts = [create_french_user_prompt(example) for example in examples]

# Batch translation
print("Starting batch translation...")
# Use specific variable name for Part 3 compatibility
direct_translations_part1 = batch_translate(
    user_prompts, FORWARD_TRANSLATION_SYSTEM_PROMPT)
print(f"Completed batch translation of {len(direct_translations_part1)} examples\n")

# Prepare data for batch evaluation
# The raw (unprefixed) sentences from the dataset are used for scoring.
en_texts = [example["translation"]["en"] for example in examples]
fr_texts = [example["translation"]["fr"] for example in examples]

# Batch evaluation
# English is the target language here: the English sentences are the references
# and the French sentences are the sources.
print("Starting batch evaluation...")
all_scores = evaluate_translation_batch(
    predictions=direct_translations_part1,
    references=en_texts,
    sources=fr_texts,
    comet_scorer=comet_scorer,
    lang="en",
)
print(f"Completed batch evaluation\n")

# Print individual results and accumulate metrics
for i, (example, en_pred, scores_fr_en) in enumerate(zip(examples, direct_translations_part1, all_scores)):
    en_text = example["translation"]["en"]
    fr_text = example["translation"]["fr"]

    print(f"Example {i+1}/{NUM_SAMPLES}:")
    print(f"FR Reference: {fr_text}")
    print(f"EN Ground Truth: {en_text}")
    print(f"EN Prediction: {en_pred}")
    print(f"Scores FR→EN: {scores_fr_en}\n")

    # Adding score / NUM_SAMPLES per example builds the mean incrementally;
    # this assumes exactly NUM_SAMPLES examples were scored.
    for key in metrics_fr_en:
        metrics_fr_en[key] += scores_fr_en[key] / NUM_SAMPLES

print("====== Final Averages ======")
print(f"FR → EN | BLEU: {metrics_fr_en['bleu']:.4f}, "
      f"BERTScore: {metrics_fr_en['bert']:.4f}, "
      f"COMET: {metrics_fr_en['comet']:.4f}")

---

## Part 2: Backtranslation (EN→FR→EN)

Backtranslation tests **information preservation** through round-trip translation:
1. Translate English → French (forward)
2. Translate French → English (back)
3. Compare final English with original English

### Why Backtranslation?
- Measures how much meaning survives the translation process
- Lower scores indicate information loss or distortion
- Useful for evaluating translation robustness

### Configure System Prompts

Implement a prompt for back translation.

In [None]:
# TODO For students
# Improve the prompt
# System message for the back-translation step; its content asks for
# French -> English.
BACK_TRANSLATION_SYSTEM_PROMPT = {
    "role":"system",
    "content":"Translate the given text from French to English."
}

def _convert_backtranslation_to_chat_format(fr_pred):
    return {"role": "user", "content": f"{fr_pred}"}


### Run Backtranslation Pipeline

Execute the complete backtranslation workflow:
1. **Forward translation**: English → French
2. **Back translation**: French → English  
3. **Evaluation**: Compare back-translated English with original English

In [None]:
# Backtranslation run: translate the English side of the sampled examples
# into French, translate that French back into English, then score the
# round-tripped English against the original English.

# Suppress verbose logs (especially COMET)
_suppress_logs()

# TODO For students: improve this prompt too.
# The forward leg goes English -> French, so it needs its own system prompt.
# FORWARD_TRANSLATION_SYSTEM_PROMPT asks for French -> English; using it here
# would make the "forward" step translate in the wrong direction and the
# round trip would never pass through French.
EN_TO_FR_TRANSLATION_SYSTEM_PROMPT = {
    "role": "system",
    "content": "Translate the given text from English to French.",
}

dataset = download_and_process_translation_dataset()
examples = sample_from_dataset(dataset, num_samples=NUM_SAMPLES)

comet_scorer = CometScorer()

# Running averages; only these three metrics are accumulated below
# (the "ai_judge" score is printed per example but not averaged).
metrics_fr_en = {"bleu": 0.0, "bert": 0.0, "comet": 0.0}

# Prepare batch data for forward translation (EN -> FR)
user_prompts = [create_english_user_prompt(example) for example in examples]

# Forward translation: EN -> FR
print("Starting forward translation (EN -> FR)...")
fr_predictions = batch_translate(user_prompts, EN_TO_FR_TRANSLATION_SYSTEM_PROMPT)
print(f"Completed forward translation of {len(fr_predictions)} examples\n")

# Prepare batch data for back translation (FR -> EN)
back_user_prompts = [
    _convert_backtranslation_to_chat_format(fr_pred) for fr_pred in fr_predictions]

# Back translation: FR -> EN
print("Starting back translation (FR -> EN)...")
# Use specific variable name for Part 3 compatibility
backtranslations_part2 = batch_translate(back_user_prompts, BACK_TRANSLATION_SYSTEM_PROMPT)
print(f"Completed back translation of {len(backtranslations_part2)} examples\n")

# Prepare data for batch evaluation
# The raw (unprefixed) sentences from the dataset are used for scoring.
en_texts = [example["translation"]["en"] for example in examples]
fr_texts = [example["translation"]["fr"] for example in examples]

# Batch evaluation
# NOTE(review): COMET receives the dataset's French sentences as the source,
# not the model's own French predictions that were actually back-translated.
# Confirm this is the intended setup.
print("Starting batch evaluation...")
all_scores = evaluate_translation_batch(
    predictions=backtranslations_part2,
    references=en_texts,
    sources=fr_texts,
    comet_scorer=comet_scorer,
    lang="en",
)
print("Completed batch evaluation\n")

# Print individual results and accumulate metrics
for i, (example, fr_pred, en_pred, scores_fr_en) in enumerate(
        zip(examples, fr_predictions, backtranslations_part2, all_scores)):
    en_text = example["translation"]["en"]

    print(f"Example {i+1}/{NUM_SAMPLES}:")
    print(f"EN Source: {en_text}")
    print(f"FR Prediction (forward): {fr_pred}")
    print(f"EN Prediction (back): {en_pred}")
    print(f"Backtranslation Scores: {scores_fr_en}\n")

    # Adding score / NUM_SAMPLES per example builds the mean incrementally;
    # this assumes exactly NUM_SAMPLES examples were scored.
    for key in metrics_fr_en:
        metrics_fr_en[key] += scores_fr_en[key] / NUM_SAMPLES

print("====== Final Averages ======")
print(f"Backtranslation (EN → FR → EN) | BLEU: {metrics_fr_en['bleu']:.4f}, "
      f"BERTScore: {metrics_fr_en['bert']:.4f}, "
      f"COMET: {metrics_fr_en['comet']:.4f}")