##### Copyright 2021 The TensorFlow Authors.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Recommend movies for users with TensorFlow Ranking

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/ranking/tutorials/quickstart"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/ranking/blob/master/docs/tutorials/quickstart.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/ranking/blob/master/docs/tutorials/quickstart.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/ranking/docs/tutorials/quickstart.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

In this tutorial, we build a simple two-tower ranking model using the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TF-Ranking. We can use this model to rank and recommend movies for a given user according to their predicted user ratings.

## Setup

Install and import the TF-Ranking library:

In [2]:
# Install TF-Ranking, then upgrade TensorFlow Datasets; `-q` keeps pip quiet.
!pip install -q tensorflow-ranking
!pip install -q --upgrade tensorflow-datasets

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-keras 2.16.0 requires tensorflow<2.17,>=2.16, but you have tensorflow 2.15.1 which is incompatible.[0m[31m
[0m

In [3]:
from typing import Dict, Tuple

import tensorflow as tf

import tensorflow_datasets as tfds
import tensorflow_ranking as tfr

2024-03-19 11:34:49.704174: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 11:34:49.704225: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 11:34:49.705795: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Read the data

Prepare to train a model by creating a ratings dataset and movies dataset. Use `user_id` as the query input feature, `movie_title` as the document input feature, and `user_rating` as the label to train the ranking model.

In [4]:
%%capture --no-display
# Load the "train" split of the MovieLens 100K ratings, plus the catalogue
# of all available movies.
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

# Keep only the three features this tutorial uses: the document feature,
# the query feature, and the label.
ratings = ratings.map(
    lambda example: {
        name: example[name]
        for name in ("movie_title", "user_id", "user_rating")
    })

2024-03-19 11:34:53.385017: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Build vocabularies to convert all user ids and all movie titles into integer indices for embedding layers:

In [5]:
# Reduce each dataset to the single string feature its vocabulary is built
# from. NOTE: `movies` is rebound from feature dicts to bare title strings
# here, so re-running this cell without reloading the data is expected to
# fail on the `x["movie_title"]` lookup.
movies = movies.map(lambda x: x["movie_title"])
users = ratings.map(lambda x: x["user_id"])

# Use the stable `tf.keras.layers.StringLookup` rather than the deprecated
# `tf.keras.layers.experimental.preprocessing` alias.
# `mask_token=None` means no vocabulary index is reserved for a mask value.
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(users.batch(1000))

movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies.batch(1000))

Group by `user_id` to form lists for ranking models:


In [6]:
def key_func(x):
  """Returns the integer vocabulary index of the example's `user_id`."""
  return user_ids_vocabulary(x["user_id"])


def reduce_func(key, dataset):
  """Batches the ratings collected in one window into lists of up to 100."""
  return dataset.batch(100)


# Collect ratings that share a key (i.e. the same user) into windows of at
# most 100 elements; each window becomes one ranking list.
ds_train = ratings.group_by_window(
    key_func=key_func, reduce_func=reduce_func, window_size=100)

In [7]:
# Peek at the first grouped list: report each feature's shape together with
# its first five values.
for example in ds_train.take(1):
  for name, tensor in example.items():
    print(f"Shape of {name}: {tensor.shape}")
    preview = tensor[:5].numpy()
    print(f"Example values of {name}: {preview}")
    print()

Shape of movie_title: (100,)
Example values of movie_title: [b'Man Who Would Be King, The (1975)' b'Silence of the Lambs, The (1991)'
 b'Next Karate Kid, The (1994)' b'2001: A Space Odyssey (1968)'
 b'Usual Suspects, The (1995)']

Shape of user_id: (100,)
Example values of user_id: [b'405' b'405' b'405' b'405' b'405']

Shape of user_rating: (100,)
Example values of user_rating: [1. 4. 1. 5. 5.]



Generate batched features and labels:

In [8]:
def _features_and_labels(
    x: Dict[str, tf.Tensor]) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
  """Splits one grouped example into model features and ranking labels.

  Args:
    x: Feature dict that contains a "user_rating" entry.

  Returns:
    A `(features, labels)` tuple, where `labels` is the "user_rating" value
    and `features` holds every other entry of `x`.

  Raises:
    KeyError: If `x` has no "user_rating" entry.
  """
  # Work on a shallow copy so the caller's dict is not mutated by the pop.
  features = dict(x)
  labels = features.pop("user_rating")
  return features, labels


# Turn every grouped example into a (features, labels) pair.
ds_train = ds_train.map(_features_and_labels)

# Batch 32 lists together as ragged tensors, since lists may differ in
# length. `Dataset.ragged_batch` replaces the deprecated
# `tf.data.experimental.dense_to_ragged_batch` transformation.
ds_train = ds_train.ragged_batch(batch_size=32)

Instructions for updating:
Use `tf.data.Dataset.ragged_batch` instead.


The `user_id` and `movie_title` tensors generated in `ds_train` are of shape `[32, None]`, where the second dimension is 100 in most cases, except for lists that group fewer than 100 items. A model that works on ragged tensors is therefore used.

In [9]:
# Peek at the first batch: report the shape and a 3x3 corner of every
# feature, followed by the same summary for the labels.
for features, labels in ds_train.take(1):
  for name, tensor in features.items():
    print(f"Shape of {name}: {tensor.shape}")
    print(f"Example values of {name}: {tensor[:3, :3].numpy()}")
    print()
  print(f"Shape of label: {labels.shape}")
  print(f"Example values of label: {labels[:3, :3].numpy()}")

Shape of movie_title: (32, None)
Example values of movie_title: [[b'Man Who Would Be King, The (1975)'
  b'Silence of the Lambs, The (1991)' b'Next Karate Kid, The (1994)']
 [b'Flower of My Secret, The (Flor de mi secreto, La) (1995)'
  b'Little Princess, The (1939)' b'Time to Kill, A (1996)']
 [b'Kundun (1997)' b'Scream (1996)' b'Power 98 (1995)']]

Shape of user_id: (32, None)
Example values of user_id: [[b'405' b'405' b'405']
 [b'655' b'655' b'655']
 [b'13' b'13' b'13']]

Shape of label: (32, None)
Example values of label: [[1. 4. 1.]
 [3. 3. 3.]
 [5. 1. 1.]]


## Define a model

Define a ranking model by inheriting from `tf.keras.Model` and implementing the `call` method:

In [10]:
class MovieLensRankingModel(tf.keras.Model):
  """Two-tower ranking model that scores (user, movie) pairs.

  User ids and movie titles are each mapped to an integer index by a
  vocabulary layer and then to an embedding vector. The ranking score of a
  pair is the dot product of its user and movie embeddings.
  """

  def __init__(self, user_vocab, movie_vocab, embedding_dim: int = 64):
    """Creates the embedding tables for users and movies.

    Args:
      user_vocab: Lookup layer mapping user ids to integer indices; must
        provide `vocabulary_size()`.
      movie_vocab: Lookup layer mapping movie titles to integer indices; must
        provide `vocabulary_size()`.
      embedding_dim: Width of both embedding tables. Both towers share it
        because their outputs are multiplied elementwise in `call`.
    """
    super().__init__()

    # Set up user and movie vocabulary and embedding.
    self.user_vocab = user_vocab
    self.movie_vocab = movie_vocab
    self.user_embed = tf.keras.layers.Embedding(
        user_vocab.vocabulary_size(), embedding_dim)
    self.movie_embed = tf.keras.layers.Embedding(
        movie_vocab.vocabulary_size(), embedding_dim)

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    """Computes one ranking score per (user, movie) pair.

    Args:
      features: Dict with "user_id" and "movie_title" entries. In this
        notebook both are batched as `[batch_size, list_size]`.

    Returns:
      The dot product of the user and movie embeddings, obtained by summing
      their elementwise product over axis 2 (the embedding axis).
    """
    user_embeddings = self.user_embed(self.user_vocab(features["user_id"]))
    movie_embeddings = self.movie_embed(
        self.movie_vocab(features["movie_title"]))

    return tf.reduce_sum(user_embeddings * movie_embeddings, axis=2)

Create the model, and then compile it with ranking `tfr.keras.losses` and `tfr.keras.metrics`, which are the core of the TF-Ranking package. 

This example uses a ranking-specific **softmax loss**, which is a listwise loss that promotes all relevant items in the ranking list, giving them a better chance of being ranked above the irrelevant ones. In contrast to the softmax loss in the multi-class classification problem, where only one class is positive and the rest are negative, the TF-Ranking library supports multiple relevant documents in a query list and non-binary relevance labels.

For ranking metrics, this example specifically uses **Normalized Discounted Cumulative Gain (NDCG)** and **Mean Reciprocal Rank (MRR)**, which calculate the user utility of a ranked query list with position discounts. For more details about ranking metrics, review the [offline metrics](https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Offline_metrics) section of the evaluation measures article.

In [11]:
# Build the ranking model, then attach a ragged-aware listwise softmax loss
# and ragged-aware NDCG / MRR metrics before compiling.
model = MovieLensRankingModel(user_ids_vocabulary, movie_titles_vocabulary)
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.5)
loss = tfr.keras.losses.get(
    loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True)
eval_metrics = [
    tfr.keras.metrics.get(key=metric_key, name=f"metric/{metric_key}",
                          ragged=True)
    for metric_key in ("ndcg", "mrr")
]
model.compile(optimizer=optimizer, loss=loss, metrics=eval_metrics)

## Train and evaluate the model

Train the model with `model.fit`.

In [12]:
# Train for 3 epochs over the ragged-batched `ds_train` dataset.
model.fit(ds_train, epochs=3)

Epoch 1/3


      1/Unknown - 5s 5s/step - loss: 1477.8344 - metric/ndcg: 0.7807 - metric/mrr: 1.0000

      2/Unknown - 5s 273ms/step - loss: 1527.9211 - metric/ndcg: 0.7945 - metric/mrr: 1.0000

      3/Unknown - 5s 222ms/step - loss: 1564.8604 - metric/ndcg: 0.8016 - metric/mrr: 1.0000

      4/Unknown - 5s 200ms/step - loss: 1584.4081 - metric/ndcg: 0.8058 - metric/mrr: 1.0000

      5/Unknown - 5s 184ms/step - loss: 1575.7542 - metric/ndcg: 0.8057 - metric/mrr: 1.0000

      6/Unknown - 6s 176ms/step - loss: 1579.3936 - metric/ndcg: 0.8049 - metric/mrr: 1.0000

      7/Unknown - 6s 167ms/step - loss: 1584.7850 - metric/ndcg: 0.8059 - metric/mrr: 1.0000

      8/Unknown - 6s 161ms/step - loss: 1593.7064 - metric/ndcg: 0.8073 - metric/mrr: 1.0000

      9/Unknown - 6s 152ms/step - loss: 1590.5203 - metric/ndcg: 0.8075 - metric/mrr: 1.0000

     10/Unknown - 6s 148ms/step - loss: 1597.3547 - metric/ndcg: 0.8104 - metric/mrr: 1.0000

     11/Unknown - 6s 142ms/step - loss: 1599.4286 - metric/ndcg: 0.8108 - metric/mrr: 1.0000

     12/Unknown - 6s 140ms/step - loss: 1604.5621 - metric/ndcg: 0.8123 - metric/mrr: 1.0000

     13/Unknown - 6s 141ms/step - loss: 1605.0614 - metric/ndcg: 0.8127 - metric/mrr: 1.0000

     14/Unknown - 6s 140ms/step - loss: 1605.7787 - metric/ndcg: 0.8136 - metric/mrr: 1.0000

     15/Unknown - 7s 143ms/step - loss: 1606.2090 - metric/ndcg: 0.8142 - metric/mrr: 1.0000

     16/Unknown - 7s 140ms/step - loss: 1606.6212 - metric/ndcg: 0.8144 - metric/mrr: 1.0000

     17/Unknown - 7s 137ms/step - loss: 1603.9541 - metric/ndcg: 0.8141 - metric/mrr: 1.0000

     18/Unknown - 7s 134ms/step - loss: 1601.2360 - metric/ndcg: 0.8149 - metric/mrr: 1.0000

     19/Unknown - 7s 131ms/step - loss: 1579.0735 - metric/ndcg: 0.8142 - metric/mrr: 1.0000

     30/Unknown - 7s 83ms/step - loss: 1231.8003 - metric/ndcg: 0.8279 - metric/mrr: 1.0000 

     39/Unknown - 7s 65ms/step - loss: 1152.8252 - metric/ndcg: 0.8254 - metric/mrr: 1.0000



Epoch 2/3


 1/48 [..............................] - ETA: 52s - loss: 1476.4519 - metric/ndcg: 0.9138 - metric/mrr: 1.0000

 2/48 [>.............................] - ETA: 8s - loss: 1526.5396 - metric/ndcg: 0.9113 - metric/mrr: 1.0000 

 3/48 [>.............................] - ETA: 7s - loss: 1563.4860 - metric/ndcg: 0.9130 - metric/mrr: 1.0000

 4/48 [=>............................] - ETA: 6s - loss: 1583.0378 - metric/ndcg: 0.9161 - metric/mrr: 1.0000

 5/48 [==>...........................] - ETA: 6s - loss: 1574.3459 - metric/ndcg: 0.9128 - metric/mrr: 1.0000

 6/48 [==>...........................] - ETA: 6s - loss: 1577.8793 - metric/ndcg: 0.9145 - metric/mrr: 1.0000

 7/48 [===>..........................] - ETA: 5s - loss: 1583.1188 - metric/ndcg: 0.9151 - metric/mrr: 1.0000

 8/48 [====>.........................] - ETA: 5s - loss: 1591.9490 - metric/ndcg: 0.9154 - metric/mrr: 1.0000

 9/48 [====>.........................] - ETA: 5s - loss: 1588.6942 - metric/ndcg: 0.9136 - metric/mrr: 1.0000

10/48 [=====>........................] - ETA: 5s - loss: 1595.4871 - metric/ndcg: 0.9148 - metric/mrr: 1.0000

11/48 [=====>........................] - ETA: 5s - loss: 1597.5161 - metric/ndcg: 0.9142 - metric/mrr: 1.0000























Epoch 3/3


 1/48 [..............................] - ETA: 51s - loss: 1470.2378 - metric/ndcg: 0.9325 - metric/mrr: 1.0000

 2/48 [>.............................] - ETA: 10s - loss: 1521.0977 - metric/ndcg: 0.9251 - metric/mrr: 1.0000

 3/48 [>.............................] - ETA: 8s - loss: 1558.6129 - metric/ndcg: 0.9254 - metric/mrr: 1.0000 

 4/48 [=>............................] - ETA: 7s - loss: 1578.3770 - metric/ndcg: 0.9288 - metric/mrr: 1.0000

 5/48 [==>...........................] - ETA: 7s - loss: 1569.9446 - metric/ndcg: 0.9263 - metric/mrr: 1.0000

 6/48 [==>...........................] - ETA: 6s - loss: 1573.5231 - metric/ndcg: 0.9279 - metric/mrr: 1.0000

 7/48 [===>..........................] - ETA: 6s - loss: 1578.7963 - metric/ndcg: 0.9286 - metric/mrr: 1.0000

 8/48 [====>.........................] - ETA: 6s - loss: 1587.7528 - metric/ndcg: 0.9300 - metric/mrr: 1.0000

 9/48 [====>.........................] - ETA: 5s - loss: 1584.5347 - metric/ndcg: 0.9284 - metric/mrr: 1.0000

10/48 [=====>........................] - ETA: 5s - loss: 1591.4133 - metric/ndcg: 0.9295 - metric/mrr: 1.0000

11/48 [=====>........................] - ETA: 5s - loss: 1593.4933 - metric/ndcg: 0.9291 - metric/mrr: 1.0000























<keras.src.callbacks.History at 0x7f666424d700>

Generate predictions and evaluate.

In [13]:
# Grab the first batch of up to 2000 movie titles as the candidate list.
for movie_titles in movies.batch(2000):
  break

# Shape the candidates as one query list: add a leading batch axis to the
# titles and pair every title with user "42".
num_candidates = movie_titles.shape[0]
candidate_titles = tf.expand_dims(movie_titles, axis=0)
inputs = {
    "user_id":
        tf.expand_dims(tf.repeat("42", repeats=num_candidates), axis=0),
    "movie_title":
        candidate_titles,
}

# Score every candidate for user 42, then order the titles by score.
scores = model(inputs)
titles = tfr.utils.sort_by_scores(scores, [candidate_titles])[0]
print(f"Top 5 recommendations for user 42: {titles[0, :5]}")

Top 5 recommendations for user 42: [b'Star Wars (1977)' b'Liar Liar (1997)' b'Toy Story (1995)'
 b'Raiders of the Lost Ark (1981)' b'Sound of Music, The (1965)']
