"""A content-based recommender using TF-IDF in Python."""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def genre_recommendations(i, M, items, k=10):
    """Return up to ``k`` items most similar to item ``i``.

    Args:
        i: Label of the query item; must be a column label of ``M``.
        M: Similarity DataFrame whose column ``i`` holds the similarity of
            every item to ``i``. The name of ``M.columns`` must match a
            column of ``items`` (e.g. ``'title'``) so the result can be
            joined to the metadata. Assumes column labels are unique.
        items: DataFrame with item metadata, sharing that column with ``M``.
        k: Maximum number of recommendations to return.

    Returns:
        A DataFrame with at most ``k`` rows of ``items``, ordered from most
        to least similar, with ``i`` itself excluded.

    Raises:
        KeyError: If ``i`` is not a column label of ``M``.
    """
    scores = M.loc[:, i].to_numpy()
    # ``i`` is normally its own best match and is dropped below, so fetch one
    # extra candidate -- but never ask for more entries than exist.
    n_top = max(0, min(k + 1, scores.size))
    if n_top == 0:
        closest = M.columns[:0]
    else:
        # argpartition only guarantees correct placement at the positions it
        # is given, so every tail position that is read must be listed.
        ix = scores.argpartition(range(-1, -(n_top + 1), -1))
        # Walk the tail backwards: highest similarity first.
        closest = M.columns[ix[:-(n_top + 1):-1]]
    closest = closest.drop(i, errors='ignore')
    # The inner merge keeps the left (similarity) order while attaching the
    # metadata; head(k) trims the extra candidate if ``i`` was not dropped.
    return pd.DataFrame(closest).merge(items).head(k)

# Show every column and use a wide console when printing DataFrames.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', None)

# Load only the columns that are used from each CSV file.
ratings = pd.read_csv(
    'ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
movies = pd.read_csv(
    'movies.csv',
    usecols=['movieId', 'title', 'genres'],
)

# Join on the columns the two frames have in common (default inner merge).
movie_rating = pd.merge(movies, ratings)

# Example genre string used to illustrate the tokenisation strategies below.
s = "Adventure Animation Children Comedy Fantasy"

# Word n-grams only pair *adjacent* words, so the features depend on the
# order in which genres are listed and non-adjacent pairs are never produced.
tf_wrong = TfidfVectorizer(analyzer='word', ngram_range=(1, 2)).fit([s])
# tf_wrong.get_feature_names_out() contains, among others:
# ['animation', 'animation children', 'children', 'children comedy', 'comedy']

# combinations() yields genre tuples regardless of adjacency; with
# range(1, 2) only r=1 is used, so g holds one 1-tuple per genre.
g = [c for i in range(1,2) for c in combinations(s.split(), r=i)]

# Custom analyzer: each movie's '|'-separated genre string is expanded into
# every combination of 1 to 4 of its genres, and each combination is a token.
tf = TfidfVectorizer(
    analyzer=lambda genres: (
        c
        for r in range(1, 5)
        for c in combinations(genres.split('|'), r=r)
    )
)
tfidf_matrix = tf.fit_transform(movies['genres'])

# Dense view of the TF-IDF weights (movies x genre combinations), reduced to
# a random 10-row, 2-column sample for inspection.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=tf.get_feature_names_out(),
                  index=movies.title).sample(2, axis=1).sample(10, axis=0)

# Pairwise cosine similarity between the TF-IDF rows of all movies.
cosine_sim = cosine_similarity(tfidf_matrix)

# Label both axes with the movie titles: genre_recommendations() looks up a
# title in the columns and merges on the columns' name ('title').
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'],
                             columns=movies['title'])
print('Shape:', cosine_sim_df.shape)

# NOTE: the two bare expressions below only display their value in an
# interactive session/notebook; in a plain script the result is discarded.
cosine_sim_df.sample(3, axis=1).round(2)

genre_recommendations('Pulp Fiction (1994)',
                      cosine_sim_df, movies[['title', 'genres']])
