用 Python 构建智能推荐系统的实用路线
一、整体流程与数据准备
二、核心方法与 Python 实现要点
三、端到端最小可行示例(以 MovieLens 数据格式为例,使用内置小样本演示 ItemCF + 评估)
# pip install pandas scikit-learn
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
# 1) Load the data (example: MovieLens 100K; columns: userId, movieId, rating, timestamp)
# ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['userId','movieId','rating','timestamp'])
# movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1',
#                      usecols=range(5), names=['movieId','title','release_date','video_release_date','imdb_url'])
# A tiny hand-made sample for the demo (replace with the real loader above).
data = {
    'userId': [1, 1, 1, 2, 2, 3, 3, 4, 4, 5],
    'movieId': [1, 2, 3, 2, 3, 1, 4, 2, 4, 3],
    'rating': [5, 3, 4, 4, 5, 3, 2, 4, 5, 4],
}
ratings = pd.DataFrame.from_dict(data)

# 2) Hold out 20% of the ratings for evaluation (fixed seed for reproducibility).
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# 3) User x item rating matrix built from the training split only;
#    pairs without a rating stay NaN.
user_item = train.pivot_table(values='rating', index='userId', columns='movieId')

# 4) Item-item cosine similarity (ItemCF). Missing ratings are treated as 0 and
#    the matrix is transposed so that each row is one item's rating vector.
item_sim = cosine_similarity(user_item.fillna(0).T)
item_sim_df = pd.DataFrame(
    item_sim,
    index=user_item.columns,
    columns=user_item.columns,
)
# 5) 预测评分(对测试集中已评分的(user,item)做预测,便于RMSE评估)
def predict_ratings(train_ui: pd.DataFrame, item_sim_df: pd.DataFrame,
                    test_df: pd.DataFrame) -> np.ndarray:
    """Predict a rating for every (userId, movieId) pair in ``test_df``.

    The prediction for user ``u`` and item ``i`` is the similarity-weighted
    average of the ratings ``u`` gave in the training matrix::

        sum_j sim(i, j) * r(u, j) / sum_j |sim(i, j)|

    Args:
        train_ui: User x item rating matrix (users on the index, items on
            the columns, NaN where the user has not rated the item).
        item_sim_df: Item x item similarity matrix, labelled with the same
            item ids on both axes.
        test_df: Frame with ``userId`` and ``movieId`` columns. Any other
            column (for example ``rating``) is ignored.

    Returns:
        A float array with one entry per row of ``test_df``, in row order.
        The entry is NaN when the user or the item is missing from
        ``train_ui``, when the user has no training ratings, or when the
        absolute similarities sum to zero.
    """
    preds = []
    # Walk the two id columns directly instead of ``iterrows``: no Series is
    # built per row and the id dtypes are not upcast by mixed-type rows.
    for u, i in zip(test_df['userId'], test_df['movieId']):
        # Cold start: nothing is known about this user or this item.
        if i not in train_ui.columns or u not in train_ui.index:
            preds.append(np.nan)
            continue
        rated_items = train_ui.loc[u].dropna()
        if rated_items.empty:
            preds.append(np.nan)
            continue
        # Similarity of the target item to each item the user has rated.
        sims = item_sim_df.loc[rated_items.index, i]
        numer = (sims * rated_items).sum()
        denom = sims.abs().sum()
        # A zero denominator means no rated item is similar to the target.
        preds.append(numer / denom if denom != 0 else np.nan)
    return np.array(preds, dtype=float)
preds = predict_ratings(user_item, item_sim_df, test)
# Pairs that could not be predicted come back as NaN; score only the rest.
valid = ~np.isnan(preds)
if valid.any():
    rmse = np.sqrt(mean_squared_error(test['rating'].iloc[valid], preds[valid]))
    print(f'ItemCF RMSE: {rmse:.4f}')
else:
    # mean_squared_error raises ValueError on empty input, which happens when
    # no held-out pair is predictable (easy to hit on a tiny sample).
    rmse = np.nan
    print('ItemCF RMSE: n/a (no test pair could be predicted)')
# 6) Top-N 推荐函数(示例:给用户1推荐5部)
def recommend_topn(user_id, train_ui: pd.DataFrame, item_sim_df: pd.DataFrame,
                   k: int = 5) -> list:
    """Recommend up to ``k`` items the user has not rated yet.

    Each candidate item is scored by the sum of its similarities to every
    item the user rated in ``train_ui``; the highest scores come first.

    Args:
        user_id: Label of the user in ``train_ui.index``.
        train_ui: User x item rating matrix (NaN where unrated).
        item_sim_df: Item x item similarity matrix, labelled with the same
            item ids as ``train_ui.columns`` on both axes.
        k: Maximum number of items to return.

    Returns:
        A list of item ids ordered by descending score. The list is empty
        when ``user_id`` is not in ``train_ui``.
    """
    if user_id not in train_ui.index:
        return []
    rated = train_ui.loc[user_id].dropna().index
    scores = pd.Series(0.0, index=train_ui.columns)
    for item in rated:
        # Similarities from this rated item to the not-yet-rated items.
        sims = item_sim_df[item].drop(rated, errors='ignore')
        scores[sims.index] += sims
    # Rated items still carry their initial 0.0 score, so drop them before
    # ranking. A stable sort keeps equal scores in a reproducible order.
    ranked = scores.drop(rated, errors='ignore').sort_values(
        ascending=False, kind='mergesort'
    )
    return ranked.head(k).index.tolist()
# Demo: the five highest-scoring unseen items for user 1.
print(
    'Top-5 for user 1:',
    recommend_topn(user_id=1, train_ui=user_item, item_sim_df=item_sim_df, k=5),
)
四、评估与优化实践
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。