文章目录
六 实时产生推荐结果
6.1 推荐任务处理
- CTR预测模型 + 特征 ==> 预测结果 ==> TOP-N列表
- 特征获取
import redis
import json
import pandas as pd
from pyspark.ml.linalg import DenseVector
def _one_hot(size, index):
    """Return a list of ``size`` zeros with a single 1 at position ``index``."""
    encoded = [0] * size
    encoded[index] = 1
    return encoded


def create_datasets(userId, pid):
    """Build one CTR feature vector for every ad in a user's recall set.

    The user's features and recall set are read from Redis, each recalled
    ad's features are merged on top of the user's features, and the merged
    record is turned into a dense vector.

    Vector layout (39 values in total, in this order):
        price                       1 value (raw float)
        pid                         2-way one-hot
        cms_group_id               13-way one-hot
        final_gender_code           2-way one-hot
        age_level                   7-way one-hot
        shopping_level              3-way one-hot
        occupation                  2-way one-hot
        pvalue_level                4-way one-hot
        new_user_class_level        5-way one-hot

    Args:
        userId: key of the user's recall set, and field name inside the
            "user_features" hash.
        pid: value looked up in ``pid_rela`` to select the pid one-hot slot.

    Returns:
        A list of ``(userId, adgroupId, DenseVector)`` tuples, one per ad in
        the recall set (empty when the recall set is empty).

    NOTE(review): the ``*_rela`` lookup tables (``pid_rela``,
    ``cms_group_id_rela``, ...) are not defined in this block; they are
    assumed to map a raw category value to its one-hot position and must
    already exist at module level. Since nulls are replaced by -1 below, they
    presumably also contain an entry for -1 -- confirm.
    NOTE(review): if a hash field is absent, ``hget`` presumably returns None
    and ``json.loads`` will then raise -- confirm whether that can happen.
    """
    client_of_recall = redis.StrictRedis(host="192.168.19.137", port=6379, db=9)
    client_of_features = redis.StrictRedis(host="192.168.19.137", port=6379, db=10)

    # Features of the user, stored as a JSON string.
    user_feature = json.loads(client_of_features.hget("user_features", userId))

    # Candidate ads recalled for this user.
    recall_sets = client_of_recall.smembers(userId)

    result = []
    for adgroupId in recall_sets:
        adgroupId = int(adgroupId)

        # Features of this ad, stored as a JSON string.
        ad_feature = json.loads(client_of_features.hget("ad_features", adgroupId))

        # Ad features are applied last, so they win on any shared key.
        features = {}
        features.update(user_feature)
        features.update(ad_feature)

        # Null feature values are encoded as -1.
        features = {k: (-1 if v is None else v) for k, v in features.items()}

        price = float(features["price"])

        # Each categorical feature becomes a one-hot list; the *_rela tables
        # translate the raw value into the slot index.
        pid_value = _one_hot(2, pid_rela[pid])
        cms_group_id_value = _one_hot(
            13, cms_group_id_rela[int(features["cms_group_id"])])
        final_gender_code_value = _one_hot(
            2, final_gender_code_rela[int(features["final_gender_code"])])
        age_level_value = _one_hot(
            7, age_level_rela[int(features["age_level"])])
        shopping_level_value = _one_hot(
            3, shopping_level_rela[int(features["shopping_level"])])
        occupation_value = _one_hot(
            2, occupation_rela[int(features["occupation"])])
        pvalue_level_value = _one_hot(
            4, pvalue_level_rela[int(features["pvalue_level"])])
        new_user_class_level_value = _one_hot(
            5, new_user_class_level_rela[int(features["new_user_class_level"])])

        vector = DenseVector(
            [price]
            + pid_value
            + cms_group_id_value
            + final_gender_code_value
            + age_level_value
            + shopping_level_value
            + occupation_value
            + pvalue_level_value
            + new_user_class_level_value
        )

        result.append((userId, adgroupId, vector))

    return result
# create_datasets(88, "430548_1007")
- 载入训练好的模型
from pyspark.ml.classification import LogisticRegressionModel
# Load the already-trained CTR logistic-regression model from HDFS.
CTR_model = LogisticRegressionModel.load("hdfs://localhost:9000/models/CTRModel_AllOneHot.obj")
# Build the (userId, adgroupId, features) rows for user 8 and pid "430548_1007".
pdf = pd.DataFrame(create_datasets(8, "430548_1007"), columns=["userId", "adgroupId", "features"])
# Convert the pandas frame to a Spark DataFrame so the model can score it.
# NOTE(review): `spark` is not defined in this excerpt -- assumed to be an existing SparkSession.
datasets = spark.createDataFrame(pdf)
datasets.show()
显示结果:
+------+---------+--------------------+
|userId|adgroupId| features|
+------+---------+--------------------+
| 8| 445914|[9.89999961853027...|
| 8| 258252|[7.59999990463256...|
| 8| 129682|[8.5,1.0,0.0,1.0,...|
| 8| 763027|[68.0,1.0,0.0,1.0...|
| 8| 292027|[16.0,1.0,0.0,1.0...|
| 8| 430023|[34.2000007629394...|
| 8| 133457|[169.0,1.0,0.0,1....|
| 8| 816999|[5.0,1.0,0.0,1.0,...|
| 8| 221714|[4.80000019073486...|
| 8| 186334|[106.0,1.0,0.0,1....|
| 8| 169717|[2.20000004768371...|
| 8| 31314|[15.8000001907348...|
| 8| 815312|[2.29999995231628...|
| 8| 199445|[5.0,1.0,0.0,1.0,...|
| 8| 746178|[16.7999992370605...|
| 8| 290950|[6.5,1.0,0.0,1.0,...|
| 8| 221585|[18.5,1.0,0.0,1.0...|
| 8| 692672|[47.0,1.0,0.0,1.0...|
| 8| 797982|[33.0,1.0,0.0,1.0...|
| 8| 815219|[2.40000009536743...|
+------+---------+--------------------+
only showing top 20 rows
# Score every candidate ad, then order the rows by the "probability" column
# (sort() with no direction given, i.e. ascending).
# NOTE(review): "probability" is a vector column; in the output shown below its
# first component increases down the table. Presumably that component is
# P(no click), so ascending order lists the most-likely-clicked ads first --
# confirm, since the sort does not say which component drives the order.
prediction = CTR_model.transform(datasets).sort("probability")
prediction.show()
+------+---------+--------------------+--------------------+--------------------+----------+
|userId|adgroupId| features| rawPrediction| probability|prediction|
+------+---------+--------------------+--------------------+--------------------+----------+
| 8| 631204|[19888.0,1.0,0.0,...|[2.69001234046578...|[0.93643471623189...| 0.0|
| 8| 583215|[3750.0,1.0,0.0,1...|[2.69016170680037...|[0.93644360664433...| 0.0|
| 8| 275819|[3280.0,1.0,0.0,1...|[2.69016605691669...|[0.93644386554961...| 0.0|
| 8| 401433|[1200.0,1.0,0.0,1...|[2.69018530849532...|[0.93644501133142...| 0.0|
| 8| 29466|[640.0,1.0,0.0,1....|[2.69019049161265...|[0.93644531980785...| 0.0|
| 8| 173327|[356.0,1.0,0.0,1....|[2.69019312019358...|[0.93644547624893...| 0.0|
| 8| 241402|[269.0,1.0,0.0,1....|[2.69019392542787...|[0.93644552417271...| 0.0|
| 8| 351366|[246.0,1.0,0.0,1....|[2.69019413830591...|[0.93644553684221...| 0.0|
| 8| 229827|[238.0,1.0,0.0,1....|[2.69019421235044...|[0.93644554124900...| 0.0|
| 8| 164807|[228.0,1.0,0.0,1....|[2.69019430490611...|[0.93644554675747...| 0.0|
| 8| 227731|[199.0,1.0,0.0,1....|[2.69019457331754...|[0.93644556273205...| 0.0|
| 8| 265403|[198.0,1.0,0.0,1....|[2.69019458257311...|[0.93644556328290...| 0.0|
| 8| 569939|[188.0,1.0,0.0,1....|[2.69019467512877...|[0.93644556879138...| 0.0|
| 8| 277335|[181.5,1.0,0.0,1....|[2.69019473528996...|[0.93644557237189...| 0.0|
| 8| 575633|[180.0,1.0,0.0,1....|[2.69019474917331...|[0.93644557319816...| 0.0|
| 8| 201867|[179.0,1.0,0.0,1....|[2.69019475842887...|[0.93644557374900...| 0.0|
| 8| 25542|[176.0,1.0,0.0,1....|[2.69019478619557...|[0.93644557540155...| 0.0|
| 8| 133457|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...| 0.0|
| 8| 494224|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...| 0.0|
| 8| 339382|[163.0,1.0,0.0,1....|[2.69019490651794...|[0.93644558256256...| 0.0|
+------+---------+--------------------+--------------------+--------------------+----------+
only showing top 20 rows
- TOP-20
# TOP-20: the adgroupId of the first 20 rows of the sorted prediction result,
# returned as Row objects.
prediction.select("adgroupId").head(20)
显示结果:
[Row(adgroupId=631204),
Row(adgroupId=583215),
Row(adgroupId=275819),
Row(adgroupId=401433),
Row(adgroupId=29466),
Row(adgroupId=173327),
Row(adgroupId=241402),
Row(adgroupId=351366),
Row(adgroupId=229827),
Row(adgroupId=164807),
Row(adgroupId=227731),
Row(adgroupId=265403),
Row(adgroupId=569939),
Row(adgroupId=277335),
Row(adgroupId=575633),
Row(adgroupId=201867),
Row(adgroupId=25542),
Row(adgroupId=133457),
Row(adgroupId=494224),
Row(adgroupId=339382)]
# Same top 20, unwrapped from Row objects into a plain list of adgroupId values.
[i.adgroupId for i in prediction.select("adgroupId").head(20)]
显示结果:
[631204,
583215,
275819,
401433,
29466,
173327,
241402,
351366,
229827,
164807,
227731,
265403,
569939,
277335,
575633,
201867,
25542,
133457,
494224,
339382]