六 实时产生推荐结果

6.1 推荐任务处理

  • CTR预测模型 + 特征 ==> 预测结果 ==> TOP-N列表

  • 特征获取

import redis
import json
import pandas as pd
from pyspark.ml.linalg import DenseVector


def create_datasets(userId, pid):
    client_of_recall = redis.StrictRedis(host="192.168.19.137", port=6379, db=9)
    client_of_features = redis.StrictRedis(host="192.168.19.137", port=6379, db=10)
    # 获取用户特征
    user_feature = json.loads(client_of_features.hget("user_features", userId))
    
    # 获取用户召回集
    recall_sets = client_of_recall.smembers(userId)
    
    result = []
    
    # 遍历召回集
    for adgroupId in recall_sets:
        adgroupId = int(adgroupId)
        # 获取该广告的特征值
        ad_feature = json.loads(client_of_features.hget("ad_features", adgroupId))
        
        features = {
   }
        features.update(user_feature)
        features.update(ad_feature)

        for k,v in features.items():
            if v is None:
                features[k] = -1

        features_col = [
            # 特征值
            "price",
            "cms_group_id",
            "final_gender_code",
            "age_level",
            "shopping_level",
            "occupation",
            "pid", 
            "pvalue_level",
            "new_user_class_level"
        ]
        ''' "cms_group_id", 类别型特征,约13个分类 ==> 13维 "final_gender_code", 类别型特征,2个分类 ==> 2维 "age_level", 类别型特征,7个分类 ==>7维 "shopping_level", 类别型特征,3个分类 ==> 3维 "occupation", 类别型特征,2个分类 ==> 2维 '''

        price = float(features["price"])

        pid_value = [0 for i in range(2)]#[0,0]
        cms_group_id_value = [0 for i in range(13)]
        final_gender_code_value = [0 for i in range(2)]
        age_level_value = [0 for i in range(7)]
        shopping_level_value = [0 for i in range(3)]
        occupation_value = [0 for i in range(2)]
        pvalue_level_value = [0 for i in range(4)]
        new_user_class_level_value = [0 for i in range(5)]

        pid_value[pid_rela[pid]] = 1
        cms_group_id_value[cms_group_id_rela[int(features["cms_group_id"])]] = 1
        final_gender_code_value[final_gender_code_rela[int(features["final_gender_code"])]] = 1
        age_level_value[age_level_rela[int(features["age_level"])]] = 1
        shopping_level_value[shopping_level_rela[int(features["shopping_level"])]] = 1
        occupation_value[occupation_rela[int(features["occupation"])]] = 1
        pvalue_level_value[pvalue_level_rela[int(features["pvalue_level"])]] = 1
        new_user_class_level_value[new_user_class_level_rela[int(features["new_user_class_level"])]] = 1
 # print(pid_value)
# print(cms_group_id_value)
# print(final_gender_code_value)
# print(age_level_value)
# print(shopping_level_value)
# print(occupation_value)
# print(pvalue_level_value)
# print(new_user_class_level_value)
        
        vector = DenseVector([price] + pid_value + cms_group_id_value + final_gender_code_value\
        + age_level_value + shopping_level_value + occupation_value + pvalue_level_value + new_user_class_level_value)
        
        result.append((userId, adgroupId, vector))
        
    return result

# create_datasets(88, "430548_1007")
  • 载入训练好的模型
from pyspark.ml.classification import LogisticRegressionModel
CTR_model = LogisticRegressionModel.load("hdfs://localhost:9000/models/CTRModel_AllOneHot.obj")
pdf = pd.DataFrame(create_datasets(8, "430548_1007"), columns=["userId", "adgroupId", "features"])
datasets = spark.createDataFrame(pdf)
datasets.show()

显示结果:

+------+---------+--------------------+
|userId|adgroupId|            features|
+------+---------+--------------------+
|     8|   445914|[9.89999961853027...|
|     8|   258252|[7.59999990463256...|
|     8|   129682|[8.5,1.0,0.0,1.0,...|
|     8|   763027|[68.0,1.0,0.0,1.0...|
|     8|   292027|[16.0,1.0,0.0,1.0...|
|     8|   430023|[34.2000007629394...|
|     8|   133457|[169.0,1.0,0.0,1....|
|     8|   816999|[5.0,1.0,0.0,1.0,...|
|     8|   221714|[4.80000019073486...|
|     8|   186334|[106.0,1.0,0.0,1....|
|     8|   169717|[2.20000004768371...|
|     8|    31314|[15.8000001907348...|
|     8|   815312|[2.29999995231628...|
|     8|   199445|[5.0,1.0,0.0,1.0,...|
|     8|   746178|[16.7999992370605...|
|     8|   290950|[6.5,1.0,0.0,1.0,...|
|     8|   221585|[18.5,1.0,0.0,1.0...|
|     8|   692672|[47.0,1.0,0.0,1.0...|
|     8|   797982|[33.0,1.0,0.0,1.0...|
|     8|   815219|[2.40000009536743...|
+------+---------+--------------------+
only showing top 20 rows
prediction = CTR_model.transform(datasets).sort("probability")
prediction.show()
+------+---------+--------------------+--------------------+--------------------+----------+
|userId|adgroupId|            features|       rawPrediction|         probability|prediction|
+------+---------+--------------------+--------------------+--------------------+----------+
|     8|   631204|[19888.0,1.0,0.0,...|[2.69001234046578...|[0.93643471623189...|       0.0|
|     8|   583215|[3750.0,1.0,0.0,1...|[2.69016170680037...|[0.93644360664433...|       0.0|
|     8|   275819|[3280.0,1.0,0.0,1...|[2.69016605691669...|[0.93644386554961...|       0.0|
|     8|   401433|[1200.0,1.0,0.0,1...|[2.69018530849532...|[0.93644501133142...|       0.0|
|     8|    29466|[640.0,1.0,0.0,1....|[2.69019049161265...|[0.93644531980785...|       0.0|
|     8|   173327|[356.0,1.0,0.0,1....|[2.69019312019358...|[0.93644547624893...|       0.0|
|     8|   241402|[269.0,1.0,0.0,1....|[2.69019392542787...|[0.93644552417271...|       0.0|
|     8|   351366|[246.0,1.0,0.0,1....|[2.69019413830591...|[0.93644553684221...|       0.0|
|     8|   229827|[238.0,1.0,0.0,1....|[2.69019421235044...|[0.93644554124900...|       0.0|
|     8|   164807|[228.0,1.0,0.0,1....|[2.69019430490611...|[0.93644554675747...|       0.0|
|     8|   227731|[199.0,1.0,0.0,1....|[2.69019457331754...|[0.93644556273205...|       0.0|
|     8|   265403|[198.0,1.0,0.0,1....|[2.69019458257311...|[0.93644556328290...|       0.0|
|     8|   569939|[188.0,1.0,0.0,1....|[2.69019467512877...|[0.93644556879138...|       0.0|
|     8|   277335|[181.5,1.0,0.0,1....|[2.69019473528996...|[0.93644557237189...|       0.0|
|     8|   575633|[180.0,1.0,0.0,1....|[2.69019474917331...|[0.93644557319816...|       0.0|
|     8|   201867|[179.0,1.0,0.0,1....|[2.69019475842887...|[0.93644557374900...|       0.0|
|     8|    25542|[176.0,1.0,0.0,1....|[2.69019478619557...|[0.93644557540155...|       0.0|
|     8|   133457|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...|       0.0|
|     8|   494224|[169.0,1.0,0.0,1....|[2.69019485098454...|[0.93644557925748...|       0.0|
|     8|   339382|[163.0,1.0,0.0,1....|[2.69019490651794...|[0.93644558256256...|       0.0|
+------+---------+--------------------+--------------------+--------------------+----------+
only showing top 20 rows
  • TOP-20
# TOP-20
prediction.select("adgroupId").head(20)

显示结果:

[Row(adgroupId=631204),
 Row(adgroupId=583215),
 Row(adgroupId=275819),
 Row(adgroupId=401433),
 Row(adgroupId=29466),
 Row(adgroupId=173327),
 Row(adgroupId=241402),
 Row(adgroupId=351366),
 Row(adgroupId=229827),
 Row(adgroupId=164807),
 Row(adgroupId=227731),
 Row(adgroupId=265403),
 Row(adgroupId=569939),
 Row(adgroupId=277335),
 Row(adgroupId=575633),
 Row(adgroupId=201867),
 Row(adgroupId=25542),
 Row(adgroupId=133457),
 Row(adgroupId=494224),
 Row(adgroupId=339382)]
[i.adgroupId for i in prediction.select("adgroupId").head(20)]

显示结果:

[631204,
 583215,
 275819,
 401433,
 29466,
 173327,
 241402,
 351366,
 229827,
 164807,
 227731,
 265403,
 569939,
 277335,
 575633,
 201867,
 25542,
 133457,
 494224,
 339382]