SELECT dt
        ,ROUND(SUM(IF(DATEDIFF(next_active_date ,dt) = 1 , 1, 0))/COUNT(uid), 2) AS uv_left_rate
FROM (
    SELECT *
            ,DATE(in_time) AS dt
            ,FIRST_VALUE(in_time) OVER(PARTITION BY uid ORDER BY in_time) AS first_date
            ,lEAD(DATE(in_time),1) OVER(PARTITION BY uid ORDER BY DATE(in_time)) AS next_active_date
    FROM tb_user_log
) data1
WHERE in_time= first_date AND DATE_FORMAT(in_time, '%Y%m') = "202111"
GROUP BY dt
ORDER BY dt
  • 以上为第一次尝试,发现不能通过第二题,问题出现在有一个新用户开始到结束跨越了当天到下一天,但以上代码只考虑了用户一次的登入时间,没有考虑登出时间会落在第二天内,因此结果应该在那一天输出100%,却输出0%。


SELECT dt
        ,ROUND(SUM(IF(DATEDIFF(next_active_date ,dt) = 1 , 1, 0))/COUNT(uid), 2) AS uv_left_rate
FROM (
    SELECT *
            ,DATE(in_time) AS dt
            ,FIRST_VALUE(in_time) OVER(PARTITION BY uid ORDER BY in_time) AS first_date
            ,lEAD(DATE(in_time),1) OVER(PARTITION BY uid ORDER BY in_time) AS next_active_date
    FROM (
        SELECT uid, in_time 
        FROM tb_user_log
        UNION
        SELECT uid, out_time
        FROM tb_user_log
    ) data2
) data1
WHERE in_time= first_date AND DATE_FORMAT(in_time, '%Y%m') = "202111"
GROUP BY dt
ORDER BY dt
  • 以上为第二次尝试,将tb_user_log拆分为两个表再结合,ORDER BY DATE(in_time)改为ORDER BY in_time
  • 以此希望同时考虑登入状态与登出状态的时间,同时此时考虑登入状态与登出状态互不影响,即使登入状态与登出状态日期相同,lag将位于登出状态,也一定会存在登出状态或登入状态使得DATEDIFF(next_active_date ,dt) =1。
  • 但结果发现显示结果全是0%

SELECT dt
        ,ROUND(SUM(IF(DATEDIFF(next_active_date ,dt) = 1 , 1, 0))/COUNT(DISTINCT uid), 2) AS uv_left_rate
FROM (
    SELECT *
            ,DATE(in_time) AS dt
            ,FIRST_VALUE(in_time) OVER(PARTITION BY uid ORDER BY in_time) AS first_date
            ,lEAD(DATE(in_time),1) OVER(PARTITION BY uid ORDER BY in_time) AS next_active_date
    FROM (
        SELECT uid, in_time 
        FROM tb_user_log
        UNION
        SELECT uid, out_time
        FROM tb_user_log
    ) data2
) data1
WHERE DATE(in_time)= DATE(first_date) AND DATE_FORMAT(in_time, '%Y%m') = "202111"
GROUP BY dt
ORDER BY dt
  • 以上为第三次尝试,将限制条件改为WHERE DATE(in_time)= DATE(first_date),COUNT(DISTINCT uid)
  • 以此发现由于lag可能将新用户的登入时间对应的next标记为新用户的当天登出时间,当天登出时间的next标记为第二天登录的登入时间,这本身问题不大,但遇到in_time= first_date时,会导致要求数据必须是新用户的第一次登入时间,但是next的取值又极大可能出现该新用户通天的登出时间,从而DATEDIFF(next_active_date ,dt) = 1不成立,因此第二次尝试时几乎许多均为0%,因此放宽限制条件,将新用户开户当天的所有数据全部放进去,只要有一个满足in_time与next差值为1即为0,但需要强化分母的限制,采用DISTINCT,使得每个新用户当天多条数据只记录一次,结果从而满足。
  • 代码可以更新一下,比如从一开始就全部采用DATE形式,但是我要睡觉了,明天再说,晚安!