项目说明:部分思路参考 和鲸社区-巴西电商Olist订单数据分析
数据来源:kaggle
数据说明:本数据集为巴西电商Olist的交易订单数据,提供了该平台16年到18年近10万条订单交易记录,数据集包含9张数据表。

一、分析框架:

  1. 流量指标:活跃用户数(DAU、MAU、时段)
  2. 运营指标:GMV(季度、月)、ARPU(季度、月)、订单数(天、月、时段)
  3. RFM用户价值分层:各层次用户品类(热门指数 = 金额 + 评价分数)

二、数据清洗:

# 原数据表名过长,为方便后续处理对表重新命名
# 重命名表
RENAME TABLE olist_customers_dataset to customer;
RENAME TABLE olist_geolocation_dataset to geo;
RENAME TABLE olist_orders_dataset to orders;
RENAME TABLE olist_order_items_dataset to item;
RENAME TABLE olist_order_payments_dataset to payment;
RENAME TABLE olist_order_reviews_dataset to review;
RENAME TABLE olist_products_dataset to product ;
RENAME TABLE olist_sellers_dataset to seller;
RENAME TABLE product_category_name_translation to category;
# 空值处理
# 从kaggle数据源提供的各表各列空值情况可知,review表的空值集中在review_comment_title列和review_comment_message列
# 而orders表的空值集中在order_approved_at列、order_delivered_carrier_date列和order_delivered_customer_date列
# 用0替换这几列空值数据
UPDATE review
SET review_comment_title=0
where review_comment_title is NULL;

UPDATE review
SET review_comment_message=0
WHERE review_comment_message is NULL;

UPDATE orders
SET order_approved_at=0
where order_approved_at is NULL;

UPDATE orders
SET order_delivered_carrier_date= 0
where order_delivered_carrier_date is NULL;

UPDATE orders
SET order_delivered_customer_date=0
where order_delivered_customer_date is NULL;
# 检查重复值
# orders表
SELECT order_id from orders
group by order_id
HAVING count(*)>1;

# item表
SELECT order_id from item
GROUP BY order_id,order_item_id
HAVING count(*)>1;

# review表
SELECT review_id,order_id
from review
group by review_id,order_id
HAVING count(*)>1;

# product表
SELECT product_id from product
GROUP BY product_id
HAVING count(*)>1;

# category表
SELECT product_category_name_english from category
GROUP BY product_category_name_english
HAVING count(*)>1;

# 本次分析所用到的表均没有重复值
# 提取订单时间数据
CREATE TABLE order_time AS
SELECT order_id,customer_id,
year(order_purchase_timestamp) as y,
quarter(order_purchase_timestamp) as q,
month(order_purchase_timestamp) as m,
date(order_purchase_timestamp) as date,
day(order_purchase_timestamp) as d,
hour(order_purchase_timestamp) as h 
from orders
where order_purchase_timestamp not like '2016-09-%' # 过滤数据量异常的月份
and order_purchase_timestamp not like '2016-12-%'
and order_purchase_timestamp not like '2018-09-%' 
and order_purchase_timestamp not like '2018-10-%' 
# 计算每笔订单金额
CREATE view total_order_value as
select order_id,product_id,seller_id,
       (price*count(*))+(freight_value*count(*)) as order_value
from item
group by order_id;

# 整合每笔订单的时间和金额,为后面计算GMV等指标做准备
CREATE view order_detail as
select a.order_id,product_id,seller_id,
       customer_id,round(order_value,2) as order_value,
       y,q,m,date,d,h 
from total_order_value a 
inner join order_time b 
on a.order_id=b.order_id ;

三、数据分析

1. 用户粘性分析

用户粘性通常用 DAU / MAU 的公式来计算,比值越趋近于1表明用户活跃度越高,在比值低于0.2时,应用的传播性和互动性将会很弱。

# 样本数据中,每个用户只有一笔订单记录,订单和用户是一一对应的,分布趋势一致,所以这里订单数和活跃用户数放在一起分析
# 活跃用户数时间分布
# 日活跃用户数
select date 日期,count(DISTINCT customer_id) DAU
from order_detail
group by date
order by date;

# 月活跃用户数
CREATE view MAU as
select y 年,m 月,count(DISTINCT customer_id) MAU
from order_detail
group by y,m
order by y,m;

# 用户粘性趋势分析(用户粘性=当月平均日活/当月月活)
# 构造当月平均日活
CREATE view avg_DAU as
select y 年,m 月,round(avg(dau),2) avg_DAU
from (select y,m,date,count(distinct customer_id) dau
      from order_detail
      group by date
      order by date asc) t
group by m;

# 计算月度用户粘性
CREATE view user_viscocity as
select y 年,m 月,concat(round(avg_dau/MAU*100,2),'%') 用户粘性
from avg_dau a join MAU b
on a.年 = b.年 and a.月 = b.月

图片说明

整体来看Olist用户粘性并不高,而且呈现逐月下降的趋势,可以深入发掘2018年2月、8月用户粘性上升的原因,进一步发挥优势。

2. GMV分析

GMV指标属于电商平台企业成交类指标,主要指拍下订单的总金额,包含付款和未付款的两部分订单,而GMV统计的指标是其二者之和。

# 季度 GMV
select y 年份,q 季度,sum(order_value) 季度GMV
from order_detail
group by y,q
order by y,q;

# 月度 GMV
select y 年份,m 月份,sum(order_value) 月GMV
from order_detail
group by y,m
order by y,m;

图片说明

图片说明
整体来看,Olist的GMV呈上升趋势,但增速有所放缓。2018年第三季度由于缺失9月份数据,故不能正确反映实际业务情况。

3. ARPU分析

ARPU指的是一个时期内(通常为一个月或一年)平均每个用户贡献的业务收入。相对用户数量,业务收入越高,ARPU值越大。当用户构成中高端客户占的比重越高,ARPU值也越高。

# 各季度 ARPU 值 = GMV / 季度活跃用户数
select y 年份,q 季度,
       round((sum(order_value)/count(DISTINCT customer_id)),2) 季度ARPU
from order_detail
group by y,q
order by y,q;

# 各月 ARPU 值 = GMV / 月活跃用户数
select y 年份,m 月份,
       round((sum(order_value)/count(DISTINCT customer_id)),2) 月ARPU
from order_detail
group by y,m
order by y,m;

图片说明

图片说明
从ARPU的同期比较情况来看,2018年1月和2月均较上一年出现明显下降,5、6、7月则较上一年出现明显上升。

4. RFM模型用户分层

RFM模型是衡量客户价值和客户创利能力的重要工具和手段。该机械模型通过一个客户的近期购买行为、购买的总体频率以及花了多少钱3项指标来描述该客户的价值状况。

# RFM用户分群
# 查看所有用户的消费次数
SELECT customer_id, count(*) from order_detail
GROUP BY customer_id
HAVING count(*)>1;
# 由于所有用户消费次数都只有一次,故Frequency默认为低

# 构造R值
CREATE VIEW Recency AS
SELECT customer_id,(CASE WHEN DATEDIFF(d,(SELECT MAX(d) FROM order_detail)) >
                              (SELECT AVG(DATEDIFF(d,(SELECT max(d)
                                                      FROM order_detail)))
                               FROM order_detail)
                    THEN 1 ELSE 0 END) R
FROM order_detail;

# 构造M值
# 建立每个顾客对应的消费金额视图
CREATE VIEW ordervalue_per_user AS
select customer_id,sum(order_value) sum_value
from order_detail
group by customer_id;

# 赋予M值
CREATE VIEW Monetary AS
select customer_id,(case when (select avg(sum_value) from ordervalue_per_user) < sum_value
                    then 1 else 0 end) M
from ordervalue_per_user;

# 构造RFM分层
CREATE VIEW RFM AS
SELECT Recency.customer_id, (CASE 
WHEN R=1 AND M=1 THEN '重要发展用户'
WHEN R=0 AND M=1 THEN '重要挽留用户'
WHEN R=1 AND M=0 THEN '一般发展用户'
WHEN R=0 AND M=0 THEN '一般挽留用户'
ELSE '其他' END) AS 用户类型
FROM Recency INNER JOIN Monetary ON Recency.customer_id=Monetary.customer_id;

5. 各类型用户的热门商品类型

热门指数 = 0.7 × 消费金额 + 0.3 × 商品评分 × 10000
这里乘10000是为了平衡消费金额和评分之间的数量级差距

# 查看各类型用户数量
SELECT 用户类型,count(*) 数量 from rfm
GROUP BY 用户类型
ORDER BY 用户类型;

# 重要发展用户的热门商品品类
SELECT e.product_category_name_english 商品品类,SUM(a.order_value) 消费金额,ROUND(AVG(c.review_score),2) 商品评分,
(0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) 热门指数,
rank() over(ORDER BY (0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) DESC) 热门排名
FROM order_detail a 
INNER JOIN (SELECT customer_id from rfm WHERE 用户类型='重要发展用户' ) as b ON a.customer_id=b.customer_id
LEFT JOIN review c ON a.order_id=c.order_id
LEFT JOIN product d ON a.product_id=d.product_id
LEFT JOIN category e ON d.product_category_name=e.product_category_name
GROUP BY e.product_category_name_english;


# 重要挽留用户的热门商品品类
SELECT e.product_category_name_english 商品品类,SUM(a.order_value) 消费金额,ROUND(AVG(c.review_score),2) 商品评分,
(0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) 热门指数,
rank() over(ORDER BY (0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) DESC) 热门排名
FROM order_detail a 
INNER JOIN (SELECT customer_id from rfm WHERE 用户类型='重要挽留用户' ) as b ON a.customer_id=b.customer_id
LEFT JOIN review c ON a.order_id=c.order_id
LEFT JOIN product d ON a.product_id=d.product_id
LEFT JOIN category e ON d.product_category_name=e.product_category_name
GROUP BY e.product_category_name_english;


# 一般发展用户的热门商品品类
SELECT e.product_category_name_english 商品品类,SUM(a.order_value) 消费金额,ROUND(AVG(c.review_score),2) 商品评分,
(0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) 热门指数,
rank() over(ORDER BY (0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) DESC) 热门排名
FROM order_detail a 
INNER JOIN (SELECT customer_id from rfm WHERE 用户类型='一般发展用户' ) as b ON a.customer_id=b.customer_id
LEFT JOIN review c ON a.order_id=c.order_id
LEFT JOIN product d ON a.product_id=d.product_id
LEFT JOIN category e ON d.product_category_name=e.product_category_name
GROUP BY e.product_category_name_english;


# 一般挽留用户的热门商品品类
SELECT e.product_category_name_english 商品品类,SUM(a.order_value) 消费金额,ROUND(AVG(c.review_score),2) 商品评分,
(0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) 热门指数,
rank() over(ORDER BY (0.7*SUM(a.order_value)+0.3*10000*ROUND(AVG(c.review_score),7)) DESC) 热门排名
FROM order_detail a 
INNER JOIN (SELECT customer_id from rfm WHERE 用户类型='一般挽留用户' ) as b ON a.customer_id=b.customer_id
LEFT JOIN review c ON a.order_id=c.order_id
LEFT JOIN product d ON a.product_id=d.product_id
LEFT JOIN category e ON d.product_category_name=e.product_category_name
GROUP BY e.product_category_name_english;