文章目录
4.4 hive综合案例
-
内容推荐数据处理
- 需求
- 根据用户行为以及文章标签筛选出用户最感兴趣(阅读最多)的标签
- 需求
-
相关数据
user_id article_id event_time
11,101,2018-12-01 06:01:10 22,102,2018-12-01 07:28:12 33,103,2018-12-01 07:50:14 11,104,2018-12-01 09:08:12 22,103,2018-12-01 13:37:12 33,102,2018-12-02 07:09:12 11,101,2018-12-02 18:42:12 35,105,2018-12-03 09:21:12 22,104,2018-12-03 16:42:12 77,103,2018-12-03 18:31:12 99,102,2018-12-04 00:04:12 33,101,2018-12-04 19:10:12 11,101,2018-12-05 09:07:12 35,102,2018-12-05 11:00:12 22,103,2018-12-05 12:11:12 77,104,2018-12-05 18:02:02 99,105,2018-12-05 20:09:11
- 文章数据
artical_id,artical_url,artical_keywords 101,http://www.itcast.cn/1.html,kw8|kw1 102,http://www.itcast.cn/2.html,kw6|kw3 103,http://www.itcast.cn/3.html,kw7 104,http://www.itcast.cn/4.html,kw5|kw1|kw4|kw9 105,http://www.itcast.cn/5.html,
-
数据上传hdfs
hadoop fs -mkdir /tmp/demo hadoop fs -mkdir /tmp/demo/user_action
-
创建外部表
- 用户行为表
drop table if exists user_actions; CREATE EXTERNAL TABLE user_actions( user_id STRING, article_id STRING, time_stamp STRING ) ROW FORMAT delimited fields terminated by ',' LOCATION '/tmp/demo/user_action';
- 文章表
drop table if exists articles; CREATE EXTERNAL TABLE articles( article_id STRING, url STRING, key_words array<STRING> ) ROW FORMAT delimited fields terminated by ',' COLLECTION ITEMS terminated BY '|' LOCATION '/tmp/demo/article_keywords'; /* key_words array<STRING> 数组的数据类型 COLLECTION ITEMS terminated BY '|' 数组的元素之间用'|'分割 */
- 查看数据
select * from user_actions; select * from articles;
-
分组查询每个用户的浏览记录
- collect_set/collect_list作用:
- 将group by中的某列转为一个数组返回
- collect_list不去重而collect_set去重
- collect_set
select user_id,collect_set(article_id) from user_actions group by user_id;
11 ["101","104"] 22 ["102","103","104"] 33 ["103","102","101"] 35 ["105","102"] 77 ["103","104"] 99 ["102","105"]
- collect_list
select user_id,collect_list(article_id) from user_actions group by user_id;
11 ["101","104","101","101"] 22 ["102","103","104","103"] 33 ["103","102","101"] 35 ["105","102"] 77 ["103","104"] 99 ["102","105"]
- sort_array: 对数组排序
select user_id,sort_array(collect_list(article_id)) as contents from user_actions group by user_id;
11 ["101","101","101","104"] 22 ["102","103","103","104"] 33 ["101","102","103"] 35 ["102","105"] 77 ["103","104"] 99 ["102","105"]
- collect_set/collect_list作用:
-
查看每一篇文章的关键字 lateral view explode
- explode函数 将array 拆分
select explode(key_words) from articles;
- lateral view 和 explode 配合使用,将一行数据拆分成多行数据,在此基础上可以对拆分的数据进行聚合
select article_id,kw from articles lateral view explode(key_words) t as kw;
101 kw8 101 kw1 102 kw6 102 kw3 103 kw7 104 kw5 104 kw1 104 kw4 104 kw9
select article_id,kw from articles lateral view outer explode(key_words) t as kw;
101 kw8 101 kw1 102 kw6 102 kw3 103 kw7 104 kw5 104 kw1 104 kw4 104 kw9 105 NULL #含有outer
-
根据文章id找到用户查看文章的关键字
- 原始数据
101 http://www.itcast.cn/1.html ["kw8","kw1"] 102 http://www.itcast.cn/2.html ["kw6","kw3"] 103 http://www.itcast.cn/3.html ["kw7"] 104 http://www.itcast.cn/4.html ["kw5","kw1","kw4","kw9"] 105 http://www.itcast.cn/5.html []
select a.user_id, b.kw from user_actions as a left outer JOIN (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) order by a.user_id;
11 kw1 11 kw8 11 kw5 11 kw1 11 kw4 11 kw1 11 kw9 11 kw8 11 kw1 11 kw8 22 kw1 22 kw7 22 kw9 22 kw4 22 kw5 22 kw7 22 kw3 22 kw6 33 kw8 33 kw1 33 kw3 33 kw6 33 kw7 35 NULL 35 kw6 35 kw3 77 kw9 77 kw1 77 kw7 77 kw4 77 kw5 99 kw3 99 kw6 99 NULL
-
根据文章id找到用户查看文章的关键字并统计频率
select a.user_id, b.kw,count(1) as weight from user_actions as a left outer JOIN (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw order by a.user_id,weight desc;
11 kw1 4 11 kw8 3 11 kw5 1 11 kw9 1 11 kw4 1 22 kw7 2 22 kw9 1 22 kw1 1 22 kw3 1 22 kw4 1 22 kw5 1 22 kw6 1 33 kw3 1 33 kw8 1 33 kw7 1 33 kw6 1 33 kw1 1 35 NULL 1 35 kw3 1 35 kw6 1 77 kw1 1 77 kw4 1 77 kw5 1 77 kw7 1 77 kw9 1 99 NULL 1 99 kw3 1 99 kw6 1
-
CONCAT:
CONCAT(str1,str2,…)返回结果为连接参数产生的字符串。如有任何一个参数为NULL ,则返回值为 NULL。
select concat(user_id,article_id) from user_actions;
CONCAT_WS:
使用语法为:CONCAT_WS(separator,str1,str2,…)
CONCAT_WS() 代表 CONCAT With Separator ,是CONCAT()的特殊形式。第一个参数是其它参数的分隔符。分隔符的位置放在要连接的两个字符串之间。分隔符可以是一个字符串,也可以是其它参数。如果分隔符为 NULL,则结果为 NULL。
select concat_ws(':',user_id,article_id) from user_actions;
-
将用户查看的关键字和频率合并成 key:value形式
select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w from user_actions as a left outer JOIN (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw;
11 kw1:4 11 kw4:1 11 kw5:1 11 kw8:3 11 kw9:1 22 kw1:1 22 kw3:1 22 kw4:1 22 kw5:1 22 kw6:1 22 kw7:2 22 kw9:1 33 kw1:1 33 kw3:1 33 kw6:1 33 kw7:1 33 kw8:1 35 1 35 kw3:1 35 kw6:1 77 kw1:1 77 kw4:1 77 kw5:1 77 kw7:1 77 kw9:1 99 1 99 kw3:1 99 kw6:1
-
将用户查看的关键字和频率合并成 key:value形式并按用户聚合
select cc.user_id,concat_ws(',',collect_set(cc.kw_w)) from( select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w from user_actions as a left outer JOIN (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw ) as cc group by cc.user_id;
11 kw1:4,kw4:1,kw5:1,kw8:3,kw9:1 22 kw1:1,kw3:1,kw4:1,kw5:1,kw6:1,kw7:2,kw9:1 33 kw1:1,kw3:1,kw6:1,kw7:1,kw8:1 35 1,kw3:1,kw6:1 77 kw1:1,kw4:1,kw5:1,kw7:1,kw9:1 99 1,kw3:1,kw6:1
-
将上面聚合结果转换成map
select cc.user_id,str_to_map(concat_ws(',',collect_set(cc.kw_w))) as wm from( select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w from user_actions as a left outer JOIN (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw ) as cc group by cc.user_id;
11 { "kw1":"4","kw4":"1","kw5":"1","kw8":"3","kw9":"1"} 22 { "kw1":"1","kw3":"1","kw4":"1","kw5":"1","kw6":"1","kw7":"2","kw9":"1"} 33 { "kw1":"1","kw3":"1","kw6":"1","kw7":"1","kw8":"1"} 35 { "1":null,"kw3":"1","kw6":"1"} 77 { "kw1":"1","kw4":"1","kw5":"1","kw7":"1","kw9":"1"} 99 { "1":null,"kw3":"1","kw6":"1"}
-
将用户的阅读偏好结果保存到表中
create table user_kws as select cc.user_id,str_to_map(concat_ws(',',collect_set(cc.kw_w))) as wm from( select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w from user_actions as a left outer JOIN (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw ) as cc group by cc.user_id;
-
从表中通过key查询map中的值
select user_id, wm['kw1'] from user_kws;
11 4 22 1 33 1 35 NULL 77 1 99 NULL
-
从表中获取map中所有的key 和 所有的value
select user_id,map_keys(wm),map_values(wm) from user_kws;
11 ["kw1","kw4","kw5","kw8","kw9"] ["4","1","1","3","1"] 22 ["kw1","kw3","kw4","kw5","kw6","kw7","kw9"] ["1","1","1","1","1","2","1"] 33 ["kw1","kw3","kw6","kw7","kw8"] ["1","1","1","1","1"] 35 ["1","kw3","kw6"] [null,"1","1"] 77 ["kw1","kw4","kw5","kw7","kw9"] ["1","1","1","1","1"] 99 ["1","kw3","kw6"] [null,"1","1"]
-
用lateral view explode把map中的数据转换成多列
select user_id,keyword,weight from user_kws lateral view explode(wm) t as keyword,weight;
11 kw1 4 11 kw4 1 11 kw5 1 11 kw8 3 11 kw9 1 22 kw1 1 22 kw3 1 22 kw4 1 22 kw5 1 22 kw6 1 22 kw7 2 22 kw9 1 33 kw1 1 33 kw3 1 33 kw6 1 33 kw7 1 33 kw8 1 35 1 NULL 35 kw3 1 35 kw6 1 77 kw1 1 77 kw4 1 77 kw5 1 77 kw7 1 77 kw9 1 99 1 NULL 99 kw3 1 99 kw6 1