2、Hive数据处理

STEP1:创建数据库并导入数据

#创建数据库
create database weblog;
#使用数据库
use weblog;
#创建表
create table weblog (ip string,time string,url string) row format delimited fields terminated by '\t';
#加载数据
load data inpath '/weblog/final.log' into table weblog;

STEP2:按小时进行浏览量处理

#创建浏览量表
create table pageviews(time string,number int) row format delimited fields terminated by ',';

#按小时查询浏览量并插入到pageviews
insert into pageviews(time , number) select '2013053017',count(*) from weblog where time like '2013053017%';
insert into pageviews(time , number) select '2013053018',count(*) from weblog where time like '2013053018%';
insert into pageviews(time , number) select '2013053019',count(*) from weblog where time like '2013053019%';
insert into pageviews(time , number) select '2013053020',count(*) from weblog where time like '2013053020%';
insert into pageviews(time , number) select '2013053021',count(*) from weblog where time like '2013053021%';
insert into pageviews(time , number) select '2013053022',count(*) from weblog where time like '2013053022%';
insert into pageviews(time , number) select '2013053023',count(*) from weblog where time like '2013053023%';

#计算总浏览量
select sum(number) from pageviews;

STEP3:按小时进行注册用户处理

#创建注册用户数表
create table registernumber(time string,number int)row format delimited fields terminated by ',';

#按小时查询注册用户数并插入到数据库registernumber
insert into registernumber(time , number) select '2013053017',count(*) from weblog where time like '2013053017%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053018',count(*) from weblog where time like '2013053018%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053019',count(*) from weblog where time like '2013053019%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053020',count(*) from weblog where time like '2013053020%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053021',count(*) from weblog where time like '2013053021%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053022',count(*) from weblog where time like '2013053022%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053023',count(*) from weblog where time like '2013053023%' and url like '%member.php?mod=register%';

#计算总注册用户数量
select sum(number) from registernumber;

STEP4:按ip进行访问量处理

#创建ip访问页面数量表
create table ip_page_number(ip string,number int) row format delimited fields terminated by ',';

#查询每个ip访问的页面的数量并插入到数据库ip_page_number
insert into ip_page_number(ip,number) select ip,count(*) from weblog group by ip;

STEP5:按不同页面对访问量进行处理

#创建页面访问量表
#forum包括forum.php和forum%.html...
create table page_visti_number(page string,number int) row format delimited fields terminated by ',';
#查询并插入到数据库
insert into page_visti_number(page, number) select 'api.php',count(*) from weblog where url like '%api.php%';
insert into page_visti_number(page, number) select 'connect.php',count(*) from weblog where url like '%connect.php%';
insert into page_visti_number(page, number) select 'forum',count(*) from weblog where url like '%forum%';
insert into page_visti_number(page, number) select 'home.php',count(*) from weblog where url like '%home.php%';
insert into page_visti_number(page, number) select 'like.php',count(*) from weblog where url like '%like.php%';
insert into page_visti_number(page, number) select 'member.php',count(*) from weblog where url like '%member.php%';
insert into page_visti_number(page, number) select 'misc.php',count(*) from weblog where url like '%misc.php%';
insert into page_visti_number(page, number) select 'my.php',count(*) from weblog where url like '%my.php%';
insert into page_visti_number(page, number) select 'search.php',count(*) from weblog where url like '%search.php%';
insert into page_visti_number(page, number) select 'api/connect/like.php',count(*) from weblog where url like '%api/connect/like.php%';

STEP6:Sqoop将数据导入MySQL,注意将连接的Mysql的jar包拷到sqoop的lib文件夹里

#列出mysql数据库中的所有数据库命令
./sqoop list-databases --connect jdbc:mysql://localhost:3306/ --username root --password root
#导出
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table PageViews --export-dir /user/hive/warehouse/weblog.db/pageviews
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table IpVisitDetailes --export-dir /user/hive/warehouse/weblog.db/ip_page_number
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table RegisterNumber --export-dir /user/hive/warehouse/weblog.db/registernumbe
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table PageVistiDetails--export-dir /user/hive/warehouse/page_visti_number

STEP7:使用数据库管理工具连接hadoop1的mysql,推荐使用navicat

连接过程可能报错:远程连接Mysql时出现Can't connect to Mysql on '主机名'(10061),解决办法参考(远程连接Mysql异常解决)

连接效果:

clipboard.png

clipboard.png
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容