STEP1:创建数据库并导入数据
#创建数据库
create database weblog;
#使用数据库
use weblog;
#创建表
create table weblog (ip string,time string,url string) row format delimited fields terminated by '\t';
#加载数据
load data inpath '/weblog/final.log' into table weblog;
STEP2:按小时进行浏览量处理
#创建浏览量表
create table pageviews(time string,number int) row format delimited fields terminated by ',';
#按小时查询浏览量并插入到pageviews
insert into pageviews(time , number) select '2013053017',count(*) from weblog where time like '2013053017%';
insert into pageviews(time , number) select '2013053018',count(*) from weblog where time like '2013053018%';
insert into pageviews(time , number) select '2013053019',count(*) from weblog where time like '2013053019%';
insert into pageviews(time , number) select '2013053020',count(*) from weblog where time like '2013053020%';
insert into pageviews(time , number) select '2013053021',count(*) from weblog where time like '2013053021%';
insert into pageviews(time , number) select '2013053022',count(*) from weblog where time like '2013053022%';
insert into pageviews(time , number) select '2013053023',count(*) from weblog where time like '2013053023%';
#计算总浏览量
select sum(number) from pageviews;
STEP3:按小时进行注册用户处理
#创建注册用户数表
create table registernumber(time string,number int)row format delimited fields terminated by ',';
#按小时查询注册用户数并插入到数据库registernumber
insert into registernumber(time , number) select '2013053017',count(*) from weblog where time like '2013053017%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053018',count(*) from weblog where time like '2013053018%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053019',count(*) from weblog where time like '2013053019%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053020',count(*) from weblog where time like '2013053020%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053021',count(*) from weblog where time like '2013053021%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053022',count(*) from weblog where time like '2013053022%' and url like '%member.php?mod=register%';
insert into registernumber(time , number) select '2013053023',count(*) from weblog where time like '2013053023%' and url like '%member.php?mod=register%';
#计算总注册用户数量
select sum(number) from registernumber;
STEP4:按ip进行访问量处理
#创建ip访问页面数量表
create table ip_page_number(ip string,number int) row format delimited fields terminated by ',';
#查询每个ip访问的页面的数量并插入到数据库ip_page_number
insert into ip_page_number(ip,number) select ip,count(*) from weblog group by ip;
STEP5:按不同页面对访问量进行处理
#创建页面访问量表
#forum包括forum.php和forum%.html...
create table page_visti_number(page string,number int) row format delimited fields terminated by ',';
#查询并插入到数据库
insert into page_visti_number(page, number) select 'api.php',count(*) from weblog where url like '%api.php%';
insert into page_visti_number(page, number) select 'connect.php',count(*) from weblog where url like '%connect.php%';
insert into page_visti_number(page, number) select 'forum',count(*) from weblog where url like '%forum%';
insert into page_visti_number(page, number) select 'home.php',count(*) from weblog where url like '%home.php%';
insert into page_visti_number(page, number) select 'like.php',count(*) from weblog where url like '%like.php%';
insert into page_visti_number(page, number) select 'member.php',count(*) from weblog where url like '%member.php%';
insert into page_visti_number(page, number) select 'misc.php',count(*) from weblog where url like '%misc.php%';
insert into page_visti_number(page, number) select 'my.php',count(*) from weblog where url like '%my.php%';
insert into page_visti_number(page, number) select 'search.php',count(*) from weblog where url like '%search.php%';
insert into page_visti_number(page, number) select 'api/connect/like.php',count(*) from weblog where url like '%api/connect/like.php%';
STEP6:Sqoop将数据导入MySQL,注意将连接的Mysql的jar包拷到sqoop的lib文件夹里
#列出mysql数据库中的所有数据库命令
./sqoop list-databases --connect jdbc:mysql://localhost:3306/ --username root --password root
#导出
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table PageViews --export-dir /user/hive/warehouse/weblog.db/pageviews
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table IpVisitDetailes --export-dir /user/hive/warehouse/weblog.db/ip_page_number
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table RegisterNumber --export-dir /user/hive/warehouse/weblog.db/registernumbe
./sqoop export --connect jdbc:mysql://hadoop1:3306/weblog --username root --password root --table PageVistiDetails--export-dir /user/hive/warehouse/page_visti_number
STEP7:使用数据库管理工具连接hadoop1的mysql,推荐使用navicat
连接过程可能报错:远程连接Mysql时出现Can't connect to Mysql on '主机名'(10061),解决办法参考(远程连接Mysql异常解决)