1.INSERT
官网说明:
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2...)[IF NOT EXISTS]] select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2...)] select_statement1 FROM from_statement;
hive> create table ruozedata_emp4 like ruozedata_emp;
hive> INSERT OVERWRITE TABLE ruozedata_emp4 select * FROM ruozedata_emp;
hive> select * from ruozedata_emp4;
hive> INSERT INTO TABLE ruozedata_emp4 select * FROM ruozedata_emp;
hive> select * from ruozedata_emp4;
数据被追加进去
上边两个命令都是select * ,两表都是一一对应的,如果insert的表的列数与原表不等或者列的顺序与原表不同会怎样呢?
hive> INSERT OVERWRITE TABLE ruozedata_emp4
> SELECT empno,ename from ruozedata_emp;
hive> INSERT INTO TABLE ruozedata_emp4
> select empno,job, ename,mgr, hiredate, salary, comm, deptno from ruozedata_emp;
(列的顺序与原表不同)
hive> select * from ruozedata_emp4;
2.Writing data into the filesystem from queries
官网说明:
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
[ROW FORMAT row_format] [STORED AS file_format] (Note: Only available starting with Hive 0.11.0)
SELECT ... FROM ...
3.where语句
hive> select * from ruozedata_emp where deptno=10;
hive> select * from ruozedata_emp where empno>=8000;
hive> select * from ruozedata_emp where salary between 800 and 1500;(左闭右闭)
hive> select * from ruozedata_emp limit 5;
hive> select * from ruozedata_emp where ename in ("SMITH","KING");
hive> select * from ruozedata_emp where ename not in ("SMITH","KING");
hive> select * from ruozedata_emp where comm is null;
4.聚合函数
max/min/count/sum/avg 特点:多进一出
hive> select count(1) from ruozedata_emp where deptno=10;
hive> select max(salary), min(salary), avg(salary), sum(salary) from ruozedata_emp;
5.分组函数 group by
1)
hive> select deptno,avg(salary) from ruozedata_emp group by deptno;(求部门平均工资)
hive> select ename,deptno,avg(salary) from ruozedata_emp group by deptno;
FAILED: SemanticException [Error 10025]: Line 1:7 Expression not in GROUP BY key 'ename'
报错原因:select中出现的字段,如果没有出现在组函数/聚合函数中,必须出现在group by里面
2)求每个部门(deptno)、工作岗位(job)的最高工资(salary)
hive> select deptno,job,max(salary) from ruozedata_emp group by deptno,job;
3)求平均工资大于2000的部门
hive> select deptno,avg(salary) from ruozedata_emp group by deptno having avg(salary)>2000;
不能用where,对于分组之后的结果进行筛选只能用having;where是用来对分组之前的单条数据进行筛选的,而且where必须写在group by之前
4)case when then
select ename,salary,
case
when salary>1 and salary<=1000 then 'lower'
when salary>1000 and salary<=2000 then 'middle'
when salary>2000 and salary<=4000 then 'high'
else 'highest'
end
from ruozedata_emp;
5)join
hive> select * from a join b;
inner join=join
outer join:left join,right join,full join
hive> select a.id,a.name,b.age from a join b on a.id=b.id;
hive> select a.id,a.name,b.age from a left join b on a.id=b.id;
hive> select a.id,a.name,b.age from a full join b on a.id=b.id;
6.分区表 partition
1)静态分区
hive> create table order_partition(
> ordernumber string,
> eventtime string
> )
> partitioned by (event_month string)
> row format delimited fields terminated by '\t';
hive> LOAD DATA LOCAL INPATH '/home/hadoop/data/order.txt'
> OVERWRITE INTO TABLE order_partition
> PARTITION(event_month='2014-05');
hive> select * from order_partition;
OK
10703007267488 2014-05-01 2014-05
10101043505096 2014-05-01 2014-05
10103043509747 2014-05-01 2014-05
10103043501575 2014-05-01 2014-05
10104043514061 2014-05-01 2014-05
hive> desc formatted order_partition;
[hadoop@hadoop001 data]$ hdfs dfs -ls hdfs://192.168.137.141:9000/user/hive/warehouse/ruozedata.db/order_partition
[hadoop@hadoop001 data]$ hdfs dfs -ls hdfs://192.168.137.141:9000/user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-05
hive> select * from order_partition where event_month='2014-05';
分区表查询的时候务必要记得带上分区条件(如 where event_month='2014-05'),否则会扫描全表
添加分区的方法
a)以上是通过建表的时候直接分区,hdfs里显示相应的分区会有相应的文件夹/order_partition/event_month=2014-05,那么可不可以通过直接在hdfs里新建一个分区文件夹来进行分区呢?
[hadoop@hadoop001 data]$ hdfs dfs -mkdir /user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-06
[hadoop@hadoop001 data]$ hdfs dfs -put order.txt /user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-06
新建了/event_month=2014-06文件夹,并将order.txt放入了该文件夹内
[hadoop@hadoop001 data]$ hdfs dfs -ls hdfs://192.168.137.141:9000/user/hive/warehouse/ruozedata.db/order_partition
hive> select * from order_partition where event_month='2014-06';
在hive里查询一下分区event_month=2014-06下的数据,却发现是空的:
进入mysql查看一下元数据
mysql> show databases;
mysql> use ruozedata_basic03;
mysql> show tables;
mysql> select * from partitions;
mysql> select * from partition_keys;
mysql> select * from partition_key_vals;
发现元数据里只有event_month=2014-05这一个分区,为什么呢?
查询官网的说明:
metastore没有感知到,需使用msck命令修复MySQL表的元数据信息,Hive上才能查询到相应的数据结果
hive> MSCK REPAIR TABLE order_partition;
OK
Partitions not in metastore: order_partition:event_month=2014-06
Repair: Added partition to metastore order_partition:event_month=2014-06
再次查看mysql里的元数据,分区信息都进来了
hive> select * from order_partition where event_month='2014-06';
但是,这个命令会刷新所有的分区信息,过于简单粗暴,不建议使用
b)应该用Add Partitions
ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec [LOCATION 'location'][, PARTITION partition_spec [LOCATION 'location'], ...];
partition_spec:
: (partition_column = partition_col_value, partition_column = partition_col_value, ...)
再来试验一下
[hadoop@hadoop001 data]$ hdfs dfs -mkdir /user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-07
[hadoop@hadoop001 data]$ hdfs dfs -put order.txt /user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-07
[hadoop@hadoop001 data]$ hdfs dfs -ls /user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-07
Found 1 items
-rw-r--r-- 1 hadoop supergroup 217 2018-06-17 21:11 /user/hive/warehouse/ruozedata.db/order_partition/event_month=2014-07/order.txt
hive> select * from order_partition where event_month='2014-07';
OK(没有数据)
使用Add Partitions命令
hive> ALTER TABLE order_partition ADD IF NOT EXISTS
> PARTITION (event_month='2014-07') ;
hive> select * from order_partition where event_month='2014-07';
c)还有一种方法,如下:
hive> create table order_4_partition(
> ordernumber string,
> eventtime string
> )
> row format delimited fields terminated by '\t';
hive> load data local inpath '/home/hadoop/data/order.txt' overwrite into table order_4_partition;
hive> insert overwrite table order_partition
> partition(event_month='2014-08')
> select * from order_4_partition;
hive> select * from order_partition where event_month='2014-08';
在hive里查看当前有几个分区的方法:
hive> show partitions order_partition;
静态分区是最简单的分区方式(这里是单级分区),分区字段的值与表中数据字段的内容没有关系,而且分区字段不能再出现在表的字段定义中(否则会报 Column repeated in partitioning columns 错误)
d)多级分区
hive> create table order_mulit_partition(
> ordernumber string,
> eventtime string
> )
> partitioned by (event_month string,event_day string)
> row format delimited fields terminated by '\t';
hive> desc formatted order_mulit_partition;
hive> LOAD DATA LOCAL INPATH '/home/hadoop/data/order.txt'
> OVERWRITE INTO TABLE order_mulit_partition
> PARTITION(event_month='2014-05', event_day='01');
hive> select * from order_mulit_partition where event_month='2014-05' and event_day='01';
查看一下在hdfs下的目录结构
2)动态分区
hive> create table ruozedata_static_emp
> (empno int, ename string, job string, mgr int, hiredate string, salary double, comm double,deptno string)
> PARTITIONED by(deptno string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '\t';
FAILED: SemanticException [Error 10035]: Column repeated in partitioning columns(报错,分区字段不能出现在表中)
hive> create table ruozedata_static_emp
> (empno int, ename string, job string, mgr int, hiredate string, salary double, comm double)
> PARTITIONED by(deptno string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '\t';
hive> select * from ruozedata_static_emp;
OK(里面没有数据)
将表 ruozedata_emp中deptno=10的数据写入 ruozedata_static_emp的deptno='10'的分区内
hive> insert into table ruozedata_static_emp partition(deptno='10')
> select empno,ename,job,mgr,hiredate,salary,comm from ruozedata_emp
> where deptno=10;
hive> select * from ruozedata_static_emp;
以上还是静态分区的方法,如果需要分区很多(比如1000个分区),这种方式太耗时间精力
所以需要动态分区
hive> create table ruozedata_dynamic_emp
> (empno int, ename string, job string, mgr int, hiredate string, salary double, comm double)
> PARTITIONED by(deptno string)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '\t' ;
动态分区明确要求:分区字段写在select的最后面
hive> insert into table ruozedata_dynamic_emp partition(deptno)
> select empno,ename,job,mgr,hiredate,salary,comm,deptno from ruozedata_emp ;
(上面的insert在Hive默认的strict模式下会报错,strict模式要求至少指定一个静态分区列,所以需要先把动态分区模式设置为nonstrict)
hive> set hive.exec.dynamic.partition.mode=nonstrict;(只是临时设置为 nonstrict ,重新进入hive又会自动变为strict模式)
【set hive.exec.dynamic.partition.mode=nonstrict;
这是hive中常用的设置key=value的方式
语法格式:
set key=value; 设置
set key; 取值 】
hive> insert into table ruozedata_dynamic_emp partition(deptno)
> select empno,ename,job,mgr,hiredate,salary,comm,deptno from ruozedata_emp ;
hive> show partitions ruozedata_dynamic_emp;