mapreduce小demo
1.单词分类
a.txt
hello tom
hello jim
hello kitty
hello rose
b.txt
hello jerry
hello jim
hello kitty
hello jack
c.txt
hello jerry
hello java
hello c++
hello c++
需要得到以下结果:
hello a.txt-->4 b.txt-->4 c.txt-->4
java c.txt-->1
jerry b.txt-->1 c.txt-->1
....
1.1 思路
1 先以“单词-文件名”为键，统计每个单词在每个文件中出现的次数
2 再按单词分组，将同一单词在各个文件中的统计结果合并为一条数据
1.2 代码
得到文件名
从输入切片信息中获取当前正在处理的一行数据所属的文件
InputSplit 的实现类 FileSplit inputSplit = (FileSplit)context.getInputSplit();
1. 第一步
- IndexesMapper
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Step-1 mapper of the inverted-index demo.
 *
 * <p>For every word of the input line it emits {@code ("<word>-<fileName>", 1)}, where
 * {@code fileName} is the name of the file the current input split belongs to.
 */
public class IndexesMapper extends Mapper<LongWritable, Text, Text , IntWritable> {
    /** Count emitted for each occurrence; never mutated, so one instance is enough. */
    private final IntWritable one = new IntWritable(1);
    /** Reused output key, refilled for every token. */
    private final Text outKey = new Text();

    /**
     * Splits the line on single spaces and writes one record per non-empty token.
     *
     * @param key     offset key supplied by the input format (unused)
     * @param value   one line of the input file
     * @param context task context used to look up the input split and to emit records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String fileName = inputSplit.getPath().getName();
        for (String word : value.toString().split(" ")) {
            if (word.isEmpty()) {
                // A blank line or consecutive spaces produce empty tokens; without this guard
                // they would be emitted as bogus keys such as "-a.txt".
                continue;
            }
            outKey.set(word + "-" + fileName);
            context.write(outKey, one);
        }
    }
}
- IndexesReduce
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Step-1 reducer of the inverted-index demo: adds up the counts received for one
 * {@code "<word>-<fileName>"} key and writes {@code (key, total)}.
 */
public class IndexesReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Sums the partial counts of a key.
     *
     * @param key     the {@code "<word>-<fileName>"} key
     * @param values  partial counts for that key
     * @param context task context used to emit the total
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable value : values) {
            // Add the carried count instead of assuming each value is 1, so the result stays
            // correct when values have been pre-aggregated (e.g. by a combiner).
            total += value.get();
        }
        context.write(key, new IntWritable(total));
    }
}
- IndexesSub
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver for step 1 of the inverted-index demo: wires {@code IndexesMapper} and
 * {@code IndexesReduce} into a job and runs it.
 */
public class IndexesSub {
    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\索引数据";
    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\output";

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the defaults
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(IndexesSub.class);
        job.setMapperClass(IndexesMapper.class);
        job.setReducerClass(IndexesReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate the job result so a failed job is not reported as a successful process.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2. 第二步
1 IndexSecMapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Step-2 mapper of the inverted-index demo.
 *
 * <p>Reads the step-1 output and splits each line at the first {@code '-'}: the part before
 * it becomes the key and everything after it becomes the value.
 */
public class IndexSecMapper extends Mapper<LongWritable, Text, Text, Text> {
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    /**
     * Emits {@code (textBeforeFirstDash, textAfterFirstDash)} for one input line.
     *
     * @param key     offset key supplied by the input format (unused)
     * @param value   one line of the step-1 output
     * @param context task context used to emit the record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        int dash = line.indexOf('-');
        if (dash < 0) {
            // Line without a separator (e.g. blank): skip it rather than fail the task with
            // an ArrayIndexOutOfBoundsException.
            return;
        }
        outKey.set(line.substring(0, dash));
        // Keep the whole remainder so a file name that itself contains '-' is not truncated.
        outValue.set(line.substring(dash + 1));
        context.write(outKey, outValue);
    }
}
2 IndexSecReduce
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Step-2 reducer of the inverted-index demo: concatenates all values of a key into a single
 * string, each value prefixed with {@code " >"}, and writes {@code (key, concatenation)}.
 */
public class IndexSecReduce extends Reducer<Text, Text, Text, Text> {
    /**
     * Joins the values of one key.
     *
     * @param key     the grouping key
     * @param values  values collected for that key
     * @param context task context used to emit the joined record
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text entry : values) {
            joined.append(" >").append(entry.toString());
        }
        Text merged = new Text(joined.toString());
        context.write(key, merged);
    }
}
3 IndexSecSub
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver for step 2 of the inverted-index demo: wires {@code IndexSecMapper} and
 * {@code IndexSecReduce} into a job that reads the step-1 output directory.
 */
public class IndexSecSub {
    /** Input directory (the step-1 output) used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\output";
    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\output2";

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the defaults
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(IndexSecSub.class);
        job.setMapperClass(IndexSecMapper.class);
        job.setReducerClass(IndexSecReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate the job result so a failed job is not reported as a successful process.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2.需要求出每一个订单中成交金额最大的三笔
order001,u001,小米6,1999.9,2
order001,u001,雀巢咖啡,99.0,2
order001,u001,安慕希,250.0,2
order001,u001,经典红双喜,200.0,4
order001,u001,防水电脑包,400.0,2
order002,u002,小米手环,199.0,3
order002,u002,榴莲,15.0,10
order002,u002,苹果,4.5,20
order002,u002,肥皂,10.0,40
order003,u001,小米6,1999.9,2
order003,u001,雀巢咖啡,99.0,2
order003,u001,安慕希,250.0,2
order003,u001,经典红双喜,200.0,4
order003,u001,防水电脑包,400.0,2
2.1 代码
2.1.1 OrderMapper
/**
 * Mapper of the "top three per order" demo: parses one comma-separated order line into an
 * {@code OrderBean} and emits it keyed by the first field (the order id).
 */
public class OrderMapper extends Mapper<LongWritable, Text, Text, OrderBean> {
    /**
     * Parses a line of five comma-separated fields and writes {@code (orderId, bean)}.
     *
     * @param key     offset key supplied by the input format (unused)
     * @param value   one input line
     * @param context task context used to emit the record
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        Text orderKey = new Text(fields[0]);
        String orderId = fields[0];
        String userId = fields[1];
        String comName = fields[2];
        float price = Float.parseFloat(fields[3]);
        int quantity = Integer.parseInt(fields[4]);
        OrderBean bean = new OrderBean(orderId, userId, comName, price, quantity);
        context.write(orderKey, bean);
    }
}
2.1.2 OrderReduce
/**
 * Reducer of the "top three per order" demo: sorts the beans of one order with
 * {@code OrderBean.compareTo} and writes at most the first three.
 */
public class OrderReduce extends Reducer<Text, OrderBean, OrderBean, NullWritable> {
    /** Maximum number of records written per order. */
    private static final int TOP_N = 3;

    /**
     * Emits the first {@code TOP_N} beans of the order after sorting.
     *
     * @param key     the order id
     * @param values  all beans of that order
     * @param context task context used to emit the selected beans
     */
    @Override
    protected void reduce(Text key, Iterable<OrderBean> values, Context context) throws IOException, InterruptedException {
        List<OrderBean> orderBeans = new ArrayList<OrderBean>();
        for (OrderBean value : values) {
            // Store a copy of each element rather than the iterated instance itself.
            orderBeans.add(new OrderBean(value));
        }
        Collections.sort(orderBeans);
        // An order may contain fewer than TOP_N lines; bounding by the list size avoids the
        // IndexOutOfBoundsException the fixed "i < 3" loop would throw.
        int limit = Math.min(TOP_N, orderBeans.size());
        for (int i = 0; i < limit; i++) {
            context.write(orderBeans.get(i), NullWritable.get());
        }
    }
}
2.1.3 OrderSub
/**
 * Driver of the "top three per order" demo: wires {@code OrderMapper} and
 * {@code OrderReduce} into a job and runs it.
 */
public class OrderSub {
    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\input";
    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\output";

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the defaults
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(OrderSub.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(OrderBean.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate the job result so a failed job is not reported as a successful process.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
2.1.4 OrderBean
/**
 * One line of an order: order id, user id, commodity name, unit price, quantity and the
 * derived amount {@code subPrice = price * nub}.
 *
 * <p>Natural ordering is by amount, largest first, with ties broken by commodity name
 * ascending.
 */
public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId;
    private String userId;
    private String comName;
    private Float price;
    private Integer nub;
    /** Amount of this line, {@code price * nub}. */
    private Float subPrice;

    /** Returns all six fields separated by tabs. */
    @Override
    public String toString() {
        // The separator between quantity and amount was missing, which fused them into one
        // number in the output (e.g. "2198.0" for quantity 2, amount 198.0).
        return orderId+"\t"+userId+"\t"+comName+"\t"+price+"\t"+nub+"\t"+subPrice;
    }

    /**
     * Orders beans by amount descending, then by commodity name ascending.
     *
     * @param o the bean to compare with
     * @return negative, zero or positive as this bean sorts before, equal to or after {@code o}
     */
    public int compareTo(OrderBean o) {
        // Float.compare returns a proper three-way result; the previous hand-written version
        // returned 0 instead of a negative value when this amount was larger, which violates
        // the Comparable contract and made the sort order unreliable.
        int byAmountDesc = Float.compare(o.getSubPrice(), this.getSubPrice());
        if (byAmountDesc != 0) {
            return byAmountDesc;
        }
        return this.getComName().compareTo(o.getComName());
    }

    /** Serializes the six fields in declaration order. */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.userId);
        dataOutput.writeUTF(this.comName);
        dataOutput.writeFloat(this.price);
        dataOutput.writeInt(this.nub);
        dataOutput.writeFloat(this.subPrice);
    }

    /** Deserializes the six fields in the order {@link #write(DataOutput)} wrote them. */
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.userId = dataInput.readUTF();
        this.comName = dataInput.readUTF();
        this.price = dataInput.readFloat();
        this.nub = dataInput.readInt();
        // write() emits subPrice as a sixth field; consume it here so reading and writing
        // stay symmetric instead of leaving four unread bytes in the stream.
        this.subPrice = dataInput.readFloat();
    }

    /** No-argument constructor; fields are left unset until populated. */
    public OrderBean() {}

    /** Copy constructor: copies every field of {@code ob}. */
    public OrderBean(OrderBean ob){
        this.orderId = ob.getOrderId();
        this.userId = ob.getUserId();
        this.comName = ob.getComName();
        this.price = ob.getPrice();
        this.nub = ob.getNub();
        this.subPrice = ob.getSubPrice();
    }

    /**
     * Creates a bean and derives the amount as {@code price * nub}.
     *
     * @param orderId order id
     * @param userId  user id
     * @param comName commodity name
     * @param price   unit price
     * @param nub     quantity
     */
    public OrderBean(String orderId, String userId, String comName, Float price, Integer nub) {
        this.orderId = orderId;
        this.userId = userId;
        this.comName = comName;
        this.price = price;
        this.nub = nub;
        this.subPrice = price*nub;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getComName() {
        return comName;
    }

    public void setComName(String comName) {
        this.comName = comName;
    }

    public Float getPrice() {
        return price;
    }

    public void setPrice(Float price) {
        this.price = price;
    }

    public Integer getNub() {
        return nub;
    }

    public void setNub(Integer nub) {
        this.nub = nub;
    }

    public Float getSubPrice() {
        return subPrice;
    }

    public void setSubPrice(Float subPrice) {
        this.subPrice = subPrice;
    }
}
2.2 解释
- 排序
- bean 中实现 WritableComparable 接口
/**
 * Orders beans by amount descending, then by commodity name ascending.
 *
 * @param o the bean to compare with
 * @return negative, zero or positive as this bean sorts before, equal to or after {@code o}
 */
public int compareTo(OrderBean o) {
    // Use a real three-way comparison: returning 0 when this amount is larger (as the
    // hand-written branches did) breaks the Comparable contract and the resulting sort.
    int byAmountDesc = Float.compare(o.getSubPrice(), this.getSubPrice());
    if (byAmountDesc != 0) {
        return byAmountDesc;
    }
    return this.getComName().compareTo(o.getComName());
}
- OrderReduce中使用
// Collect a copy of every grouped value into a list, then sort the list using OrderBean.compareTo.
// NOTE(review): copying via new OrderBean(value) presumably guards against the framework reusing one value instance during iteration — confirm against the Hadoop Reducer documentation.
List<OrderBean> orderBeans = new ArrayList<OrderBean>(); for (OrderBean value : values) { orderBeans.add(new OrderBean(value)); } Collections.sort(orderBeans);
2.2 输出
order001 u001 雀巢咖啡 99.0 2198.0
order001 u001 安慕希 250.0 2500.0
order001 u001 小米6 1999.9 23999.8
order002 u002 榴莲 15.0 10150.0
order002 u002 肥皂 10.0 40400.0
order002 u002 苹果 4.5 2090.0
order003 u001 小米6 1999.9 23999.8
order003 u001 雀巢咖啡 99.0 2198.0
order003 u001 安慕希 250.0 2500.0
2.3 优化
自定义分发规则
- 自定义 mapper分发的规则
/**
 * Partitioner for the optimized "top three per order" job: chooses the partition from the
 * hash of the bean's order id, so beans with the same order id get the same partition number.
 */
public class OptimizeOrderPartitione extends Partitioner<OrderBean, NullWritable> {
    /**
     * @param orderBean    the map output key
     * @param nullWritable the map output value (unused)
     * @param i            the number of partitions
     * @return the order-id hash with the sign bit cleared, modulo {@code i}
     */
    @Override
    public int getPartition(OrderBean orderBean, NullWritable nullWritable, int i) {
        String orderId = orderBean.getOrderId();
        // Clearing the sign bit keeps the dividend non-negative.
        int nonNegativeHash = orderId.hashCode() & Integer.MAX_VALUE;
        return nonNegativeHash % i;
    }
}
- 自定义 reduce对比的规则(分组比较器)
/**
 * Grouping comparator for the optimized "top three per order" job: two {@code OrderBean}
 * keys compare as equal when their order ids are equal.
 */
public class OptimizeOrderGroupingComparator extends WritableComparator {
    /** Registers {@code OrderBean} as the key class and asks the base class to create instances. */
    public OptimizeOrderGroupingComparator(){
        super(OrderBean.class,true);
    }

    /**
     * Compares two keys by order id only.
     *
     * @return zero when both keys carry the same order id
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final OrderBean left = (OrderBean) a;
        final OrderBean right = (OrderBean) b;
        String leftOrderId = left.getOrderId();
        String rightOrderId = right.getOrderId();
        // A result of zero means the two keys are treated as the same group.
        return leftOrderId.compareTo(rightOrderId);
    }
}
- bean 中的compareTo
/**
 * Orders beans by order id ascending and, within the same order id, by amount descending.
 *
 * @param o the bean to compare with
 * @return negative, zero or positive as this bean sorts before, equal to or after {@code o}
 */
public int compareTo(OrderBean o) {
    int byOrderId = this.getOrderId().compareTo(o.getOrderId());
    if (byOrderId != 0) {
        return byOrderId;
    }
    // Arguments are swapped on purpose: the larger amount sorts first.
    return Float.compare(o.getSubPrice(), this.getSubPrice());
}
代码
/**
 * Optimized "top three per order" job: the {@code OrderBean} itself is the map output key,
 * and a custom partitioner plus grouping comparator are registered so the reducer only has
 * to emit the first three records of each group.
 */
public class OptimizeOrder {
    /** Input directory used when no command-line argument is given. */
    private static final String DEFAULT_INPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\input";
    /** Output directory used when fewer than two command-line arguments are given. */
    private static final String DEFAULT_OUTPUT = "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\交金额最大的三笔\\output2";

    /** Parses each comma-separated line into an {@code OrderBean} and emits it as the key. */
    public static class OpOrdMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
        NullWritable v = NullWritable.get();

        /**
         * @param key     offset key supplied by the input format (unused)
         * @param value   one input line with five comma-separated fields
         * @param context task context used to emit the record
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(",");
            context.write(new OrderBean(
                    split[0],split[1],split[2],Float.parseFloat(split[3]),Integer.parseInt(split[4])),v);
        }
    }

    /** Writes at most the first three records of every group. */
    public static class OpOrdReduce extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable>{
        /**
         * @param key     the current key of the group
         * @param values  the values of the group
         * @param context task context used to emit records
         */
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            // NOTE(review): this presumably relies on keys arriving sorted by order id and then
            // by descending amount, and on `key` tracking the current value — confirm against
            // OrderBean.compareTo and the Hadoop grouping-comparator documentation.
            int temp = 0;
            for (NullWritable value : values) {
                context.write(key, value);
                if (++temp==3){
                    return;
                }
            }
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the defaults
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(OptimizeOrder.class);
        job.setMapperClass(OpOrdMapper.class);
        job.setReducerClass(OpOrdReduce.class);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Set the partitioner
        job.setPartitionerClass(OptimizeOrderPartitione.class);
        // Set the grouping comparator
        job.setGroupingComparatorClass(OptimizeOrderGroupingComparator.class);
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate the job result so a failed job is not reported as a successful process.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
3.共同好友分析
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
3.1 代码
- 第一步
/**
 * Step 1 of the mutual-friend analysis.
 *
 * <p>Input lines have the form {@code person:friend1,friend2,...}. The mapper inverts each
 * line to {@code (friend, person)}; the reducer then emits, for every friend, each pair of
 * persons that list that friend, as {@code ("p1-p2", friend)}.
 */
public class MutualFriend {
    /** Emits {@code (friend, person)} for every friend on the line. */
    public static class MutualFriendMapper extends Mapper<LongWritable, Text, Text, Text>{
        Text k = new Text();
        Text v = new Text();

        /**
         * @param key     offset key supplied by the input format (unused)
         * @param value   one line of the form {@code person:friend1,friend2,...}
         * @param context task context used to emit records
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split(":");
            if (split.length < 2) {
                // Blank line or a person without a friend list: nothing to emit, and indexing
                // split[1] would throw ArrayIndexOutOfBoundsException.
                return;
            }
            String[] split1 = split[1].split(",");
            v.set(split[0]);
            for (String s : split1) {
                k.set(s);
                context.write(k,v);
            }
        }
    }

    /** Emits every pair of persons that share the friend given as key. */
    public static class MutualFriendReduce extends Reducer<Text,Text,Text,Text>{
        Text k = new Text();
        Text v = new Text();

        /**
         * @param key     the shared friend
         * @param values  the persons that list {@code key} as a friend
         * @param context task context used to emit {@code ("p1-p2", friend)} records
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            List<String> list = new ArrayList<String>();
            for (Text value : values) {
                list.add(value.toString());
            }
            v.set(key);
            for (int i =0 ;i<list.size()-1;i++){
                for (int j = i+1; j<list.size();j++){
                    String first = list.get(i);
                    String second = list.get(j);
                    // Put the pair in a canonical (lexicographic) order. Values arrive in no
                    // particular order, so without this the same two persons can show up as
                    // both "A-G" and "G-A" and end up as separate keys in step 2.
                    if (first.compareTo(second) <= 0) {
                        k.set(first+"-"+second);
                    } else {
                        k.set(second+"-"+first);
                    }
                    context.write(k,v);
                }
            }
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the built-in paths
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\共同好友\\input";
        String output = args.length > 1 ? args[1] : "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\共同好友\\output";

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MutualFriend.class);
        job.setMapperClass(MutualFriendMapper.class);
        job.setReducerClass(MutualFriendReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate the job result so a failed job is not reported as a successful process.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
结果:
I-K A
I-C A
I-B A
I-G A
I-F A
I-H A
I-O A
I-D A
K-C A
K-B A
K-G A
K-F A
K-H A
K-O A
K-D A
C-B A
C-G A
C-F A
C-H A
C-O A
C-D A
B-G A
B-F A
B-H A
B-O A
B-D A
G-F A
G-H A
G-O A
G-D A
F-H A
F-O A
F-D A
H-O A
H-D A
O-D A
A-F B
A-J B
A-E B
F-J B
F-E B
J-E B
A-E C
A-B C
A-H C
A-F C
A-G C
A-K C
E-B C
E-H C
E-F C
E-G C
E-K C
B-H C
B-F C
B-G C
B-K C
H-F C
H-G C
H-K C
F-G C
F-K C
G-K C
G-C D
G-K D
G-A D
G-L D
G-F D
G-E D
G-H D
C-K D
C-A D
C-L D
C-F D
C-E D
C-H D
K-A D
K-L D
K-F D
K-E D
K-H D
A-L D
A-F D
A-E D
A-H D
L-F D
L-E D
L-H D
F-E D
F-H D
E-H D
G-M E
G-L E
G-H E
G-A E
G-F E
G-B E
G-D E
M-L E
M-H E
M-A E
M-F E
M-B E
M-D E
L-H E
L-A E
L-F E
L-B E
L-D E
H-A E
H-F E
H-B E
H-D E
A-F E
A-B E
A-D E
F-B E
F-D E
B-D E
L-M F
L-D F
L-C F
L-G F
L-A F
M-D F
M-C F
M-G F
M-A F
D-C F
D-G F
D-A F
C-G F
C-A F
G-A F
O-C I
D-E L
E-F M
A-H O
A-I O
A-J O
A-F O
H-I O
H-J O
H-F O
I-J O
I-F O
J-F O
- 第二步
/**
 * Step 2 of the mutual-friend analysis.
 *
 * <p>Reads the tab-separated {@code pair<TAB>friend} lines produced by step 1 and writes,
 * per pair, one record {@code ("pair>", friends)} where the friends are each followed by a tab.
 */
public class MutualFriend2 {
    /** Splits each line at tabs and emits {@code (firstField, secondField)}. */
    public static class MutualFriend2Mapper extends Mapper<LongWritable, Text, Text, Text>{
        Text k = new Text();
        Text v = new Text();

        /**
         * @param key     offset key supplied by the input format (unused)
         * @param value   one tab-separated line of the step-1 output
         * @param context task context used to emit the record
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            if (split.length < 2) {
                // Blank or malformed line: skip it instead of failing the task with an
                // ArrayIndexOutOfBoundsException on split[1].
                return;
            }
            k.set(split[0]);
            v.set(split[1]);
            context.write(k,v);
        }
    }

    /** Concatenates all friends of one pair into a single tab-terminated list. */
    public static class MutualFriend2Reduce extends Reducer<Text,Text,Text,Text>{
        Text k = new Text();
        Text v = new Text();

        /**
         * @param key     the pair key
         * @param values  the friends recorded for that pair
         * @param context task context used to emit the merged record
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            k.set(key+">");
            StringBuilder stringBuilder = new StringBuilder();
            for (Text value : values) {
                stringBuilder.append(value.toString()).append("\t");
            }
            v.set(stringBuilder.toString());
            context.write(k, v);
        }
    }

    /**
     * Configures and submits the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath, outputPath]}; missing entries fall back to the built-in paths
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\共同好友\\output";
        String output = args.length > 1 ? args[1] : "E:\\soft\\java\\ideaProject\\hadoop\\file\\倒排索引demo\\共同好友\\output2";

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MutualFriend2.class);
        job.setMapperClass(MutualFriend2Mapper.class);
        job.setReducerClass(MutualFriend2Reduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Propagate the job result so a failed job is not reported as a successful process.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
结果:
A-B> E C
A-D> E
A-E> D C B
A-F> C D B O E
A-G> C
A-H> D O C
A-I> O
A-J> O B
A-K> C
A-L> D
B-D> E A
B-F> C A
B-G> C A
B-H> C A
B-K> C
B-O> A
C-A> D F
C-B> A
C-D> A
C-E> D
C-F> D A
C-G> F A
C-H> A D
C-K> D
C-L> D
C-O> A
D-A> F
D-C> F
D-E> L
D-G> F
E-B> C
E-F> M C
E-G> C
E-H> C D
E-K> C
F-B> E
F-D> E A
F-E> B D
F-G> C
F-H> D A
F-J> B
F-K> C
F-O> A
G-A> E D F
G-B> E
G-C> D
G-D> E A
G-E> D
G-F> D E A
G-H> E A D
G-K> D C
G-L> D E
G-M> E
G-O> A
H-A> E
H-B> E
H-D> A E
H-F> C E O
H-G> C
H-I> O
H-J> O
H-K> C
H-O> A
I-B> A
I-C> A
I-D> A
I-F> A O
I-G> A
I-H> A
I-J> O
I-K> A
I-O> A
J-E> B
J-F> O
K-A> D
K-B> A
K-C> A
K-D> A
K-E> D
K-F> D A
K-G> A
K-H> D A
K-L> D
K-O> A
L-A> F E
L-B> E
L-C> F
L-D> E F
L-E> D
L-F> E D
L-G> F
L-H> E D
L-M> F
M-A> F E
M-B> E
M-C> F
M-D> F E
M-F> E
M-G> F
M-H> E
M-L> E
O-C> I
O-D> A