当前位置：首页 > news >正文

MapReduce基础入门1

news 来源：原创 2024/5/14 17:52:01

Hadoop系列

注：大家觉得博客好的话，别忘了点赞收藏呀，本人每周都会更新关于人工智能和大数据相关的内容，内容多为原创，Python Java Scala SQL 代码，CV NLP 推荐系统等，Spark Flink Kafka Hbase Hive Flume等等~写的都是纯干货，各种顶会的论文解读，一起进步。
今天继续和大家分享一下MapReduce基础入门1
#博学谷IT学习技术支持

文章目录

Hadoop系列
前言
一、Map阶段
二、Reduce阶段
三、Driver运行入口
总结

前言

在这里插入图片描述

1、MapReduce会将一个大的计算任务进行拆分，拆分成小任务，让这些小任务在不同的计算机中进行处理，最后再将这些小任务的结果进行整体汇总

2、MapReduce分为两个阶段：一个是Map阶段，负责任务的拆分；一个是Reduce阶段，负责任务的汇总

3、从执行细节上看，整个MapReduce工作流程可以分为3个阶段：map、shuffle、reduce，其中shuffle位于Map和Reduce之间。

作者这里用一个简单的单词计数案例来作为入门案例

一、Map阶段

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Map stage of the word-count job.
 *
 * <p>Input (k1, v1): the key passed in by the framework (unused here) and one
 * line of text. Output (k2, v2): one {@code (word, 1)} pair per word found in
 * the line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text,Text,LongWritable> {
    /** Constant count emitted for every word occurrence (v2). */
    private static final LongWritable ONE = new LongWritable(1);

    /**
     * Reusable output key (k2). Reusing one instance avoids allocating a new
     * object per token; this is the same pattern the official Hadoop WordCount
     * example uses.
     */
    private final Text word = new Text();

    /**
     * Splits one line into words and emits {@code (word, 1)} for each.
     *
     * @param key     input key supplied by the framework; not used
     * @param value   the line of text to tokenize
     * @param context sink for the emitted (k2, v2) pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Split on runs of any whitespace so tabs and repeated spaces do not
        // produce bogus tokens, e.g. "hadoop  allen" -> ["hadoop", "allen"].
        String[] tokens = value.toString().split("\\s+");

        for (String token : tokens) {
            // A blank line or leading whitespace yields an empty first token;
            // it must not be counted as a word.
            if (token.isEmpty()) {
                continue;
            }
            word.set(token);
            context.write(word, ONE);
        }
    }
}

二、Reduce阶段

这里入门案例先跳过shuffle阶段，直接来写Reduce阶段

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce stage of the word-count job.
 *
 * <p>Receives a word (k2) together with all the counts emitted for it (v2
 * values) and writes a single {@code (word, total)} pair (k3, v3).
 */
public class WordCountReducer extends Reducer<Text, LongWritable,Text,LongWritable> {
    /**
     * Sums every count for {@code key} and writes the total to the context.
     *
     * @param key     the word being aggregated; written unchanged as k3
     * @param values  the counts collected for this word
     * @param context sink for the resulting (k3, v3) pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // v3: accumulate the individual counts into one total.
        long total = 0;
        java.util.Iterator<LongWritable> counts = values.iterator();
        while (counts.hasNext()) {
            total += counts.next().get();
        }

        // k3 is the incoming key itself; emit it with the computed total.
        LongWritable sum = new LongWritable(total);
        context.write(key, sum);
    }
}

三、Driver运行入口

运行MapReduce，需要一个Driver运行入口

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;


import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Entry point that configures and submits the word-count MapReduce job.
 *
 * <p>Usage: {@code WorderCountDriver [inputPath [outputPath]]}. When an
 * argument is omitted the corresponding default HDFS location is used.
 */
public class WorderCountDriver {
    /** Input location used when no first argument is given. */
    private static final String DEFAULT_INPUT = "hdfs://node1:8020/input";

    /** Output location used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "hdfs://node1:8020/output/wordcount";

    /**
     * Builds the job, clears any previous output directory, runs the job and
     * exits with status 0 on success or 1 on failure.
     *
     * @param args optional overrides: {@code args[0]} is the input path,
     *             {@code args[1]} is the output path
     * @throws IOException            if the file system or job setup fails
     * @throws InterruptedException   if interrupted while waiting for the job
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws URISyntaxException     retained in the signature for
     *                                compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
        Path inPath = new Path(args.length > 0 ? args[0] : DEFAULT_INPUT);
        Path outPath = new Path(args.length > 1 ? args[1] : DEFAULT_OUTPUT);

        // 1. Create the job.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "WordCountBase");

        // 2. Configure the job.
        // 2.1 Main class, used to locate the jar that contains the job.
        job.setJarByClass(WorderCountDriver.class);

        // 2.2 Input path.
        FileInputFormat.addInputPath(job, inPath);

        // 2.3 Mapper and its output (k2, v2) types.
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 2.4 Reducer and the final output (k3, v3) types.
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 2.5 Output path.
        FileOutputFormat.setOutputPath(job, outPath);

        // Remove any output left by a previous run before submitting. The file
        // system is resolved from the output path and the same configuration
        // the job was created from, so it always matches the path's scheme and
        // authority.
        // NOTE(review): the FileSystem is deliberately not closed here;
        // Hadoop normally hands out cached, shared instances, and the job
        // submission below may use the same one -- confirm before adding close().
        FileSystem fileSystem = outPath.getFileSystem(configuration);
        if (fileSystem.exists(outPath)) {
            fileSystem.delete(outPath, true);
        }

        // 3. Submit the job and block until it finishes.
        boolean succeeded = job.waitForCompletion(true);

        // Report the outcome through the process exit status.
        System.exit(succeeded ? 0 : 1);
    }
}