1. Environment preparation: install Java and SSH
Hadoop requires a Java runtime, so first install OpenJDK on Ubuntu (version 11 or 8 is recommended):
sudo apt update
sudo apt install openjdk-11-jdk
Verify the installation with java -version (it should print the Java version).
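It also helps to note where the JDK is installed, since the path is needed later for JAVA_HOME. A quick way to find it is to resolve the java binary; the path shown in the comment assumes the OpenJDK 11 apt package on x86-64 Ubuntu and may differ on your system:
readlink -f "$(which java)"   # e.g. /usr/lib/jvm/java-11-openjdk-amd64/bin/java (JAVA_HOME is the directory above bin/)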
To avoid typing passwords repeatedly when starting the Hadoop cluster later, set up passwordless SSH login:
sudo apt install openssh-server
ssh-keygen -t rsa                                   # generate a key pair (press Enter to accept the default path)
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys     # append the public key to the authorized keys file
chmod 600 ~/.ssh/authorized_keys                    # restrict permissions on the file
Verify SSH login with ssh localhost (it should log you in without asking for a password).
2. Download and install Hadoop
Download a stable Hadoop release (e.g. 3.3.6) from the Apache website and extract it to /usr/local:
wget https://downloads.apache.org/hadoop/core/hadoop-3.3.6/hadoop-3.3.6.tar.gz
sudo tar -xzf hadoop-3.3.6.tar.gz -C /usr/local/
sudo mv /usr/local/hadoop-3.3.6 /usr/local/hadoop   # rename the directory to hadoop for convenience
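If Hadoop will be run as your regular (non-root) user, it is usually convenient to hand the install directory over to that user so logs and temporary files can be written without sudo; a minimal sketch assuming the current shell user:
sudo chown -R "$USER":"$USER" /usr/local/hadoop   # let the current user own the Hadoop tree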
3. Configure Hadoop environment variables
Edit ~/.bashrc and add the Hadoop environment variables:
echo "export HADOOP_HOME=/usr/local/hadoop" >> ~/.bashrc
echo "export PATH=\$PATH:\$HADOOP_HOME/bin:\$HADOOP_HOME/sbin" >> ~/.bashrc
source ~/.bashrc   # apply the changes to the current shell
Verify the environment with hadoop version (it should print the Hadoop version).
4. Configure the core Hadoop files
Go to the Hadoop configuration directory /usr/local/hadoop/etc/hadoop and edit the following four files:
core-site.xml:
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value> <!-- HDFS NameNode address -->
    </property>
</configuration>
hdfs-site.xml:
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value> <!-- number of data replicas; increase in a multi-node cluster -->
    </property>
</configuration>
mapred-site.xml:
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value> <!-- run MapReduce on YARN -->
    </property>
</configuration>
yarn-site.xml:
<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value> <!-- enable the MapReduce shuffle service -->
    </property>
</configuration>
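Daemons launched through the start scripts often do not pick up JAVA_HOME from ~/.bashrc, so it is commonly set explicitly in etc/hadoop/hadoop-env.sh as well. The JDK path below assumes the OpenJDK 11 apt package on x86-64 Ubuntu (see the readlink check in step 1); adjust it to your actual install location:
echo 'export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64' | sudo tee -a /usr/local/hadoop/etc/hadoop/hadoop-env.sh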
5. Format HDFS and start the cluster
Before using HDFS for the first time, format the NameNode (note: formatting wipes existing data, so run it only on first use):
hdfs namenode -format
Start HDFS (the distributed file system) and YARN (the resource management framework):
start-dfs.sh    # start HDFS
start-yarn.sh   # start YARN
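If both scripts start cleanly, the running daemons can be listed with jps (shipped with the JDK); the process names below are what a typical single-node setup shows, and may vary slightly between versions:
jps
# Typical output (PIDs will differ):
# NameNode
# DataNode
# SecondaryNameNode
# ResourceManager
# NodeManager
# Jps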
Verify the cluster status in the web UIs:
http://localhost:9870 (HDFS storage status); http://localhost:8088 (YARN resource allocation).
6. Big data analysis in practice: the WordCount example
Hadoop's core strength is distributed data processing. The classic "WordCount" job (counting how often each word appears) is used below to walk through the analysis workflow.
Create a local file sample.txt with the following content (one way to create it is shown after the listing):
Hello World
Welcome to Hadoop world
Hadoop is a powerful framework
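A here-document is one quick way to create the file; any text editor works just as well:
cat > sample.txt << 'EOF'
Hello World
Welcome to Hadoop world
Hadoop is a powerful framework
EOF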
hadoop fs -mkdir -p /input    # create the HDFS input directory
hadoop fs -put sample.txt /input    # upload the file to HDFS
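To confirm the upload landed where expected:
hadoop fs -ls /input   # should list sample.txt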
Write WordCount.java (core logic: the Mapper splits each line into words, the Reducer sums the counts per word):
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
    // Mapper: split each line of text into words and emit <word, 1>
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken().toLowerCase()); // lowercase so counting is case-insensitive
                context.write(word, one);
            }
        }
    }

    // Reducer: sum the counts of each word and emit <word, total>
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // Main: configure and submit the job
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class); // combine mapper output locally to reduce shuffle traffic
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);          // output key type (the word)
        job.setOutputValueClass(IntWritable.class); // output value type (the count)
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path (on HDFS)
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path (on HDFS)
        System.exit(job.waitForCompletion(true) ? 0 : 1);       // wait for the job to finish
    }
}
Compile the program (the classpath points at the Hadoop jars under $HADOOP_HOME/share):
javac -classpath "$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/mapreduce/*" -d . WordCount.java
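If the wildcard classpath above does not resolve all dependencies on your installation, the hadoop classpath subcommand prints the full runtime classpath and can be substituted:
javac -classpath "$(hadoop classpath)" -d . WordCount.java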
jar -cvf wordcount.jar -C . .   # package the compiled classes into a jar
hadoop jar wordcount.jar WordCount /input /output   # run the job: read from /input, write to /output (both on HDFS)
hadoop fs -cat /output/part-r-00000   # view the result (each word with its count)
Sample output:
a 1
framework 1
hadoop 2
hello 1
is 1
powerful 1
to 1
welcome 1
world 2
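MapReduce refuses to write into an output directory that already exists, so remove /output before re-running the job:
hadoop fs -rm -r /output   # delete the previous output before a re-run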
Notes