A Hands-On Guide to Big Data Processing with HDFS on Ubuntu
1. Environment Preparation and Installation
Install Java 8 and confirm the version:

sudo apt update && sudo apt install -y openjdk-8-jdk
java -version

Install OpenSSH and set up passwordless SSH to localhost, which the Hadoop start scripts require:

sudo apt install -y openssh-server
ssh-keygen -t rsa -P ''
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys
ssh localhost

Download and unpack Hadoop 3.3.6:

wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
sudo tar -xzvf hadoop-3.3.6.tar.gz -C /usr/local/ && sudo mv /usr/local/hadoop-3.3.6 /usr/local/hadoop

Set the environment variables (append to ~/.bashrc or /etc/profile):
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
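
The HDFS and YARN start scripts do not always inherit JAVA_HOME from the login shell, so it is common to also set it explicitly in $HADOOP_HOME/etc/hadoop/hadoop-env.sh. A minimal sketch, assuming the OpenJDK 8 path used above:

echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64' | sudo tee -a /usr/local/hadoop/etc/hadoop/hadoop-env.sh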
Reload the shell configuration and confirm Hadoop is on the PATH:

source ~/.bashrc
hadoop version

2. Configuring HDFS and YARN

The four files below all live in $HADOOP_HOME/etc/hadoop/. First, core-site.xml sets the default filesystem URI:
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
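
Once the file is saved, any key can be read back with the getconf tool; a quick sanity check (requires the environment variables from step 1):

hdfs getconf -confKey fs.defaultFS
# expected output: hdfs://localhost:9000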
Next, hdfs-site.xml sets the replication factor (1 is sufficient on a single node) and the local storage directories for the NameNode and DataNode:
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/usr/local/hadoop/hdfs/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/usr/local/hadoop/hdfs/datanode</value>
</property>
</configuration>
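
The same check works for the keys above:

hdfs getconf -confKey dfs.replication
hdfs getconf -confKey dfs.namenode.name.dir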
In mapred-site.xml, tell MapReduce to submit jobs to YARN:
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
Finally, yarn-site.xml enables the shuffle auxiliary service that MapReduce jobs rely on:
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
</configuration>
Create the storage directories, hand them to the account that will run Hadoop, and format the NameNode (do this only once; reformatting destroys HDFS metadata):

sudo mkdir -p /usr/local/hadoop/hdfs/{namenode,datanode}
sudo chown -R hdfs:hdfs /usr/local/hadoop/hdfs
hdfs namenode -format
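
The chown above assumes a dedicated hdfs system user exists. If you instead run the daemons as your own account, own the directories as yourself; a hypothetical single-user variant:

sudo chown -R $USER:$USER /usr/local/hadoop/hdfs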
3. Starting and Verifying

Start HDFS alone with start-dfs.sh, or HDFS and YARN together:

start-dfs.sh && start-yarn.sh

Check that the daemons are up with jps; you should see NameNode, DataNode, and SecondaryNameNode, plus ResourceManager and NodeManager if YARN is running:

jps

If a firewall is active, open the HDFS RPC port and the web UI ports (NameNode UI on 9870, YARN ResourceManager UI on 8088); note that ufw requires a protocol when given multiple ports:

sudo ufw allow 9000,9870,8088/tcp
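
Beyond jps, the cluster's health can be confirmed from the command line; a quick check, assuming the default ports above:

hdfs dfsadmin -report   # one live DataNode should be listed
yarn node -list         # one RUNNING NodeManager should be listed
curl -s http://localhost:9870/ >/dev/null && echo "NameNode UI reachable"
curl -s http://localhost:8088/ >/dev/null && echo "ResourceManager UI reachable"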
4. Basic HDFS Operations and a Sample Job

Create your home and input directories in HDFS, then upload the Hadoop config files as sample data (the input directory must exist before a multi-file put):

hdfs dfs -mkdir -p /user/$USER/input
hdfs dfs -put $HADOOP_HOME/etc/hadoop/*.xml /user/$USER/input
hdfs dfs -ls /user/$USER
hdfs dfs -cat /user/$USER/input/*.xml | head

Run the bundled grep example; the output directory must not exist beforehand:

hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar grep /user/$USER/input /user/$USER/output 'dfs[a-z.]+'

Inspect the results in HDFS, download them locally, and clean up before any re-run:

hdfs dfs -cat /user/$USER/output/*
hdfs dfs -get /user/$USER/output ./output
hdfs dfs -rm -r /user/$USER/output

Copying and moving within HDFS work as you would expect:

hdfs dfs -cp input/my.txt /input
hdfs dfs -mv input/my.txt /input2
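
The same examples jar ships other jobs. A wordcount run over the same input, writing to a hypothetical /user/$USER/wc-out directory, looks like this:

hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar wordcount /user/$USER/input /user/$USER/wc-out
hdfs dfs -cat /user/$USER/wc-out/part-r-00000 | head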
5. Common Issues and Optimization Tips

If the web UIs are unreachable, verify that the daemons are actually running (jps), that the ports are not taken by another process, and that the local or server firewall allows them (e.g., 9870/8088).
For "Permission denied" errors on HDFS paths, run the operation as the owning account, e.g. with sudo -u hdfs.
On a multi-node cluster, set dfs.replication to 3 to improve fault tolerance.
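
Note that raising dfs.replication only affects files written afterwards; existing files keep their old factor. A sketch of adjusting and auditing replication on existing data (the path is illustrative):

hdfs dfs -setrep -w 3 /user/$USER/input      # -w waits until re-replication completes
hdfs fsck /user/$USER/input -files -blocks   # shows per-file replication and block health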