In this tutorial I will go over the steps required to set up a single-node Hadoop cluster backed by the Hadoop Distributed File System (HDFS) on CentOS Linux. The Linux box will be a 64-bit CentOS guest running locally on top of VirtualBox. The goal is to get a simple Hadoop installation up and running so we can play around with it and start learning the basics.
-- install Java (OpenJDK 1.8)
[root@vnode ~]# yum install java-1.8.0-* -y
[root@vnode ~]# java -version
openjdk version "1.8.0_91"
OpenJDK Runtime Environment (build 1.8.0_91-b14)
OpenJDK 64-Bit Server VM (build 25.91-b14, mixed mode)
-- create group
[root@vnode ~]# groupadd hdpgrp
-- create the user and add it to the new group
[root@vnode ~]# adduser -g hdpgrp hdpusr
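A quick sanity check (optional) confirms the account and its group membership, and gives hdpusr a password so we can also log in as that user directly rather than only via su from root:
[root@vnode ~]# id hdpusr
[root@vnode ~]# passwd hdpusr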
[root@vnode home]# yum -y install openssh-server openssh-clients
....
....
-- setup to start on reboot
[root@vnode home]# chkconfig sshd on
-- start sshd
[root@vnode home]# service sshd start
-- validate sshd
[root@vnode home]# netstat -tulpn | grep :22
tcp 0 0 0.0.0.0:22 0.0.0.0:* LISTEN 11341/sshd
tcp 0 0 :::22 :::* LISTEN 11341/sshd
[root@vnode home]# su - hdpusr
[hdpusr@vnode ~]$
[hdpusr@vnode ~]$ ssh-keygen -t rsa
Generating public/private rsa key pair.
Enter file in which to save the key (/home/hdpusr/.ssh/id_rsa):
Created directory '/home/hdpusr/.ssh'.
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /home/hdpusr/.ssh/id_rsa.
Your public key has been saved in /home/hdpusr/.ssh/id_rsa.pub.
The key fingerprint is:
e0:8e:a9:0d:22:5f:78:df:b8:fc:6d:f8:51:c4:19:b9 hdpusr@vnode
The key's randomart image is:
+--[ RSA 2048]----+
| .. |
| ..o |
| . +. |
| . . .E |
| . S . |
| . + . |
|o o = . .. |
|.o * o o.... |
| o . =oooo |
+-----------------+
[hdpusr@vnode ~]$ cd .ssh/
[hdpusr@vnode .ssh]$ ll
total 8
-rw------- 1 hdpusr hdpgrp 1675 Jul 14 20:25 id_rsa
-rw-r--r-- 1 hdpusr hdpgrp 394 Jul 14 20:25 id_rsa.pub
[hdpusr@vnode .ssh]$ cp id_rsa.pub authorized_keys
[hdpusr@vnode .ssh]$ ll
total 12
-rw-r--r-- 1 hdpusr hdpgrp 394 Jul 14 20:26 authorized_keys
-rw------- 1 hdpusr hdpgrp 1675 Jul 14 20:25 id_rsa
-rw-r--r-- 1 hdpusr hdpgrp 394 Jul 14 20:25 id_rsa.pub
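sshd is strict about permissions on the .ssh directory and the authorized_keys file (with the default StrictModes setting it rejects group- or world-writable files), so if the passwordless login below still prompts for a password, tighten them first:
[hdpusr@vnode .ssh]$ chmod 700 ~/.ssh
[hdpusr@vnode .ssh]$ chmod 600 ~/.ssh/authorized_keys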
[hdpusr@vnode .ssh]$ ssh localhost
The authenticity of host 'localhost (::1)' can't be established.
RSA key fingerprint is 9d:1f:2d:d9:97:7e:1e:77:9a:0f:35:69:76:e9:2e:aa.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added 'localhost' (RSA) to the list of known hosts.
[hdpusr@vnode ~]$
[hdpusr@vnode ~]$
[hdpusr@vnode ~]$ exit
logout
Connection to localhost closed.
[hdpusr@vnode .ssh]$ ssh localhost
Last login: Thu Jul 14 20:28:01 2016 from localhost.localdomain
-- disable IPv6 at runtime (Hadoop does not support IPv6)
sysctl -w net.ipv6.conf.default.disable_ipv6=1
sysctl -w net.ipv6.conf.all.disable_ipv6=1
-- persist the setting across reboots
echo "net.ipv6.conf.default.disable_ipv6=1" >> /etc/sysctl.conf
echo "net.ipv6.conf.all.disable_ipv6=1" >> /etc/sysctl.conf
-- download Hadoop 2.7.2 from an Apache mirror
wget http://apache.mirror.digitalpacific.com.au/hadoop/core/hadoop-2.7.2/hadoop-2.7.2.tar.gz
[root@vnode hadoop-2.7.2]# gunzip hadoop-2.7.2.tar.gz
[root@vnode hadoop-2.7.2]# tar -xf hadoop-2.7.2.tar
[root@vnode home]# mkdir /usr/local/hadoop
[root@vnode home]# cd hadoop-2.7.2
[root@vnode hadoop-2.7.2]# mv * /usr/local/hadoop/
[root@vnode hadoop-2.7.2]# chown -R hdpusr:hdpgrp /usr/local/hadoop/
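Before going further it is worth confirming that the files landed in the right place and that the hadoop binary runs:
[root@vnode hadoop-2.7.2]# ls /usr/local/hadoop
[root@vnode hadoop-2.7.2]# /usr/local/hadoop/bin/hadoop version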
[root@vnode hadoop-2.7.2]# which java
/usr/bin/java
[root@vnode hadoop-2.7.2]# ll $(!!)
ll $( which java)
lrwxrwxrwx 1 root root 22 Jul 14 20:06 /usr/bin/java -> /etc/alternatives/java
[root@vnode hadoop-2.7.2]# ll /etc/alternatives/java
lrwxrwxrwx 1 root root 46 Jul 14 20:06 /etc/alternatives/java -> /usr/lib/jvm/jre-1.8.0-openjdk.x86_64/bin/java
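Following the symlink chain by hand works, but readlink can resolve it in one step (the fully resolved path may look slightly longer on your system). Either way, JAVA_HOME should point at the directory that contains bin/java, which here is /usr/lib/jvm/jre-1.8.0-openjdk.x86_64:
[root@vnode hadoop-2.7.2]# readlink -f $(which java)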
-- append the following to /home/hdpusr/.bashrc
# Set Hadoop-related environment variables
export HADOOP_HOME=/usr/local/hadoop
# Set JAVA_HOME (we will also configure JAVA_HOME directly for Hadoop later on)
export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk.x86_64
# Some convenient aliases and functions for running Hadoop-related commands
unalias fs &> /dev/null
alias fs="hadoop fs"
unalias hls &> /dev/null
alias hls="fs -ls"
# If you have LZO compression enabled in your Hadoop cluster and
# compress job outputs with LZOP (not covered in this tutorial):
# Conveniently inspect an LZOP compressed file from the command
# line; run via:
#
# $ lzohead /hdfs/path/to/lzop/compressed/file.lzo
#
# Requires installed 'lzop' command.
#
lzohead () {
    hadoop fs -cat "$1" | lzop -dc | head -1000 | less
}
# Add Hadoop bin/ directory to PATH
export PATH=$PATH:$HADOOP_HOME/bin
[hdpusr@vnode ~]$ source .bashrc
-- verify that JAVA_HOME is set
[hdpusr@vnode ~]$ echo ${JAVA_HOME}
/usr/lib/jvm/jre-1.8.0-openjdk.x86_64
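Since $HADOOP_HOME/bin is now on the PATH, the hadoop command should resolve from anywhere; a quick check:
[hdpusr@vnode ~]$ which hadoop
/usr/local/hadoop/bin/hadoop
[hdpusr@vnode ~]$ hadoop version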
-- hadoop-env.sh lives in the Hadoop configuration directory
/usr/local/hadoop/etc/hadoop/hadoop-env.sh
-- it can also be located with find
find <hadoop unzipped folder> -name hadoop-env.sh
-- change the JAVA_HOME line from this:
# The java implementation to use.
export JAVA_HOME=${JAVA_HOME}
-- to the actual JDK path:
# The java implementation to use.
export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk.x86_64
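A quick grep confirms the edit was saved and that only one export line is active:
[hdpusr@vnode ~]$ grep -n "^export JAVA_HOME" /usr/local/hadoop/etc/hadoop/hadoop-env.sh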
[hdpusr@vnode home]$ cd ~
[hdpusr@vnode ~]$ ll
total 0
[hdpusr@vnode ~]$ mkdir hadoopfs
[hdpusr@vnode ~]$ cd hadoopfs/
[hdpusr@vnode hadoopfs]$ mkdir tmp
[hdpusr@vnode hadoopfs]$ cd tmp/
[hdpusr@vnode tmp]$ pwd
/home/hdpusr/hadoopfs/tmp
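This directory will be referenced by hadoop.tmp.dir below and will hold the NameNode and DataNode data. It is already owned by hdpusr, but tightening the permissions a little is a reasonable (optional) precaution:
[hdpusr@vnode tmp]$ chmod -R 750 /home/hdpusr/hadoopfs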
-- /usr/local/hadoop/etc/hadoop/core-site.xml
<configuration>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/home/hdpusr/hadoopfs/tmp</value>
    <description>Temporary directories.</description>
  </property>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:54310</value>
    <description>The name of the default file system.</description>
  </property>
</configuration>
-- /usr/local/hadoop/etc/hadoop/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>localhost:54311</value>
    <description>The host and port that the MapReduce job tracker runs
      at. If "local", then jobs are run in-process as a single map
      and reduce task.
    </description>
  </property>
</configuration>
-- /usr/local/hadoop/etc/hadoop/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
    <description>Default block replication.
      The actual number of replications can be specified when the file is created.
      The default is used if replication is not specified in create time.
    </description>
  </property>
</configuration>
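With all three files saved under /usr/local/hadoop/etc/hadoop/, hdfs getconf is a handy way to check that Hadoop actually picks the values up (Hadoop 2.x may print a warning that fs.default.name is a deprecated alias of fs.defaultFS, which is harmless here):
[hdpusr@vnode ~]$ hdfs getconf -confKey fs.default.name
[hdpusr@vnode ~]$ hdfs getconf -confKey dfs.replication
These should echo back hdfs://localhost:54310 and 1 respectively.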
-- format the HDFS filesystem via the NameNode (run once, before starting the cluster for the first time)
/usr/local/hadoop/bin/hadoop namenode -format
[hdpusr@vnode hadoop]$ /usr/local/hadoop/bin/hadoop namenode -format
16/07/14 22:14:11 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = localhost.localdomain/127.0.0.1
STARTUP_MSG: args = [-format]
STARTUP_MSG: version = 2.7.2
......
.....
.....
16/07/14 22:14:13 INFO namenode.FSImage: Allocated new BlockPoolId: BP-1366984355-127.0.0.1-1468498452948
16/07/14 22:14:13 INFO common.Storage: Storage directory /home/hdpusr/hadoopfs/tmp/dfs/name has been successfully formatted.
16/07/14 22:14:13 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid = 0
16/07/14 22:14:13 INFO util.ExitUtil: Exiting with status 0
16/07/14 22:14:13 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at localhost.localdomain/127.0.0.1
************************************************************/
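The "successfully formatted" line above means the NameNode metadata was written under our hadoop.tmp.dir; listing it is a good sanity check before starting any daemons (it should contain a VERSION file and an initial fsimage):
[hdpusr@vnode hadoop]$ ls -l /home/hdpusr/hadoopfs/tmp/dfs/name/current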
[hdpusr@vnode sbin]$ /usr/local/hadoop/sbin/start-all.sh
This script is Deprecated. Instead use start-dfs.sh and start-yarn.sh
16/07/14 22:24:35 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Starting namenodes on [localhost]
localhost: starting namenode, logging to /usr/local/hadoop/logs/hadoop-hdpusr-namenode-vnode.out
localhost: starting datanode, logging to /usr/local/hadoop/logs/hadoop-hdpusr-datanode-vnode.out
Starting secondary namenodes [0.0.0.0]
The authenticity of host '0.0.0.0 (0.0.0.0)' can't be established.
RSA key fingerprint is 9d:1f:2d:d9:97:7e:1e:77:9a:0f:35:69:76:e9:2e:aa.
Are you sure you want to continue connecting (yes/no)? yes
0.0.0.0: Warning: Permanently added '0.0.0.0' (RSA) to the list of known hosts.
0.0.0.0: starting secondarynamenode, logging to /usr/local/hadoop/logs/hadoop-hdpusr-secondarynamenode-vnode.out
16/07/14 22:24:55 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
starting yarn daemons
starting resourcemanager, logging to /usr/local/hadoop/logs/yarn-hdpusr-resourcemanager-vnode.out
localhost: starting nodemanager, logging to /usr/local/hadoop/logs/yarn-hdpusr-nodemanager-vnode.out
[hdpusr@vnode sbin]$ jps
13076 Jps
12356 DataNode
12678 ResourceManager
12230 NameNode
12779 NodeManager
12526 SecondaryNameNode
[hdpusr@vnode sbin]$ jps -l
13108 sun.tools.jps.Jps
12356 org.apache.hadoop.hdfs.server.datanode.DataNode
12678 org.apache.hadoop.yarn.server.resourcemanager.ResourceManager
12230 org.apache.hadoop.hdfs.server.namenode.NameNode
12779 org.apache.hadoop.yarn.server.nodemanager.NodeManager
12526 org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode
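Besides jps, the daemons expose web UIs on the standard Hadoop 2.x ports: the NameNode on 50070 and the ResourceManager on 8088. From inside the VM (or from the host, if the ports are forwarded) they should respond, for example via curl if it is installed:
[hdpusr@vnode sbin]$ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:50070
[hdpusr@vnode sbin]$ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8088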
-- if any daemon is missing from the jps output, check its log file under /usr/local/hadoop/logs
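Before shutting everything down, a minimal HDFS smoke test (using the hadoop fs commands aliased earlier as fs) confirms the filesystem is actually usable:
[hdpusr@vnode sbin]$ hadoop fs -mkdir -p /user/hdpusr
[hdpusr@vnode sbin]$ echo "hello hadoop" > /tmp/hello.txt
[hdpusr@vnode sbin]$ hadoop fs -put /tmp/hello.txt /user/hdpusr/
[hdpusr@vnode sbin]$ hadoop fs -cat /user/hdpusr/hello.txt
hello hadoop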
[hdpusr@vnode sbin]$ /usr/local/hadoop/sbin/stop-all.sh
Stopping namenodes on [localhost]
localhost: stopping namenode
localhost: stopping datanode
Stopping secondary namenodes [0.0.0.0]
0.0.0.0: stopping secondarynamenode
16/07/14 22:30:28 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
stopping yarn daemons
stopping resourcemanager
localhost: stopping nodemanager
no proxyserver to stop
[hdpusr@vnode sbin]$ jps
3967 Jps