1. Java Installation
Download and install JDK 1.8:
1. Extract the archive:
[root@bogon app]# pwd
/home/hj/app
[root@bogon app]# tar -zxvf jdk-8u261-linux-x64.tar.gz
2. Configure .bash_profile:
export JAVA_HOME=/home/hj/app/jdk1.8.0_261
export PATH=$JAVA_HOME/bin:$PATH
3. Apply the changes:
[root@bogon app]# source ~/.bash_profile
[root@bogon app]# echo $JAVA_HOME
/home/hj/app/jdk1.8.0_261
[root@bogon app]# java -version
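The expected output looks roughly like this (exact build numbers may differ):
java version "1.8.0_261"
Java(TM) SE Runtime Environment (build 1.8.0_261-b12)
Java HotSpot(TM) 64-Bit Server VM (build 25.261-b12, mixed mode)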
Reference:
https://www.cnblogs.com/lykbk/p/sdsdsadsdwe343434343434.html
2. Scala Installation
1. Extract the archive:
tar -zxvf scala-2.11.8.tgz
# or extract into a target directory
tar -zxvf scala-2.11.8.tgz -C /home/hj/app
2. Add environment variables:
[root@bogon app]# vim ~/.bash_profile
export SCALA_HOME=/home/hj/app/scala-2.11.8
export PATH=$SCALA_HOME/bin:$PATH
3. Apply the changes:
[root@bogon app]# source ~/.bash_profile
[root@bogon app]# echo $SCALA_HOME
/home/hj/app/scala-2.11.8
# Entering scala and seeing the following banner indicates a successful installation
[root@bogon app]# scala
Welcome to Scala 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_261).
Type in expressions for evaluation. Or try :help.
scala>
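As a quick sanity check (not part of the original transcript), evaluate an expression in the REPL:
scala> 1 + 1
res0: Int = 2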
3. Hadoop-2.6.0-cdh5.7.0 Installation
3.1 Hadoop Installation
1. Extract the archive:
[root@bogon app]# tar -zxvf hadoop-2.6.0-cdh5.7.0.tar.gz
2. Configure environment variables:
[root@bogon app]# vim ~/.bash_profile
export HADOOP_HOME=/home/hj/app/hadoop-2.6.0-cdh5.7.0
export PATH=$HADOOP_HOME/bin:$PATH
3. Apply the changes:
[root@bogon app]# source ~/.bash_profile
[root@bogon app]# echo $HADOOP_HOME
/home/hj/app/hadoop-2.6.0-cdh5.7.0
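Optionally (not in the original notes), confirm the binary works; the first line of output should name the CDH build:
[root@bogon app]# hadoop version
Hadoop 2.6.0-cdh5.7.0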
3.2 Modify the Configuration Files
3.2.1 hadoop-env.sh
# Add JAVA_HOME
[root@bogon hadoop]# cd /home/hj/app/hadoop-2.6.0-cdh5.7.0/etc/hadoop
[root@bogon hadoop]# vim hadoop-env.sh
# The java implementation to use.
# export JAVA_HOME=${JAVA_HOME}
export JAVA_HOME=/home/hj/app/jdk1.8.0_261
3.2.2 core-site.xml
[root@bogon hadoop]# vim core-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hadoop000:8020</value>
</property>
</configuration>
Note: once Hadoop is running, the web UI can be reached at hadoop000:50070, but before that hadoop000 must be added to the hosts file as a domain mapping, otherwise the hostname cannot be resolved.
[root@bogon ~]# vim /etc/hosts
# append at the end of the file
127.0.0.1 hadoop000
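You can verify the mapping works before starting Hadoop (an optional check, not in the original notes):
[root@bogon ~]# ping -c 1 hadoop000
PING hadoop000 (127.0.0.1) 56(84) bytes of data.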
Reference:
https://blog.csdn.net/lijing742180/article/details/90486077
3.2.3 hdfs-site.xml
Before configuring, create the directory /home/hj/app/tmp/dfs, under which the name and data directories configured below will live:
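For example (a minimal sketch; the NameNode format step will also populate these directories):
[root@bogon app]# mkdir -p /home/hj/app/tmp/dfs/name /home/hj/app/tmp/dfs/data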
[root@bogon hadoop]# vim hdfs-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>/home/hj/app/tmp/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/home/hj/app/tmp/dfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
3.2.4 mapred-site.xml
There is no mapred-site.xml by default; create it by copying mapred-site.xml.template:
[root@bogon hadoop]# cp mapred-site.xml.template mapred-site.xml
[root@bogon hadoop]# vim mapred-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
3.2.5 yarn-site.xml
[root@bogon hadoop]# vim yarn-site.xml
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
3.2.6 Format the NameNode
[root@bogon hadoop-2.6.0-cdh5.7.0]# cd bin
[root@bogon bin]# ./hadoop namenode -format
# A successful run prints output like the following (most of the log omitted)
21/01/10 13:13:11 INFO common.Storage: Storage directory /home/hj/app/tmp/dfs/name has been successfully formatted.
21/01/10 13:13:11 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0
21/01/10 13:13:11 INFO util.ExitUtil: Exiting with status 0
21/01/10 13:13:11 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at bogon/192.168.131.131
************************************************************/
A successful format writes its metadata under /home/hj/app/tmp/dfs, creating two directories, name and data:
[root@bogon tmp]# ls dfs/
name data
3.2.7 Start HDFS
[root@bogon sbin]# cd /home/hj/app/hadoop-2.6.0-cdh5.7.0/sbin/
[root@bogon sbin]# ls
distribute-exclude.sh httpfs.sh slaves.sh start-dfs.sh stop-all.sh stop-yarn.cmd
hadoop-daemon.sh kms.sh start-all.cmd start-secure-dns.sh stop-balancer.sh stop-yarn.sh
hadoop-daemons.sh Linux start-all.sh start-yarn.cmd stop-dfs.cmd yarn-daemon.sh
hdfs-config.cmd mr-jobhistory-daemon.sh start-balancer.sh start-yarn.sh stop-dfs.sh yarn-daemons.sh
hdfs-config.sh refresh-namenodes.sh start-dfs.cmd stop-all.cmd stop-secure-dns.sh
[root@bogon sbin]# ./start-dfs.sh
3.2.8 Verify the Startup
[root@bogon sbin]# jps
8881 SecondaryNameNode
7379 DataNode
8664 NameNode
9261 Jps
1054 QuorumPeerMain
From this machine you can visit hadoop000:50070; from other machines, use http://192.168.131.131:50070.
Test HDFS:
# First check whether the HDFS root directory contains any files
[root@bogon app]# hadoop fs -ls /
21/01/10 15:13:38 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
# Create a directory named test
[root@bogon app]# hadoop fs -mkdir /test
21/01/10 15:17:47 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
# Create an arbitrary local file and write some content into it
[root@bogon app]# vim dfs.txt
# Upload it into the test directory
[root@bogon app]# hadoop fs -put dfs.txt /test/
21/01/10 15:19:20 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
# List again
[root@bogon app]# hadoop fs -ls /
21/01/10 15:19:40 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 1 items
drwxr-xr-x - root supergroup 0 2021-01-10 15:19 /test
# Read the file back from HDFS
[root@bogon app]# hadoop fs -text /test/dfs.txt
21/01/10 15:20:26 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
The uploaded file can also be browsed in the HDFS web UI under Utilities / Browse the file system.
3.2.9 Start YARN
[root@bogon sbin]# ./start-yarn.sh
starting yarn daemons
starting resourcemanager, logging to /home/hj/app/hadoop-2.6.0-cdh5.7.0/logs/yarn-root-resourcemanager-bogon.out
localhost: /usr/bin/python: No module named virtualenvwrapper
localhost: virtualenvwrapper.sh: There was a problem running the initialization hooks.
localhost:
localhost: If Python could not import the module virtualenvwrapper.hook_loader,
localhost: check that virtualenvwrapper has been installed for
localhost: VIRTUALENVWRAPPER_PYTHON=/usr/bin/python and that PATH is
localhost: set properly.
localhost: starting nodemanager, logging to /home/hj/app/hadoop-2.6.0-cdh5.7.0/logs/yarn-root-nodemanager-bogon.out
[root@bogon sbin]# jps
12945 DataNode
13089 SecondaryNameNode
14065 Jps
13924 ResourceManager
12840 NameNode
14031 NodeManager
Problems encountered
1. After running start-yarn.sh, the NodeManager and ResourceManager processes kept dying; the logs showed that the hostname had not been configured:
[root@bogon logs]# tailf yarn-root-resourcemanager-bogon.log
Caused by: java.net.UnknownHostException: bogon: Name or service not known
at java.net.Inet6AddressImpl.lookupAllHostAddr(Native Method)
at java.net.InetAddress$2.lookupAllHostAddr(InetAddress.java:929)
at java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1324)
at java.net.InetAddress.getLocalHost(InetAddress.java:1501)
... 14 more
2021-01-10 16:14:23,107 INFO org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down ResourceManager at java.net.UnknownHostException: bogon: bogon: Name or service not known
************************************************************/
2021-01-10 16:29:45,644 INFO org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: STARTUP_MSG:
/************************************************************
Solution
# Check the current hostname
[root@bogon logs]# hostname
bogon
# Edit /etc/sysconfig/network
[root@bogon logs]# vim /etc/sysconfig/network
# Created by anaconda
NETWORKING=yes
HOSTNAME=bogon
# Edit /etc/hosts and add bogon to it
[root@bogon logs]# cat /etc/hosts
127.0.0.1 localhost bogon localhost.localdomain localhost4 localhost4.localdomain4
::1 localhost bogon localhost.localdomain localhost6 localhost6.localdomain6
127.0.0.1 hadoop000
# Finally, run /etc/rc.d/init.d/network restart
# Reference: https://blog.csdn.net/qq_34721795/article/details/80506010
2. Port 8088 was already in use, so change the ResourceManager web port in yarn-site.xml (a command to check the port follows the config):
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>192.168.131.131:9099</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
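To confirm which process occupies port 8088 before changing it (an optional check, assuming net-tools is installed):
[root@bogon ~]# netstat -tlnp | grep 8088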
The web UI is then reachable at http://192.168.131.131:9099/cluster.
3.2.10 Passwordless HDFS Startup
Every run of ./start-dfs.sh prompts for a password; set up passwordless SSH login instead:
ssh-keygen -t rsa # generate a public/private key pair
# Copy the public key to the machine you want to log into without a password
ssh-copy-id hadoop000
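Afterwards, you can confirm that logging in no longer prompts for a password (an optional check):
ssh hadoop000 hostname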
3.2.11 References
Hadoop-2.6.0-cdh5.7.0 installation walkthrough: https://blog.csdn.net/weixin_39216383/article/details/78953236
Why the NameNode fails to start: https://blog.csdn.net/qq_43733123/article/details/103749663
Hadoop-2.6.0-cdh5.7.0 installation: https://blog.csdn.net/weixin_43999780/article/details/95245424
3.2.12 Common Commands
# Start the HDFS cluster
./start-dfs.sh
# Stop the HDFS cluster
./stop-dfs.sh
# Start the YARN cluster
./start-yarn.sh
# Stop the YARN cluster
./stop-yarn.sh
Reference:
https://www.cnblogs.com/jun1019/p/6266615.html
4. Maven Installation
# Download
[root@bogon app]# wget https://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
[root@bogon app]# tar -zxvf apache-maven-3.3.9-bin.tar.gz
[root@bogon app]# vim ~/.bash_profile
export MAVEN_HOME=/home/hj/app/apache-maven-3.3.9
export PATH=$MAVEN_HOME/bin:$PATH
[root@bogon app]# source ~/.bash_profile
# Check the version
[root@bogon app]# mvn -v
Apache Maven 3.3.9 (bb52d8502b132ec0a5a3f4c09453c07478323dc5; 2015-11-11T00:41:47+08:00)
Maven home: /home/hj/app/apache-maven-3.3.9
Java version: 1.8.0_261, vendor: Oracle Corporation
Java home: /home/hj/app/jdk1.8.0_261/jre
Default locale: zh_CN, platform encoding: UTF-8
OS name: "linux", version: "3.10.0-1127.10.1.el7.x86_64", arch: "amd64", family: "unix"
Configure settings.xml and create a maven_repository directory (a sketch of the mkdir follows the excerpt):
[root@bogon apache-maven-3.3.9]# cd conf/
[root@bogon conf]# vim settings.xml
<!-- localRepository
| The path to the local repository maven will use to store artifacts.
|
| Default: ${user.home}/.m2/repository
<localRepository>/path/to/local/repo</localRepository>
-->
<!-- set the local repository path -->
<localRepository>/home/hj/app/maven_repository</localRepository>
<!-- interactiveMode
| This will determine whether maven prompts you when it needs input. If set to false,
| maven will use a sensible default value, perhaps based on some other setting, for
| the parameter in question.
|
| Default: true
<interactiveMode>true</interactiveMode>
-->
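Then create the local repository directory referenced above (a minimal sketch):
[root@bogon conf]# mkdir -p /home/hj/app/maven_repository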
5. Building Spark with Maven
5.1 Configure a Maven Mirror
To speed up the build, point Maven at a mirror inside China by adding the following to the <mirrors> section of /home/hj/app/apache-maven-3.3.9/conf/settings.xml:
<mirror>
<id>nexus-aliyun</id>
<mirrorOf>central</mirrorOf>
<name>Nexus aliyun</name>
<url>http://maven.aliyun.com/nexus/content/groups/public</url>
</mirror>
5.2 Download the Spark Source
Download the source from https://github.com/apache/spark/ (choose the .tgz archive) and upload it to the server:
scp spark-2.4.7.tgz root@192.168.131.131:/home/hj/app/
5.3 Modify make-distribution.sh
tar -zxvf spark-2.4.7.tgz # extract into the current directory
# Edit make-distribution.sh
vim /home/hj/app/spark-2.4.7/dev/make-distribution.sh
# point MVN at the system Maven instead of the bundled build/mvn
MVN="$MAVEN_HOME/bin/mvn"
5.4 Point the Maven Repository at the Aliyun Mirror
Edit the pom.xml in the Spark source tree and replace the Maven repository with the Aliyun mirror:
<name>Maven Repository</name>
<!--<url>https://repo1.maven.org/maven2</url>-->
<url>https://maven.aliyun.com/nexus/content/groups/public/</url>
<releases>
<enabled>true</enabled>
</releases>
5.5 Build
Configure the following before building:
# Give Maven extra memory to avoid running out during the build
export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g"
# plain Maven build; the make-distribution.sh step below also packages a distributable tarball
mvn -DskipTests clean package
Build the distribution:
cd /home/hj/app/spark-2.4.7
./dev/make-distribution.sh --name 2.6.0-cdh5.7.0 --tgz -Pyarn -Phadoop-2.6 -Phive -Phive-thriftserver -Dhadoop.version=2.6.0-cdh5.7.0
After a long wait, the build finishes and produces an archive named spark-2.2.0-bin-2.6.0-cdh5.7.0.tgz; extract it and the directory structure looks like this:
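The transcript below assumes the archive has already been extracted in place, e.g.:
[root@bogon app]# tar -zxvf spark-2.2.0-bin-2.6.0-cdh5.7.0.tgz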
[root@bogon app]# cd spark-2.2.0-bin-2.6.0-cdh5.7.0
[root@bogon spark-2.2.0-bin-2.6.0-cdh5.7.0]# pwd
/home/hj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0
[root@bogon spark-2.2.0-bin-2.6.0-cdh5.7.0]# cd bin/
[root@bogon bin]# ls
beeline load-spark-env.cmd pyspark2.cmd spark-class sparkR2.cmd spark-shell.cmd spark-submit.cmd
beeline.cmd load-spark-env.sh pyspark.cmd spark-class2.cmd sparkR.cmd spark-sql
derby.log metastore_db run-example spark-class.cmd spark-shell spark-submit
find-spark-home pyspark run-example.cmd sparkR spark-shell2.cmd spark-submit2.cmd
# Launch the Spark shell
[root@bogon bin]# ./spark-shell
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/01/17 02:52:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/01/17 02:52:36 WARN Utils: Your hostname, bogon resolves to a loopback address: 127.0.0.1; using 192.168.131.131 instead (on interface ens33)
21/01/17 02:52:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/01/17 02:53:20 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
21/01/17 02:53:21 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
21/01/17 02:53:25 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
Spark context Web UI available at http://192.168.131.131:4040
Spark context available as 'sc' (master = local[*], app id = local-1610823160220).
Spark session available as 'spark'.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.2.0
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_261)
Type in expressions to have them evaluated.
Type :help for more information.
scala> sc
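As a quick smoke test (not from the original session; the result index may differ), run a small job in the shell:
scala> sc.parallelize(1 to 100).sum()
res0: Double = 5050.0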
The web UI can be viewed at http://192.168.131.131:4040/jobs/.
If you see the information above, Spark compiled successfully, and the whole pipeline is now set up.
References
- Building the Spark source with Maven: https://blog.csdn.net/lc_1123/article/details/78628989
- Official docs: http://spark.apache.org/docs/latest/building-spark.html#apache-maven
- Installing Spark from source: https://www.cnblogs.com/heml/archive/2004/01/13/6124819.html