0. Requirements
- Do a full extraction of the source MySQL table into Hive as the ODS layer; no history of changes is kept.
create table T_YYBZB_TGH_BANKINFO
(
id int(8),
bank_id int(8),
bank_name varchar(200)
);
insert into T_YYBZB_TGH_BANKINFO (ID, BANK_ID, BANK_NAME)values (11, 11, '工商银行(广州)');
1. Create the target table
create table ods.ods_t_yybzb_tgh_bankinfo_di
(
id int,
bank_id int,
bank_name string
)
row format delimited fields terminated by '\t'  -- match the hdfswriter settings below: tab-delimited text files
stored as textfile;
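A quick way to confirm that the table's storage location matches the path used by the DataX hdfswriter later on (a minimal check, assuming the hive CLI is available on the node):
hive -e "desc formatted ods.ods_t_yybzb_tgh_bankinfo_di;" | grep -i location
# expect a Location such as hdfs://.../user/hive/warehouse/ods.db/ods_t_yybzb_tgh_bankinfo_di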
2. Write the supporting scripts
2.1 Write the database configuration file db.conf
mysql_username=root
mysql_password=123456
mysql_ip=192.168.6.102
mysql_port=3306
mysql_sid=source
hadoop102_ip=192.168.6.102
hadoop102_port=8020
2.2 Write the DataX migration JSON file
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": [
"jdbc:mysql://$ip:$port/$sid"],
"querySql": ["select id,bank_id,bank_name from T_YYBZB_TGH_BANKINFO"],
}
],
"password": "$password",
"username": "$username"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://$hdfs_ip:$hdfs_port",
"fileType": "text",
"path": "/user/hive/warehouse/ods.db/ods_t_yybzb_tgh_bankinfo_di",
"fileName": "ods_t_yybzb_tgh_bankinfo_di",
"column": [
{"name":"id","type":"int"},
{"name":"bank_id","type":"int"},
{"name":"bank_name","type":"string"}
],
"writeMode": "append",
"fieldDelimiter": "\t",
"encoding": "utf-8"
}
}
}],
"setting": {
"speed": {
"channel": "1"
}
}
}
}
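Before handing this JSON to any scheduler, it is worth a plain syntax check, since trailing commas and stray quotes are the most common DataX configuration mistakes (a minimal sketch, assuming the file is saved as json_conf/bank_name.json and Python is installed on the node):
python -m json.tool json_conf/bank_name.json > /dev/null && echo "bank_name.json is syntactically valid"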
2.3 Write the shell wrapper script that passes the parameters
#!/bin/bash
# Path to the configuration file
config_file=../db_conf/db.conf
# Source system name
src_system=mysql
# Target system name
tag_system=hadoop102
export in_username=`grep -w ${src_system}_username ${config_file} |awk -F '=' '{print $2}'`
export in_password=`grep -w ${src_system}_password ${config_file} |awk -F '=' '{print $2}'`
export in_ip=`grep -w ${src_system}_ip ${config_file} |awk -F '=' '{print $2}'`
export in_port=`grep -w ${src_system}_port ${config_file} |awk -F '=' '{print $2}'`
export in_sid=`grep -w ${src_system}_sid ${config_file} |awk -F '=' '{print $2}'`
export in_hdfs_ip=`grep -w ${tag_system}_ip ${config_file} |awk -F '=' '{print $2}'`
export in_hdfs_port=`grep -w ${tag_system}_port ${config_file} |awk -F '=' '{print $2}'`
pre_day=`date -d -1day +%Y%m%d`
pre_day_mon=`date -d -1day +%Y%m`
echo ${in_username}
echo ${in_password}
echo ${in_ip}
echo ${in_port}
echo ${in_sid}
echo ${in_hdfs_ip}
echo ${in_hdfs_port}
echo ${pre_day}
echo ${pre_day_mon}
# Full load: clear the target table first
hive -e "truncate table ods.ods_t_yybzb_tgh_bankinfo_di;"
# Run the DataX job defined in bank_name.json
python ../../datax.py -p"-Dusername=$in_username -Dpassword=$in_password -Dip=$in_ip -Dport=$in_port -Dsid=$in_sid -Dhdfs_ip=$in_hdfs_ip -Dhdfs_port=$in_hdfs_port" ../json_conf/bank_name.json
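After the script finishes, the load can be sanity-checked from the Hive side (a minimal check; the table name follows the DDL above):
hive -e "select count(*) from ods.ods_t_yybzb_tgh_bankinfo_di;"
hive -e "select * from ods.ods_t_yybzb_tgh_bankinfo_di limit 10;"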
3. DolphinScheduler configuration
3.1 Resource Center setup
(1) Create the main folder datax_text
(2) Create three subfolders:
sh_start
json_conf
db_conf
(3) Upload the scripts to their corresponding folders:
① start.sh goes to sh_start
② bank_name.json goes to json_conf
③ db.conf goes to db_conf
3.2 Project creation
- Split projects by business subject area
3.3 Workflow creation
3.3.1 Scheduling DataX through a shell script (this approach did not pan out)
(1) Create the workflow
- Workflow Definition -> Create Workflow
(2) Configure the SQL + shell task nodes
- Configure the data source
- Configure the SQL node (truncate the table)
truncate table ods.ods_t_yybzb_tgh_bankinfo_di
- Configure the shell node (load the data)
(3) Save the shell node as a workflow
(4) Schedule and run
- Step 1: bring the workflow online
- Step 2: run it manually
(5) Check the run status
- Open the workflow canvas to check the status of each subtask (highly recommended)
- Check the status from the workflow instance list
- Check the status from the Gantt chart
3.3.2 Scheduling with the DataX task node
(1) Write the JSON script
{
"job": {
"content": [
{
"reader": {
"name": "mysqlreader",
"parameter": {
"connection": [
{
"jdbcUrl": [
"jdbc:mysql://${ip}:${port}/${sid}?useSSL=false"],
"querySql": ["select id,bank_id,bank_name from T_YYBZB_TGH_BANKINFO"],
}
],
"password": "${password}",
"username": "${username}"
}
},
"writer": {
"name": "hdfswriter",
"parameter": {
"defaultFS": "hdfs://${hdfs_ip}:${hdfs_port}",
"fileType": "text",
"path": "/user/hive/warehouse/ods.db/ods_t_yybzb_tgh_bankinfo_di",
"fileName": "ods_t_yybzb_tgh_bankinfo_di",
"column": [
{"name":"id","type":"int"},
{"name":"bank_id","type":"int"},
{"name":"bank_name","type":"string"}
],
"writeMode": "append",
"fieldDelimiter": "\t",
"encoding": "utf-8"
}
}
}],
"setting": {
"speed": {
"channel": "1"
}
}
}
}
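Before wiring this JSON into a DolphinScheduler DataX node, it can be tested directly on a worker with datax.py, passing the same placeholder values through -p (a hedged sketch; the paths and values simply reuse the examples above and should be adapted to your environment):
python /opt/module/datax/bin/datax.py -p "-Dusername=root -Dpassword=123456 -Dip=192.168.6.102 -Dport=3306 -Dsid=source -Dhdfs_ip=192.168.6.102 -Dhdfs_port=8020" bank_name.json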
(2) Node setup
- A SQL node truncates the table:
truncate table ods.ods_t_yybzb_tgh_bankinfo_di
- A DataX node does the full load
(3) Pass in the parameters (ip, port, sid, username, password, hdfs_ip and hdfs_port, e.g. as custom parameters on the DataX node so that they replace the ${...} placeholders)
(4) Bring it online and run
- Step 1: bring the workflow online
- Step 2: run it manually
(5) Check the run results
- SQL node log
[INFO] 2021-12-07 16:04:20.697 - [taskAppId=TASK-10-23-85]:[115] - create dir success /tmp/dolphinscheduler/exec/process/4/10/23/85
[INFO] 2021-12-07 16:04:20.750 - [taskAppId=TASK-10-23-85]:[112] - sql task params {"postStatements":[],"connParams":"","receiversCc":"","udfs":"","type":"HIVE","title":"","sql":"truncate table ods.ods_t_yybzb_tgh_bankinfo_di","preStatements":[],"sqlType":"1","sendEmail":false,"receivers":"","datasource":7,"displayRows":10,"limit":10000,"showType":"TABLE","localParams":[]}
[INFO] 2021-12-07 16:04:20.755 - [taskAppId=TASK-10-23-85]:[128] - Full sql parameters: SqlParameters{type='HIVE', datasource=7, sql='truncate table ods.ods_t_yybzb_tgh_bankinfo_di', sqlType=1, sendEmail=false, displayRows=10, limit=10000, udfs='', showType='TABLE', connParams='', title='', receivers='', receiversCc='', preStatements=[], postStatements=[]}
[INFO] 2021-12-07 16:04:20.755 - [taskAppId=TASK-10-23-85]:[129] - sql type : HIVE, datasource : 7, sql : truncate table ods.ods_t_yybzb_tgh_bankinfo_di , localParams : [],udfs : ,showType : TABLE,connParams : , query max result limit : 10000
[INFO] 2021-12-07 16:04:20.765 - [taskAppId=TASK-10-23-85]:[549] - after replace sql , preparing : truncate table ods.ods_t_yybzb_tgh_bankinfo_di
[INFO] 2021-12-07 16:04:20.765 - [taskAppId=TASK-10-23-85]:[558] - Sql Params are replaced sql , parameters:
[INFO] 2021-12-07 16:04:20.767 - [taskAppId=TASK-10-23-85]:[52] - can't find udf function resource
[INFO] 2021-12-07 16:04:20.974 - [taskAppId=TASK-10-23-85]:[458] - prepare statement replace sql : [email protected]
- DataX node log
2021-12-07 16:04:36.123 [job-0] INFO JobContainer - PerfTrace not enable!
2021-12-07 16:04:36.123 [job-0] INFO StandAloneJobContainerCommunicator - Total 15 records, 134 bytes | Speed 13B/s, 1 records/s | Error 0 records, 0 bytes | All Task WaitWriterTime 0.000s | All Task WaitReaderTime 0.000s | Percentage 100.00%
2021-12-07 16:04:36.124 [job-0] INFO JobContainer -
Job start time            : 2021-12-07 16:04:25
Job end time              : 2021-12-07 16:04:36
Total elapsed time        : 11s
Average traffic           : 13B/s
Record write speed        : 1rec/s
Total records read        : 15
Total read/write failures : 0
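The files written by the job can also be checked directly on HDFS; the path below is the one configured in the hdfswriter:
hdfs dfs -ls /user/hive/warehouse/ods.db/ods_t_yybzb_tgh_bankinfo_di
hdfs dfs -cat /user/hive/warehouse/ods.db/ods_t_yybzb_tgh_bankinfo_di/* | head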
3.4 Workflow debugging
(1) Check the run logs
- Task Instance -> Operation -> View Log
- The detailed log is as above
(2) Take the workflow offline
- Workflow Definition -> Operation -> Offline
(3) Adjust the task
- Edit the node and correct the datax.py launch directory
- Adjust it to your own setup; in my case the DataX launch directory is /opt/module/datax/bin
(4) Bring the workflow back online and run it again
5. Common problems
5.1 No master node available
(1) Symptom
- Running the workflow during debugging fails with "no master node available"
(2) Cause
- The MasterServer service has gone down
(3) Restart the services
./stop-all.sh
./start-all.sh
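After the restart, confirm that the master and worker processes are actually up (a quick check; the process names below are what jps reports for DolphinScheduler 1.3.x):
jps | grep -E "MasterServer|WorkerServer"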
5.2 Hive data source configuration
(1) Hive multi-user setup
Hive does not manage its own users; it relies on the Linux OS users.
(2) Start the HiveServer2 service
- Step 1: run the bin/hiveserver2 script under the Hive installation
./hiveserver2
- Step 2: open a new Xshell window
# 1. Start the beeline client
beeline
# 2. Connect to HiveServer2
!connect jdbc:hive2://192.168.6.102:10000
# 3. Enter the username and password
Enter username for jdbc:hive2://192.168.6.102:10000: atguigu
Enter password for jdbc:hive2://192.168.6.102:10000: ******
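Note that hiveserver2 started this way runs in the foreground and stops when the terminal closes. To keep it running, it can be started in the background instead (a minimal sketch, assuming the Hive installation at /opt/module/hive as in dolphinscheduler_env.sh further below; the log path is arbitrary):
nohup /opt/module/hive/bin/hiveserver2 > /tmp/hiveserver2.log 2>&1 &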
(3) Create the Hive data source in DolphinScheduler
5.3 Hive data source connection failure
(1) Symptom
(2) Cause
# 1. Check whether HiveServer2 is running
netstat -anp |grep 10000
(3) Solution
Start HiveServer2
5.4 No task instance after submitting the workflow
- Check the DolphinScheduler logs
current cpu load average 0.35 is too high or available memory 0.14G is too low, under max.cpuload.avg=16.0 and reserved.memory=0.3G
[WARN] 2021-12-06 17:17:45.031 org.apache.dolphinscheduler.server.master.dispatch.host.LowerWeightHostManager:[159] - worker 192.168.6.102:1234 current cpu load average 0.42 is too high or available memory 0.14G is too low
[WARN] 2021-12-06 17:17:45.032 org.apache.dolphinscheduler.server.master.dispatch.host.LowerWeightHostManager:[159] - worker 192.168.6.102:1234 current cpu load average 0.42 is too high or available memory 0.14G is too low
[WARN] 2021-12-06 17:17:50.032 org.apache.dolphinscheduler.server.master.dispatch.host.LowerWeightHostManager:[159] - worker 192.168.6.102:1234 current cpu load average 0.42 is too high or available memory 0.14G is too low
[WARN] 2021-12-06 17:17:50.032 org.apache.dolphinscheduler.server.master.dispatch.host.LowerWeightHostManager:[159] - worker 192.168.6.102:1234 current cpu load average 0.42 is too high or available memory 0.14G is too low
[WARN] 2021-12-06 17:17:51.317 org.apache.dolphinscheduler.server.registry.HeartBeatTask:[80] - current cpu load average 0.3 is too high or available memory 0.14G is too low, under max.cpuload.avg=16.0 and reserved.memory=0.3G
- Solution
Spend a fortune upgrading the machine; more realistically, free up memory on the worker or relax the thresholds shown in the warning (see the sketch below).
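The thresholds in the warning come from the worker configuration, so short of new hardware they can be relaxed (a hedged sketch; the property names are those of DolphinScheduler 1.3.x conf/worker.properties and may differ in other versions, and the services need a restart afterwards):
# conf/worker.properties on the worker node
worker.max.cpuload.avg=16
worker.reserved.memory=0.1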
5.5 Wrong DataX file path
(1) Symptom
- The log complains that no such file or directory exists, yet that is exactly the DataX launch directory on the cluster
(2) Cause
- The DataX path configured in DolphinScheduler is wrong, so the file cannot be found
- DataX is only deployed on hadoop102, not on the hadoop103 and hadoop104 nodes
(3) Solution
- scp the DataX installation to the other nodes of the cluster, as sketched below
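For example, if DataX lives under /opt/module/datax on hadoop102 as above, copy it to the other workers (the hostnames hadoop103/hadoop104 follow this cluster's naming and are an assumption):
scp -r /opt/module/datax hadoop103:/opt/module/
scp -r /opt/module/datax hadoop104:/opt/module/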
5.6 /bin/sh: java: command not found
(1) Symptom
(2) Cause
- Not found yet; still unresolved, and the log is not much help…
(3) Outcome
- Gave up on scheduling DataX through the shell approach
5.7 [Errno 2] No such file or directory
(1) Symptom
(2) Cause
- The dolphinscheduler_env.sh file is not configured
(3) Solution
- Add the following content to dolphinscheduler_env.sh
#JAVA_HOME
export JAVA_HOME=/opt/module/jdk1.8.0_212
export PATH=$PATH:$JAVA_HOME/bin
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-3.1.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
#HIVE_HOME
export HIVE_HOME=/opt/module/hive
export PATH=$PATH:$HIVE_HOME/bin
#KAFKA_HOME
export KAFKA_HOME=/opt/module/kafka
export PATH=$PATH:$KAFKA_HOME/bin
#HBASE_HOME
export HBASE_HOME=/opt/module/hbase
export PATH=$PATH:$HBASE_HOME/bin
#FLINK_HOME
export FLINK_HOME=/opt/module/flink-1.10.1
export PATH=$PATH:$FLINK_HOME/bin
#SPARK_HOME
export SPARK_HOME=/opt/module/spark-3.1.2
export PATH=$PATH:$SPARK_HOME/bin
export PATH=$PATH:$SPARK_HOME/sbin
#DATAX_HOME
export DATAX_HOME=/opt/module/datax
export PATH=$PATH:$DATAX_HOME/bin
- Then distribute the file to the other nodes
xsync dolphinscheduler_env.sh
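A quick way to verify the distribution on each node is to source the file and check that the commands the scheduler needs now resolve (a minimal sketch; the conf/env path of your DolphinScheduler installation is an assumption):
source /opt/module/dolphinscheduler/conf/env/dolphinscheduler_env.sh
which java
which hive
ls $DATAX_HOME/bin/datax.py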
5.8 org.apache.hadoop.security.AccessControlException: Permission denied: test123
(1) Symptom
- The file cannot be written
(2) Cause
- The scheduler runs the job under the test123 tenant, while the Hive table's HDFS directory belongs to the atguigu user, so the write fails with insufficient permissions
(3) Solution
- Configure the atguigu user under Security -> Tenant Manage
- Select the atguigu tenant when saving the workflow
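The conflicting ownership can be confirmed directly on HDFS; the path is the table location used throughout this article:
hdfs dfs -ls /user/hive/warehouse/ods.db | grep ods_t_yybzb_tgh_bankinfo_di
# the directory owner should show atguigu, while the job ran as the test123 tenant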
5.9 MySQL connection failure
(1) Symptom
java.lang.Exception: DataX cannot connect to the database. Possible causes: 1) the configured ip/port/database/jdbc is wrong and the connection cannot be established; 2) the configured username/password is wrong and authentication fails. Please confirm the database connection information with your DBA.
(2) Cause
- The JDBC connection parameters are misconfigured
(3) Solution
Add useSSL=false to the JDBC connection URL
"jdbcUrl": ["jdbc:mysql://$ip:$port/$sid?useSSL=false"],
Original post: https://blog.csdn.net/qq_27924553/article/details/121772937