Orchestrator + VIP for High Availability
Environment
OS: CentOS 7.6
Servers:
192.168.20.101 node1
192.168.20.102 node2
192.168.20.103 node3
Database: MySQL 5.7.27, already installed on all three nodes
GTID enabled
log-slave-updates = ON
report_host should preferably be configured on every instance
Orchestrator version: v3.1.4
MySQL master/replica setup is omitted here. Note the extra parameters below when configuring replication; they shorten failure detection and switchover time.
change master to
master_host='192.168.20.101',
master_port=3306,
master_user='rpl',
master_password='123456',
master_auto_position=1,
MASTER_HEARTBEAT_PERIOD=2,
MASTER_CONNECT_RETRY=1,
MASTER_RETRY_COUNT=86400;
set global slave_net_timeout=8;
set global read_only=1;
set global super_read_only=1;
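The heartbeat, retry, and slave_net_timeout values above are what let a replica notice a dead master within a few seconds instead of the default of one minute. A minimal check (assuming a local root login; adjust credentials as needed) that the settings took effect on each replica:
# confirm the global variables and the replication connection settings
mysql -uroot -p -e "SELECT @@slave_net_timeout, @@read_only, @@super_read_only;"
mysql -uroot -p -e "SHOW SLAVE STATUS\G" | grep -E 'Master_Host|Connect_Retry|Retry_Count|Auto_Position'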
1-Basic Configuration
1.1-Configure hosts
Add the host entries on every machine:
echo '192.168.20.101 node1' >> /etc/hosts
echo '192.168.20.102 node2' >> /etc/hosts
echo '192.168.20.103 node3' >> /etc/hosts
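An optional sanity check that all three names resolve locally on each node:
# should print the three entries that were just added
getent hosts node1 node2 node3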
1.2-Configure passwordless SSH
Run on all machines:
# press Enter through all prompts
ssh-keygen
ssh-copy-id node1
ssh-copy-id node2
ssh-copy-id node3
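To verify that passwordless SSH works from each node to every other node, a quick loop (run on every machine) might look like this:
# should print the three hostnames without asking for a password
for h in node1 node2 node3; do ssh -o BatchMode=yes $h hostname; done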
2-Deploy orchestrator
2.1-Download
Download from: https://github.com/openark/orchestrator/releases
wget https://github.com/openark/orchestrator/releases/download/v3.1.4/orchestrator-3.1.4-1.x86_64.rpm
wget https://github.com/openark/orchestrator/releases/download/v3.1.4/orchestrator-client-3.1.4-1.x86_64.rpm
2.2-Install
Install both the orchestrator server and client on all three machines.
# install dependency
yum -y install jq
# install
[root@node1 ~]# rpm -ivh orchestrator-3.1.4-1.x86_64.rpm
Preparing... ################################# [100%]
Updating / installing...
1:orchestrator-1:3.1.4-1 ################################# [100%]
[root@node1 ~]# rpm -ivh orchestrator-client-3.1.4-1.x86_64.rpm
Preparing... ################################# [100%]
Updating / installing...
1:orchestrator-client-1:3.1.4-1 ################################# [100%]
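The RPMs place the binaries and sample configuration files under /usr/local/orchestrator. A quick way to confirm both packages landed on every node:
# both the server and client packages should be listed
rpm -qa | grep orchestrator
ls /usr/local/orchestrator/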
2.3-Create the orchestrator management user
# run on the MySQL master
mysql> CREATE USER 'orchestrator'@'192.168.20.%' IDENTIFIED BY '123456';
mysql> GRANT SUPER, PROCESS, REPLICATION SLAVE, RELOAD ON *.* TO 'orchestrator'@'192.168.20.%';
mysql> GRANT SELECT ON mysql.slave_master_info TO 'orchestrator'@'192.168.20.%';
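Before continuing, it is worth confirming from each of the three nodes that the topology user can actually reach the master; a minimal sketch using the credentials created above:
# should return the hostname/report_host of the master
mysql -h192.168.20.101 -P3306 -uorchestrator -p123456 -e "SELECT @@hostname, @@report_host;"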
2.4-Set up the configuration file
cp /usr/local/orchestrator/orchestrator-sample-sqlite.conf.json /etc/orchestrator.conf.json
vi /etc/orchestrator.conf.json
Modify the following parameters:
- MySQLTopologyUser / MySQLTopologyPassword
  user name / password orchestrator uses to probe the MySQL cluster
- SQLite3DataFile
  path of the SQLite backend database; the directory must be writable
- DefaultInstancePort
  MySQL instance port
- FailureDetectionPeriodBlockMinutes
  a failure detected again within this window is not reported a second time
- RecoveryPeriodBlockSeconds
  a failure occurring again within this window does not trigger another failover, avoiding concurrent recoveries and instability
- RecoverMasterClusterFilters
  perform master recovery only for clusters matching these regex patterns ("*" matches all)
- RecoverIntermediateMasterClusterFilters
  perform intermediate-master recovery only for clusters matching these regex patterns ("*" matches all)
- PostFailoverProcesses
  change to the actual path where the hook script is stored
- Raft settings (added at the end)
  used to make orchestrator itself highly available
- RaftBind
  change to the local machine's IP
{
"Debug": true,
"EnableSyslog": false,
"ListenAddress": ":3000",
"MySQLTopologyUser": "orchestrator",
"MySQLTopologyPassword": "123456",
"MySQLTopologyCredentialsConfigFile": "",
"MySQLTopologySSLPrivateKeyFile": "",
"MySQLTopologySSLCertFile": "",
"MySQLTopologySSLCAFile": "",
"MySQLTopologySSLSkipVerify": true,
"MySQLTopologyUseMutualTLS": false,
"BackendDB": "sqlite",
"SQLite3DataFile": "/usr/local/orchestrator/orchestrator.db",
"MySQLConnectTimeoutSeconds": 1,
"DefaultInstancePort": 3306,
"DiscoverByShowSlaveHosts": true,
"InstancePollSeconds": 5,
"DiscoveryIgnoreReplicaHostnameFilters": [
"a_host_i_want_to_ignore[.]example[.]com",
".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
"a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307"
],
"UnseenInstanceForgetHours": 240,
"SnapshotTopologiesIntervalHours": 0,
"InstanceBulkOperationsWaitTimeoutSeconds": 10,
"HostnameResolveMethod": "default",
"MySQLHostnameResolveMethod": "@@hostname",
"SkipBinlogServerUnresolveCheck": true,
"ExpiryHostnameResolvesMinutes": 60,
"RejectHostnameResolvePattern": "",
"ReasonableReplicationLagSeconds": 10,
"ProblemIgnoreHostnameFilters": [],
"VerifyReplicationFilters": false,
"ReasonableMaintenanceReplicationLagSeconds": 20,
"CandidateInstanceExpireMinutes": 60,
"AuditLogFile": "",
"AuditToSyslog": false,
"RemoveTextFromHostnameDisplay": ".mydomain.com:3306",
"ReadOnly": false,
"AuthenticationMethod": "",
"HTTPAuthUser": "",
"HTTPAuthPassword": "",
"AuthUserHeader": "",
"PowerAuthUsers": [
"*"
],
"ClusterNameToAlias": {
"127.0.0.1": "test suite"
},
"ReplicationLagQuery": "",
"DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)",
"DetectClusterDomainQuery": "",
"DetectInstanceAliasQuery": "",
"DetectPromotionRuleQuery": "",
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com",
"PromotionIgnoreHostnameFilters": [],
"DetectSemiSyncEnforcedQuery": "",
"ServeAgentsHttp": false,
"AgentsServerPort": ":3001",
"AgentsUseSSL": false,
"AgentsUseMutualTLS": false,
"AgentSSLSkipVerify": false,
"AgentSSLPrivateKeyFile": "",
"AgentSSLCertFile": "",
"AgentSSLCAFile": "",
"AgentSSLValidOUs": [],
"UseSSL": false,
"UseMutualTLS": false,
"SSLSkipVerify": false,
"SSLPrivateKeyFile": "",
"SSLCertFile": "",
"SSLCAFile": "",
"SSLValidOUs": [],
"URLPrefix": "",
"StatusEndpoint": "/api/status",
"StatusSimpleHealth": true,
"StatusOUVerify": false,
"AgentPollMinutes": 60,
"UnseenAgentForgetHours": 6,
"StaleSeedFailMinutes": 60,
"SeedAcceptableBytesDiff": 8192,
"PseudoGTIDPattern": "",
"PseudoGTIDPatternIsFixedSubstring": false,
"PseudoGTIDMonotonicHint": "asc:",
"DetectPseudoGTIDQuery": "",
"BinlogEventsChunkSize": 10000,
"SkipBinlogEventsContaining": [],
"ReduceReplicationAnalysisCount": true,
"FailureDetectionPeriodBlockMinutes": 5,
"RecoveryPeriodBlockSeconds": 30,
"RecoveryIgnoreHostnameFilters": [],
"RecoverMasterClusterFilters": [
"*"
],
"RecoverIntermediateMasterClusterFilters": [
"*"
],
"OnFailureDetectionProcesses": [
"echo '`date +'%Y-%m-%d %T'` Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
],
"PreGracefulTakeoverProcesses": [
"echo '`date +'%Y-%m-%d %T'` Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
],
"PreFailoverProcesses": [
"echo '`date +'%Y-%m-%d %T'` Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
],
"PostFailoverProcesses": [
"echo '`date +'%Y-%m-%d %T'` (for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}; failureClusterAlias:{failureClusterAlias}' >> /tmp/recovery.log",
"/usr/local/orchestrator/orch_hook.sh {failureType} {failureClusterAlias} {failedHost} {successorHost} >> /tmp/orch.log"
],
"PostUnsuccessfulFailoverProcesses": [ "echo '`date +'%Y-%m-%d %T'` Unsuccessful Failover ' >> /tmp/recovery.log"],
"PostMasterFailoverProcesses": [
"echo '`date +'%Y-%m-%d %T'` Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostIntermediateMasterFailoverProcesses": [
"echo '`date +'%Y-%m-%d %T'` Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostGracefulTakeoverProcesses": [
"echo '`date +'%Y-%m-%d %T'` Planned takeover complete' >> /tmp/recovery.log"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostSlavesAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
"PreventCrossDataCenterMasterFailover": false,
"PreventCrossRegionMasterFailover": false,
"MasterFailoverDetachReplicaMasterHost": false,
"MasterFailoverLostInstancesDowntimeMinutes": 0,
"PostponeReplicaRecoveryOnLagMinutes": 0,
"OSCIgnoreHostnameFilters": [],
"GraphiteAddr": "",
"GraphitePath": "",
"GraphiteConvertHostnameDotsToUnderscores": true,
"ConsulAddress": "",
"ConsulAclToken": "",
"RaftEnabled":true,
"RaftDataDir":"/usr/local/orchestrator",
"RaftBind":"192.168.20.101",
"DefaultRaftPort":10008,
"RaftNodes":[
"192.168.20.101",
"192.168.20.102",
"192.168.20.103"
]
}
2.5-Copy the configuration file to the other machines
scp /etc/orchestrator.conf.json root@node2:/etc/orchestrator.conf.json
scp /etc/orchestrator.conf.json root@node3:/etc/orchestrator.conf.json
On node2 and node3, change the "RaftBind":"192.168.20.101" entry at the end to the local machine's own IP address.
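If you prefer not to edit the copies by hand, a one-liner along these lines (adjust the target IP for each host) rewrites RaftBind in place:
# run on node2, for example
sed -i 's/"RaftBind":"192.168.20.101"/"RaftBind":"192.168.20.102"/' /etc/orchestrator.conf.json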
2.6-Create the VIP switchover scripts
Create both scripts on every server; it is easiest to create them once, copy them to the other nodes, and adjust per host:
/usr/local/orchestrator/orch_hook.sh
/usr/local/orchestrator/orch_vip.sh
orch_hook.sh
What you may need to modify:
- Note that the script path is currently /usr/local/orchestrator/orch_vip.sh
- array=( ens32 "192.168.20.111" root "192.168.20.101")
  The values are: network interface name, VIP address, SSH user, local IP
- MYSQL_PWD can be ignored
[root@node1 orchestrator]# cat orch_hook.sh
#!/bin/bash
isitdead=$1
cluster=$2
oldmaster=$3
newmaster=$4
mysqluser="orchestrator"
export MYSQL_PWD="xxxpassxxx"
logfile="/var/log/orch_hook.log"
# list of clusternames
#clusternames=(rep blea lajos)
# clustername=( interface IP user Inter_IP)
#rep=( ens32 "192.168.56.121" root "192.168.56.125")
if [[ $isitdead == "DeadMaster" ]]; then
array=( ens32 "192.168.20.111" root "192.168.20.101")
interface=${array[0]}
IP=${array[1]}
user=${array[2]}
if [ ! -z ${IP} ] ; then
echo $(date)
echo "Revocering from: $isitdead"
echo "New master is: $newmaster"
echo "/usr/local/orchestrator/orch_vip.sh -d 1 -n $newmaster -i ${interface} -I ${IP} -u ${user} -o $oldmaster" | tee $logfile
/usr/local/orchestrator/orch_vip.sh -d 1 -n $newmaster -i ${interface} -I ${IP} -u ${user} -o $oldmaster
#mysql -h$newmaster -u$mysqluser < /usr/local/bin/orch_event.sql
else
echo "Cluster does not exist!" | tee $logfile
fi
elif [[ $isitdead == "DeadIntermediateMasterWithSingleSlaveFailingToConnect" ]]; then
array=( ens32 "192.168.20.111" root "192.168.20.101")
interface=${array[0]}
IP=${array[3]}
user=${array[2]}
slavehost=`echo $5 | cut -d":" -f1`
echo $(date)
echo "Revocering from: $isitdead"
echo "New intermediate master is: $slavehost"
echo "/usr/local/orchestrator/orch_vip.sh -d 1 -n $slavehost -i ${interface} -I ${IP} -u ${user} -o $oldmaster" | tee $logfile
/usr/local/orchestrator/orch_vip.sh -d 1 -n $slavehost -i ${interface} -I ${IP} -u ${user} -o $oldmaster
elif [[ $isitdead == "DeadIntermediateMaster" ]]; then
array=( ens32 "192.168.20.111" root "192.168.20.101")
interface=${array[0]}
IP=${array[3]}
user=${array[2]}
slavehost=`echo $5 | sed -E "s/:[0-9]+//g" | sed -E "s/,/ /g"`
showslave=`mysql -h$newmaster -u$mysqluser -sN -e "SHOW SLAVE HOSTS;" | awk '{print $2}'`
newintermediatemaster=`echo $slavehost $showslave | tr ' ' '\n' | sort | uniq -d`
echo $(date)
echo "Revocering from: $isitdead"
echo "New intermediate master is: $newintermediatemaster"
echo "/usr/local/orchestrator/orch_vip.sh -d 1 -n $newintermediatemaster -i ${interface} -I ${IP} -u ${user} -o $oldmaster" | tee $logfile
/usr/local/orchestrator/orch_vip.sh -d 1 -n $newintermediatemaster -i ${interface} -I ${IP} -u ${user} -o $oldmaster
fi
orch_vip.sh
If you want email notifications, change emailaddress to your address and set sendmail to 1.
[root@node1 orchestrator]# cat orch_vip.sh
#!/bin/bash
emailaddress="email@example.com"
sendmail=0
function usage {
cat << EOF
usage: $0 [-h] [-d master is dead] [-o old master ] [-s ssh options] [-n new master] [-i interface] [-I] [-u SSH user]
OPTIONS:
-h Show this message
-o string Old master hostname or IP address
-d int If master is dead should be 1 otherwise it is 0
-s string SSH options
-n string New master hostname or IP address
-i string Interface example eth0:1
-I string Virtual IP
-u string SSH user
EOF
}
while getopts ho:d:s:n:i:I:u: flag; do
case $flag in
o)
orig_master="$OPTARG";
;;
d)
isitdead="${OPTARG}";
;;
s)
ssh_options="${OPTARG}";
;;
n)
new_master="$OPTARG";
;;
i)
interface="$OPTARG";
;;
I)
vip="$OPTARG";
;;
u)
ssh_user="$OPTARG";
;;
h)
usage;
exit 0;
;;
*)
usage;
exit 1;
;;
esac
done
if [ $OPTIND -eq 1 ]; then
echo "No options were passed";
usage;
fi
shift $(( OPTIND - 1 ));
# discover commands from our path
ssh=$(which ssh)
arping=$(which arping)
ip2util=$(which ip)
# command for adding our vip
cmd_vip_add="sudo -n $ip2util address add ${vip} dev ${interface}"
# command for deleting our vip
cmd_vip_del="sudo -n $ip2util address del ${vip}/32 dev ${interface}"
# command for discovering if our vip is enabled
cmd_vip_chk="sudo -n $ip2util address show dev ${interface} to ${vip%/*}/32"
# command for sending gratuitous arp to announce ip move
cmd_arp_fix="sudo -n $arping -c 1 -I ${interface} ${vip%/*} "
# command for sending gratuitous arp to announce ip move on current server
cmd_local_arp_fix="sudo -n $arping -c 1 -I ${interface} ${vip%/*} "
vip_stop() {
rc=0
# ensure the vip is removed
$ssh ${ssh_options} -tt ${ssh_user}@${orig_master} \
"[ -n \"\$(${cmd_vip_chk})\" ] && ${cmd_vip_del} && sudo ${ip2util} route flush cache || [ -z \"\$(${cmd_vip_chk})\" ]"
rc=$?
return $rc
}
vip_start() {
rc=0
# ensure the vip is added
# this command should exit with failure if we are unable to add the vip
# if the vip already exists always exit 0 (whether or not we added it)
$ssh ${ssh_options} -tt ${ssh_user}@${new_master} \
"[ -z \"\$(${cmd_vip_chk})\" ] && ${cmd_vip_add} && ${cmd_arp_fix} || [ -n \"\$(${cmd_vip_chk})\" ]"
rc=$?
$cmd_local_arp_fix
return $rc
}
vip_status() {
$arping -c 1 -I ${interface} ${vip%/*}
if ping -c 1 -W 1 "$vip"; then
return 0
else
return 1
fi
}
if [[ $isitdead == 0 ]]; then
echo "Online failover"
if vip_stop; then
if vip_start; then
echo "$vip is moved to $new_master."
if [ $sendmail -eq 1 ]; then mail -s "$vip is moved to $new_master." "$emailaddress" < /dev/null &> /dev/null ; fi
else
echo "Can't add $vip on $new_master!"
if [ $sendmail -eq 1 ]; then mail -s "Can't add $vip on $new_master!" "$emailaddress" < /dev/null &> /dev/null ; fi
exit 1
fi
else
echo $rc
echo "Can't remove the $vip from orig_master!"
if [ $sendmail -eq 1 ]; then mail -s "Can't remove the $vip from orig_master!" "$emailaddress" < /dev/null &> /dev/null ; fi
exit 1
fi
elif [[ $isitdead == 1 ]]; then
echo "Master is dead, failover"
# make sure the vip is not available
if vip_status; then
if vip_stop; then
if [ $sendmail -eq 1 ]; then mail -s "$vip is removed from orig_master." "$emailaddress" < /dev/null &> /dev/null ; fi
else
if [ $sendmail -eq 1 ]; then mail -s "Couldn't remove $vip from orig_master." "$emailaddress" < /dev/null &> /dev/null ; fi
exit 1
fi
fi
if vip_start; then
echo "$vip is moved to $new_master."
if [ $sendmail -eq 1 ]; then mail -s "$vip is moved to $new_master." "$emailaddress" < /dev/null &> /dev/null ; fi
else
echo "Can't add $vip on $new_master!"
if [ $sendmail -eq 1 ]; then mail -s "Can't add $vip on $new_master!" "$emailaddress" < /dev/null &> /dev/null ; fi
exit 1
fi
else
echo "Wrong argument, the master is dead or live?"
fi
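Both scripts must be executable and, as noted in 2.6, present on every node. A minimal sketch for distributing them from node1:
chmod +x /usr/local/orchestrator/orch_hook.sh /usr/local/orchestrator/orch_vip.sh
for h in node2 node3; do
  scp /usr/local/orchestrator/orch_hook.sh /usr/local/orchestrator/orch_vip.sh root@$h:/usr/local/orchestrator/
  ssh root@$h 'chmod +x /usr/local/orchestrator/orch_hook.sh /usr/local/orchestrator/orch_vip.sh'
done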
2.7-Set up the VIP
Create the VIP on the master node only:
ip addr add 192.168.20.111 dev ens32
# to remove it: ip addr del 192.168.20.111 dev ens32
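To confirm where the VIP is currently bound (it should appear only on the master):
ip addr show dev ens32 | grep 192.168.20.111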
2.8-Start orchestrator
Start it on all machines:
cd /usr/local/orchestrator && nohup ./orchestrator --config=/etc/orchestrator.conf.json http &
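Once all three instances are up, the Raft cluster takes a moment to elect a leader. The status endpoint configured above ("StatusEndpoint": "/api/status") can be used to check each node; a quick sketch:
# run on each node; a healthy instance responds on port 3000
curl -s http://127.0.0.1:3000/api/status
# the nohup output in /usr/local/orchestrator can also be checked for startup errors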
2.9-Set the environment variable
Because Raft is enabled there are multiple orchestrator instances, so the ORCHESTRATOR_API environment variable is needed; orchestrator-client will automatically pick the leader from the list.
On all machines, run vi /etc/profile
# append at the end
export ORCHESTRATOR_API="node1:3000/api node2:3000/api node3:3000/api"
# then run source for it to take effect
source /etc/profile
2.10-Discover the topology
# run on any server to find which node is the leader
[root@node2 ~]# orchestrator-client -c which-api
node1:3000/api
# run on any server
[root@node2 ~]# orchestrator-client -c discover -i 192.168.20.101:3306
node1:3306
3-Testing
3.1-Kill or shut down MySQL
Result: failover happens automatically and the VIP floats to the new master.
3.2-Power off the master server directly
Result: failover happens automatically and the VIP floats to the new master.
3.3-Recovery
Once the failed node is back, re-attach it as a replica of the current master (adjust master_host to the current master's IP if the topology has changed):
change master to master_host='192.168.20.101',master_port=3306,master_user='orchestrator',master_password='123456',master_auto_position=1,MASTER_HEARTBEAT_PERIOD=2,MASTER_CONNECT_RETRY=1,MASTER_RETRY_COUNT=86400;
set global slave_net_timeout=8;
start slave;
show slave status\G
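Before re-attaching the recovered node it is also worth making sure the VIP is no longer bound on it; if it still is, remove it as in section 2.7:
# run on the recovered old master
ip addr show dev ens32 | grep 192.168.20.111 && ip addr del 192.168.20.111/32 dev ens32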
4.1-View the topology
# on any node
[root@node1 orchestrator]# orchestrator-client -c topology -i node2
node1:3306 [0s,ok,5.7.27-log,rw,ROW,>>,GTID]
+ node2:3306 [0s,ok,5.7.27-log,ro,ROW,>>,GTID]
+ node3:3306 [0s,ok,5.7.27-log,ro,ROW,>>,GTID]
4.2-Set read-only / read-write mode
# set node2 to read-only
orchestrator-client -c set-read-only -i node2
# set node2 to read-write
orchestrator-client -c set-writeable -i node2
4.3-Graceful takeover between the master and a designated replica
# promote node2 to master; on the old master node1 you must run START SLAVE manually afterwards
orchestrator-client -c graceful-master-takeover -a node1:3306 -d node2:3306
Topology before:
node1:3306 [0s,ok,5.7.27-log,rw,ROW,>>,GTID]
+ node2:3306 [0s,ok,5.7.27-log,ro,ROW,>>,GTID]
+ node3:3306 [0s,ok,5.7.27-log,ro,ROW,>>,GTID]
Topology after:
node2:3306 [0s,ok,5.7.27-log,rw,ROW,>>,GTID]
- node1:3306 [null,nonreplicating,5.7.27-log,ro,ROW,>>,GTID,downtimed] # replication must be started manually
+ node3:3306 [0s,ok,5.7.27-log,ro,ROW,>>,GTID]
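As the downtimed/nonreplicating annotation above indicates, the demoted node1 is left with replication stopped. A minimal sketch of bringing it back as a replica (credentials assumed from the examples above):
# on node1
mysql -uroot -p -e "START SLAVE;"
# then, from any node, confirm the topology again
orchestrator-client -c topology -i node2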
References:
https://www.cnblogs.com/zhoujinyi/p/10394389.html
https://github.com/openark/orchestrator
https://github.com/theTibi/orchestrator_vip
Appendix: annotated configuration parameters
{
"Debug": true, #debug模式,输出详细信息
"EnableSyslog": false, #是否输出到系统日志里
"ListenAddress": ":3000", #orch监听的端口,web端口
"MySQLTopologyUser": "orchestrator", #后端被管理的集群账号:所有实例都要有
"MySQLTopologyPassword": "123456", #后端被管理的集群密码
"MySQLTopologyCredentialsConfigFile": "", #后端集群的用户密码配置文件,账号密码可以直接写入文件,读取
"MySQLTopologySSLPrivateKeyFile": "", #SSL相关
"MySQLTopologySSLCertFile": "",
"MySQLTopologySSLCAFile": "", #证书相关
"MySQLTopologySSLSkipVerify": true, #跳过SSL验证
"MySQLTopologyUseMutualTLS": false, #是否使用TLS验证
"BackendDB": "sqlite", # 后台库类型
"SQLite3DataFile": "/usr/local/orchestrator/orchestrator.db", # 数据文件存放路径
"MySQLConnectTimeoutSeconds": 1, #orch连接MySQL超时秒数
"DefaultInstancePort": 3306, #mysql 集群实例端口
"DiscoverByShowSlaveHosts": true, #是否使用show slave hosts自动发现集群
"InstancePollSeconds": 5, #使用show slave hosts 探测间隔秒数
"DiscoveryIgnoreReplicaHostnameFilters": [
"a_host_i_want_to_ignore[.]example[.]com",
".*[.]ignore_all_hosts_from_this_domain[.]example[.]com",
"a_host_with_extra_port_i_want_to_ignore[.]example[.]com:3307"
],
"UnseenInstanceForgetHours": 240, #忽略不可见的实例的小时数
"SnapshotTopologiesIntervalHours": 0, #快照拓扑调用之间的小时间隔。默认:0(表示禁用)
"InstanceBulkOperationsWaitTimeoutSeconds": 10, #执行批量操作时,在单个实例上等待的时间
"HostnameResolveMethod": "default", #解析主机名,默认使用主机名:default;不解析为none,直接用IP
"MySQLHostnameResolveMethod": "@@hostname", #解析主机名,发出select @@hostname或发出select @@report_host(需要配置report_host);不解析用"",直接用IP。
"SkipBinlogServerUnresolveCheck": true, #跳过检查 将未解析的主机名解析为和binlog服务器相同的主机名
"ExpiryHostnameResolvesMinutes": 60, #域名检测过期周期(分钟)
"RejectHostnameResolvePattern": "", #禁止使用正则表达式表示域名
"ReasonableReplicationLagSeconds": 10, #复制延迟高于10S表示异常
"ProblemIgnoreHostnameFilters": [], #将主机做正则匹配筛选成最小化
"VerifyReplicationFilters": false, #在拓扑重构之前检查复制筛选器
"ReasonableMaintenanceReplicationLagSeconds": 20, #复制延迟高于该值会上下移动调整MySQL拓扑
"CandidateInstanceExpireMinutes": 60, #该时间之后,使用实例作为候选从库(在主故障转移时提升)的建议到期
"AuditLogFile": "", #审计日志,空的时候禁用
"AuditToSyslog": false, #审计日志是否写入到系统日志
"RemoveTextFromHostnameDisplay": ".mydomain.com:3306", #去除集群的文本
"ReadOnly": false, #全局只读
"AuthenticationMethod": "", #身份验证类型
"HTTPAuthUser": "", #http验证用户名
"HTTPAuthPassword": "",
"AuthUserHeader": "",
"PowerAuthUsers": [
"*"
],
"ClusterNameToAlias": { #正则表达式匹配集群名称和别名之间的映射
"127.0.0.1": "test suite"
},
"ReplicationLagQuery": "",
"DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)", #查询集群别名
"DetectClusterDomainQuery": "", #可选查询,返回集群主服务器的VIP/别名/域名。
"DetectInstanceAliasQuery": "", #可选查询,返回实例的别名
"DetectPromotionRuleQuery": "", #可选查询,返回实例的提升规则
"DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com", #从主机名称中提取数据中心名称
"PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com", #从主机名中提取物理环境信息
"PromotionIgnoreHostnameFilters": [], #不使用主机名匹配模式来提升副本
"DetectSemiSyncEnforcedQuery": "", #检测是否强制半同步
"ServeAgentsHttp": false, #产生一个专用于orche-client的HTTP端口
"AgentsServerPort": ":3001", #可选,对于raft设置,此节点将向其他节点通告HTTP的地址
"AgentsUseSSL": false, #当为true时,orch将使用SSL侦听代理端口已经通过SSL连接的代理
"AgentsUseMutualTLS": false, #当为true时,使用TLS服务器与代理通信
"AgentSSLSkipVerify": false, #为代理使用SSL
"AgentSSLPrivateKeyFile": "",
"AgentSSLCertFile": "",
"AgentSSLCAFile": "",
"AgentSSLValidOUs": [],
"UseSSL": false, #在服务器WEB端口上使用SSL
"UseMutualTLS": false, #true时使用TLS作为服务器的WEB和API连接
"SSLSkipVerify": false,
"SSLPrivateKeyFile": "",
"SSLCertFile": "",
"SSLCAFile": "",
"SSLValidOUs": [],
"URLPrefix": "", #在非跟web路径上运行orch的URL前缀
"StatusEndpoint": "/api/status", #状态查看
"StatusSimpleHealth": true,
"StatusOUVerify": false,
"AgentPollMinutes": 60, #代理之间轮询的分钟数
"UnseenAgentForgetHours": 6, #忘记不可见代理的小时数
"StaleSeedFailMinutes": 60, #无进展60分钟后被认为失败
"SeedAcceptableBytesDiff": 8192, #种子源和目标源数据大小的字节差异仍被视为成功复制
"PseudoGTIDPattern": "", #为空时,禁用基于伪GTID的重构
"PseudoGTIDPatternIsFixedSubstring": false, #如为TRUE,则上个参数不被视为正则表达式而被视为固定子字符串
"PseudoGTIDMonotonicHint": "asc:",
"DetectPseudoGTIDQuery": "", #可选查询,用于决定是否在实例上启用伪GTID
"BinlogEventsChunkSize": 10000, #show binlog events 块的大小。较小意味着更少的锁定和工作要做
"SkipBinlogEventsContaining": [], #扫描/比较Pseudo-GTID的binlog 时,跳过包含给定文本的条目。这些不是正则表达式(扫描binlog时会消耗太多的CPU),只需查找子字符串
"ReduceReplicationAnalysisCount": true, #如果为true,则复制分析将报告可能首先处理问题的可能性的实例。 如果为false,则为每个已知实例提供一个条目
"FailureDetectionPeriodBlockMinutes": 10, #在该时间内再次出现故障,不会被再次发现
"RecoveryPeriodBlockSeconds": 30, #在该时间内再次出现故障,不会进行failover,避免出现并发恢复和不稳定
"RecoveryIgnoreHostnameFilters": [], #恢复会忽略的主机
"RecoverMasterClusterFilters": [ #只对能匹配这些正则表达式模式的集群进行主故障恢复
"*"
],
"RecoverIntermediateMasterClusterFilters": [ #只对能匹配这些正则表达式模式的集群进行主故障恢复(“*”模式匹配所有)
"*"
],
"OnFailureDetectionProcesses": [ #检测到主故障时执行的命令和脚本
"echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
],
"PreGracefulTakeoverProcesses": [ #在执行故障转移之前执行的命令和脚本
"echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
],
"PreFailoverProcesses": [ #执行恢复操作前执行
"echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
],
"PostFailoverProcesses": [ #在failover全部成功后执行
"echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostUnsuccessfulFailoverProcesses": [], #在failover失败后执行
"PostMasterFailoverProcesses": [ #在主恢复成功结束时执行
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostIntermediateMasterFailoverProcesses": [ #在中间主成功恢复结束时执行
"echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
],
"PostGracefulTakeoverProcesses": [ #在新主晋升之后执行
"echo 'Planned takeover complete' >> /tmp/recovery.log"
],
"CoMasterRecoveryMustPromoteOtherCoMaster": true, #当false时,任何实例都可以得到提升;当true时,将提升共同主人,否则失败
"DetachLostSlavesAfterMasterFailover": true, #恢复过程中可能会丢失一些副本。如果为true,将通过detach-replica命令强制中断其复制,并认为它们不正常运行。
"ApplyMySQLPromotionAfterMasterFailover": true, #在主上执行reset slave all,并设置read_only=0
"PreventCrossDataCenterMasterFailover": false, #如果为true(默认值:false),则不允许跨DC主故障转移,orchestrator将尽其所能仅在同一DC内进行故障转移,否则不进行故障转移。
"PreventCrossRegionMasterFailover": false,
"MasterFailoverDetachReplicaMasterHost": false,
"MasterFailoverLostInstancesDowntimeMinutes": 0, #主故障转移后丢失的服务器停机的分钟数(包括失败的主和丢失的从)。0表示禁用
"PostponeReplicaRecoveryOnLagMinutes": 0, #在崩溃恢复时,延迟超过给定分钟的从库在主被选出后才复活。 值为0将禁用此功能。
"OSCIgnoreHostnameFilters": [],
"GraphiteAddr": "",
"GraphitePath": "",
"GraphiteConvertHostnameDotsToUnderscores": true,
"ConsulAddress": "",
"ConsulAclToken": "", # 这里的逗号需要注意
# 在最后添加上,此配置用于高可用,注意上面一行要加上逗号
"RaftEnabled":true,
"RaftDataDir":"/usr/local/orchestrator",
"RaftBind":"192.168.20.101", # 根据本机IP更改
"DefaultRaftPort":10008, # 端口所有机器上得保持一致
"RaftNodes":[
"192.168.20.101",
"192.168.20.102",
"192.168.20.103"
]