digoal
2013-02-21
PostgreSQL , 监控 , nagios , check_postgres , customize nagios script , lock
数据库锁等待监控脚本.
提供给nagios使用 :
本例环境 :
被监控的数据库DATABASEs
1. 192.168.2.2 : 1921
2. 192.168.21.3 : 1921
监控服务器monitor server
192.168.100.2
nagios 服务器
192.168.10.2
首先要在被监控的数据库中创建1个超级用户, 用于查询pg_stat_activity表. 此例该用户名为digoal.
create role digoal superuser login connection limit 32 encrypted password 'DIGOAL';
接下来需要配置被监控的数据库的pg_hba.conf, 允许192.168.100.2通过digoal用户访问postgres数据库.
vi pg_hba.conf
host postgres digoal 192.168.100.2/32 md5
pg_ctl reload
然后需要在monitor server新建PGPASSFILE
vi /etc/.pgpass
*:*:postgres:digoal:DIGOAL
chmod 400 /etc/.pgpass
.pgpass的owner必须改成调用nagios脚本的用户. 该用户配置在/etc/xinetd.d/nrpe.
在监控服务器上新建检测脚本 :
vi /usr/local/nagios/libexec/check_pg_lock.sh
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGPASSFILE=/etc/.pgpass
create_func() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
echo $CONN_INFO
psql -q -t $CONN_INFO <<EOF
create or replace function lock_wait (i_lockwait interval) returns void as \$\$
declare
v_dbip inet;
v_dbport int;
v_datname name;
v_usename name;
v_client_addr inet;
v_client_port int;
v_query_start timestamp with time zone;
v_current_query text;
v_ver text;
begin
set client_min_messages = notice;
select substr(version(),12,3) into v_ver;
select inet_server_addr() into v_dbip;
select inet_server_port() into v_dbport;
if v_ver >= '9.2' then
for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in
select datname,usename,client_addr,client_port,query_start,query from pg_stat_activity where waiting and now()-query_start > i_lockwait
loop
raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'
,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;
end loop;
return;
else
for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in
select datname,usename,client_addr,client_port,query_start,current_query from pg_stat_activity where waiting and now()-query_start > i_lockwait
loop
raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'
,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;
end loop;
return;
end if;
return;
end;
\$\$ language plpgsql;
EOF
}
mon() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
echo $CONN_INFO
psql -q -t $CONN_INFO -c "$SQL"
}
# global result
result=0
# set env
. /usr/local/nagios/etc/check_pg_env.sh
# monitor
LOG="/tmp/check_pg_lock.log"
SQL="select lock_wait(interval '5 second')"
for i in "HZ1_CHECK_PG_LOCK_DIGOAL" "HZ2_CHECK_PG_LOCK_DIGOAL"
do
eval CONN_INFO="$"$i
create_func
cnt=`mon 2>&1|grep -c -E "NOTICE|ERROR"`
if [ $cnt -ge 1 ]; then
result=2
mon 2>&1|tee $LOG
fi
done
# exit global proc
exit $result
配置脚本权限 :
chmod 555 /usr/local/nagios/libexec/check_pg_lock.sh
配置连接文件 :
vi /usr/local/nagios/etc/check_pg_env.sh
# CHECK_PG_LOCK env
# 杭州某IDC
HZ1_CHECK_PG_LOCK_DIGOAL="-h 10.1.110.40 -p 1921 -U digoal postgres"
HZ2_CHECK_PG_LOCK_DIGOAL="-h 10.1.123.212 -p 1921 -U digoal postgres"
修改权限:
chmod 444 /usr/local/nagios/etc/check_pg_env.sh
修改监控服务器的nagios客户端配置文件 :
vi /usr/local/nagios/etc/nrpe.cfg
command[check_pg_lock]=/usr/local/nagios/libexec/check_pg_lock.sh
重启xinetd服务
service xinetd restart
Stopping xinetd: [ OK ]
Starting xinetd: [ OK ]
配置nagios服务端, 增加check_pg_lock的检测.
如果发现锁等待超时, 可以使用如下3个视图进行分析 :
CREATE VIEW lockview AS
SELECT pid,virtualtransaction AS vxid, locktype AS lock_type,
mode AS lock_mode,granted,
CASE
WHEN virtualxid IS NOT NULL AND transactionid IS NOT NULL
THEN virtualxid||' ' || transactionid
WHEN virtualxid::text IS NOT NULL
THEN virtualxid
ELSE transactionid::text
END AS xid_lock, relname,
page, tuple,classid,objid,objsubid
FROM pg_locks LEFT OUTER JOIN pg_class ON (pg_locks.relation= pg_class.oid)
WHERE -- donot showour view'slocks
pid != pg_backend_pid()AND
--no needtoshowself-vxid locks
virtualtransaction IS DISTINCT FROM virtualxid
--granted isordered earlier
ORDER BY 1, 2,5 DESC, 6, 3,4, 7;
CREATE VIEW lockview1 AS
SELECT pid,vxid, lock_type,lock_mode,granted,xid_lock, relname
FROM lockview
--granted isordered earlier
ORDER BY 1,2,5 DESC, 6,3,4,7;
CREATE VIEW lockview2 AS
SELECT pid,vxid, lock_type,page, tuple,classid,objid,objsubid
FROM lockview
--granted isfirst
--add non-display columns tomatch orderingoflockview
ORDER BY 1,2,granted DESC, vxid, xid_lock::text, 3,4,5,6,7,8;
例如 :
[root@db-digoal ~]# /usr/local/nagios/libexec/check_pg_lock.sh
NOTICE: dbip:10.1.173.212, dbport:1921, datname:postgres, usename:postgres, client_addr:<NULL>, client_port:-1, query_start:2013-02-21 21:35:23.245022+08, current_query:update test set id=1;
NOTICE: dbip:10.1.100.40, dbport:1921, datname:android_market, usename:postgres, client_addr:<NULL>, client_port:-1, query_start:2013-02-21 21:30:47.899411+08, current_query:update test set id=3;
NOTICE: dbip:10.1.100.40, dbport:1921, datname:android_market, usename:postgres, client_addr:<NULL>, client_port:-1, query_start:2013-02-21 21:31:22.157254+08, current_query:update test set id=3;
返回值, nagios要用到这个值判断检查状态.
[root@db-digoal ~]# echo $?
2
分析10.1.100.40的NOTICE :
select * from lockview1 where relname='test';
pid | vxid | lock_type | lock_mode | granted | xid_lock | relname
-------+------------+-----------+------------------+---------+----------+---------
12396 | 219/122462 | relation | RowExclusiveLock | t | | test
12396 | 219/122462 | tuple | ExclusiveLock | t | | test
21011 | 234/100082 | relation | RowExclusiveLock | t | | test
21011 | 234/100082 | tuple | ExclusiveLock | f | | test
27207 | 236/256259 | relation | RowExclusiveLock | t | | test
(5 rows)
从以上锁信息分析,
12396正是21011和27207等待的会话.
KILL掉即可.
digoal=# select pg_terminate_backend(12396);
pg_terminate_backend
----------------------
t
(1 row)
digoal=# select * from lockview1 where relname='test';
pid | vxid | lock_type | lock_mode | granted | xid_lock | relname
-------+------------+-----------+------------------+---------+----------+---------
21011 | 234/100082 | relation | RowExclusiveLock | t | | test
21011 | 234/100082 | tuple | ExclusiveLock | t | | test
27207 | 236/256259 | relation | RowExclusiveLock | t | | test
(3 rows)
此时21011获得了tuple exclusivelock, 而27207还未获得, 它在等21011释放.
KILL 21011即可.
digoal=# select pg_terminate_backend(21011);
1. 程序BUG可能带来持有锁, 但是不释放的问题. 例如持有锁后啥都不干, 也不结束事务, 也就是pg_stat_activity.current_query看到的IDLE in transaction.
\所以数据库的锁等待超时监控显得尤为重要.
2. 由于集中监控, 可能因为网络的延时导致执行时间过长, nagios有超时机制, 容易导致不稳定.
可以考虑将监控放到crontab中, nagios调用的脚本通过检测输出日志. 分出如下两部分 :
crontab执行脚本 :
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
export PGPASSFILE=/etc/.pgpass
create_func() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
#DEBUG switch
#echo $CONN_INFO
psql -q -t $CONN_INFO <<EOF
create or replace function lock_wait (i_lockwait interval) returns void as \$\$
declare
v_dbip inet;
v_dbport int;
v_datname name;
v_usename name;
v_client_addr inet;
v_client_port int;
v_query_start timestamp with time zone;
v_current_query text;
v_ver text;
begin
set client_min_messages = notice;
select substr(version(),12,3) into v_ver;
select inet_server_addr() into v_dbip;
select inet_server_port() into v_dbport;
if v_ver >= '9.2' then
for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in
select datname,usename,client_addr,client_port,query_start,query from pg_stat_activity where waiting and now()-query_start > i_lockwait
loop
raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'
,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;
end loop;
return;
else
for v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query in
select datname,usename,client_addr,client_port,query_start,current_query from pg_stat_activity where waiting and now()-query_start > i_lockwait
loop
raise notice 'dbip:%, dbport:%, datname:%, usename:%, client_addr:%, client_port:%, query_start:%, current_query:%'
,v_dbip,v_dbport,v_datname,v_usename,v_client_addr,v_client_port,v_query_start,v_current_query;
end loop;
return;
end if;
return;
end;
\$\$ language plpgsql;
EOF
}
mon() {
VER=`psql -q -t $CONN_INFO -c "select '9' from (select substr(version(),12,1) as ver) t where ver::int>=9"|grep -c 9`
if [ $VER -ge 1 ]; then
IS_STDBY=`psql -q -t $CONN_INFO -c "select 'this_is_standby' as a where pg_is_in_recovery();"|grep -c this_is_standby`
if [ $IS_STDBY -ge 1 ]; then
return 0
fi
fi
#DEBUG switch
#echo $CONN_INFO
psql -q -t $CONN_INFO -c "$SQL"
}
# global result
result=0
# set env
. /usr/local/nagios/etc/check_pg_env.sh
# monitor
LOGFILE="/tmp/check_pg_lock.log"
TMPLOGFILE="/tmp/check_pg_lock.log.tmp"
SQL="select lock_wait(interval '5 second')"
# 清除临时日志数据
echo "" >$TMPLOGFILE
for i in "HZ1_CHECK_PG_LOCK_DIGOAL" "HZ2_CHECK_PG_LOCK_DIGOAL"
do
eval CONN_INFO="$"$i
create_func
cnt=`mon 2>&1|grep -c -E "NOTICE|ERROR"`
if [ $cnt -ge 1 ]; then
result=2
mon >>$TMPLOGFILE 2>&1
fi
done
echo -e "return 2" >>$TMPLOGFILE 2>&1
# 替换数据
cat $TMPLOGFILE >$LOGFILE
# exit global proc
exit $result
nagios脚本 :
#!/bin/bash
# 环境变量
export LANG=en_US.utf8
export PGHOME=/opt/pgsql
export LD_LIBRARY_PATH=$PGHOME/lib:/lib64:/usr/lib64:/usr/local/lib64:/lib:/usr/lib:/usr/local/lib
export DATE=`date +"%Y%m%d%H%M"`
export PATH=$PGHOME/bin:$PATH:.
export MANPATH=$PGHOME/share/man:$MANPATH
LOGFILE=/tmp/check_pg_lock.log
CNT=`cat $LOGFILE|grep -c -E "NOTICE|ERROR"`
if [ $CNT -ge 1 ]; then
# summary
echo -e "ERROR: SQL lock timeout."
cat $LOGFILE|grep -v "^\$"|awk '{print $2}'|uniq -c
# detail
echo -e "\nDETAIL:"
cat $LOGFILE|grep -v "^\$"
exit 2
fi
exit 0
1. 现成的监控脚本可以参考bucardo的check_postgres.pl
支持nagios, mrtg, cacti等
http://bucardo.org/wiki/Check_postgres
2. csvlog监控
http://blog.163.com/digoal@126/blog/static/163877040201312241028667/