一.项目说明
1.1 项目目的
1.盘活服务器资源,提高资源的使用率;资源是公司的资产,只有尽可能发挥其价值,才能创造更多的价值。所以,有必要知道,公司整体(或某业务、产品)所属的 db server的资源使用情况。主要从cpu、内存、disk的平均数和中位数来反映。实现更合理的资源分配和集中性的管理,节省资源成本。
2.慢查询的次数,既可以说明程序的性能和server的压力,说明了待确认和优化的情况,也说明了资源的紧张性。
3.此类历史数据的积累,可以生成一个变化趋势图,说明资源使用趋势。
4.之前的监控大部分诊断具体的一个db server或应用,这个是针对公司整体(或某业务、产品)所属的 db server;是监控体系的一个完善和补充。
即:资源盘活、充分利用、降本增效、监控补充。
1.2 部署环境及架构
现有的监控数据已收集到influxdb 和 elasticsearch 中,本次要实现的功能是将数据计算聚合到mysql中,然后通过邮件发送给相关人员。存储到mysql 数据库中,一是因为 此类数据有一定的价值(具有追溯性和便于历史趋势分析),二是 influxdb 、elasticsearch 数据都有过期时间,数据保留的天数不是太长。
二.表的创建
2.1 存储db资源使用情况的表
表名定义为weekly_dbperformance,具体的脚本如下:
create table `weekly_dbperformance` ( `id` int(11) not null auto_increment, `cpu_mean` varchar(255) not null default '', `cpu_median` varchar(255) not null default '', `mem_mean` varchar(255) not null default '', `mem_median` varchar(255) not null default '', `disk_mean` varchar(255) not null default '', `disk_median` varchar(255) not null default '', `datetime_created` timestamp null default current_timestamp on update current_timestamp comment '数据行创建时间', primary key (`id`) ) engine=innodb auto_increment=3740 default charset=utf8mb4;
在记录数据生成的时间字段添加个索引
create index idx_datetime on weekly_dbperformance (datetime_created);
2.2 存储db 实例慢查询情况的表
表名定义为weekly_dbslowqty,具体的脚本如下:
create table `weekly_dbslowqty` ( `id` int(11) not null auto_increment, `qindex_name` varchar(50) not null default '', `qstartdate` varchar(50) not null default '', `qenddate` varchar(50) not null default '', `slowqty` varchar(20) not null default '', `datetime_created` timestamp null default current_timestamp on update current_timestamp comment '数据行创建时间', primary key (`id`) ) engine=innodb auto_increment=3740 default charset=utf8mb4;
在记录查询的开始时间的字段上添加个索引
create index idx_qstartdate on weekly_dbslowqty (qstartdate);
三.主要功能代码
3.1 统计db server资源使用率
可执行文件为collect_dbperformance.py
从influxdb中查询db server的资源使用情况。包括cpu的平均数、cpu的中位数、内存使用的平均数、内存使用的中位数、磁盘平均使用率、磁盘使用的中位数。
拉取计算的是过去7天的数据。
#!/usr/bin/python
# -*- coding: utf-8 -*-
from influxdb import influxdbclient
import pytz
import time
import dateutil.parser
import datetime
import db_monitor_conn
mysqldb = db_monitor_conn.db
# use cursor
cursor = mysqldb.cursor()
class dbapi(object):
"""
通过infludb获取数据
"""
def __init__(self, ip, port):
"""
初始化数据
:param ip:influxdb地址
:param port: 端口
"""
self.db_name = 'telegraf'
self.use_cpu_table = 'cpu' # cpu使用率表
self.phy_mem_table = 'mem'# 物理内存表
self.disk_table = 'disk'# 磁盘表
self.client = influxdbclient(ip, port, '用*户*名', '密*码', self.db_name) # 连接influxdb数据库
print ('test link influxdb')
def get_use_dbperformance(self, s_time, e_time):
"""
获取磁盘io使用率
:param host: 查询的主机host (telegraf 配置参数中的host栏位)
:param s_time: 开始时间
:param e_time: 结束时间
:return:
"""
response = {}
### 时间还需转换,否则报错 typeerror: tuple or struct_time argument required
#s = time.strptime(s_time, '%y-%m-%d %h:%m:%s')
#e = time.strptime(e_time, '%y-%m-%d %h:%m:%s')
s = time.strptime(s_time, '%y-%m-%d')
e = time.strptime(e_time, '%y-%m-%d')
start_time = int(time.mktime(s)) * 1000 * 1000 * 1000
end_time = int(time.mktime(e)) * 1000 * 1000 * 1000
#start_time = s_time
#end_time = e_time
cpu_mean_list = cpu_median_list = mem_mean_list = mem_median_list = disk_mean_list = disk_median_list = ['0.0']
##print('开始查询cpu使用率的平均数')
cpu_mean_list = self.client.query(
"select mean(usage_user) from cpu where time>=%s and time<=%s and cpu = 'cpu-total' and host != 'qqlog_xxx_xxx' ;" % (
start_time, end_time))
##print(cpu_mean_list)
### cpu_mean_list的格式 resultset({'('cpu', none)': [{'time': '2018-06-21t16:00:00z', 'mean': 1.7141865567279297}]})
cpu_mean_points = list(cpu_mean_list.get_points(measurement='cpu'))
##print(cpu_mean_points)
### cpu_mean_points的格式[{'time': '2018-06-21t16:00:00z', 'mean': 1.7141865567279297}]
cpu_mean = cpu_mean_points[0]['mean']
##print(cpu_mean)
### cpu_mean 的格式1.7141865567279297
##print('查询cpu使用率的平均数结束')
##print('开始查询cpu使用率的中位数')
cpu_median_list = self.client.query(
"select median(usage_user) from cpu where time>=%s and time<=%s and cpu = 'cpu-total' and host != 'qqlog_xxx_xxx';" % (
start_time, end_time))
##print(cpu_median_list)
#### cpu_median_list的格式为resultset({'('cpu', none)': [{'time': '2018-06-21t16:00:00z', 'median': 0.726817042581142}]})
cpu_median_points = list(cpu_median_list.get_points(measurement='cpu'))
cpu_median = cpu_median_points[0]['median']
##print(cpu_median)
##print('开始查询mem使用率的平均数')
mem_mean_list = self.client.query(
"select mean(used) /mean(total) from mem where time>=%s and time<=%s and host != 'qqlog_xxx_xxx';" % (
start_time, end_time))
print(mem_mean_list)
### mem_mean_list的格式为resultset({'('mem', none)': [{'time': '2018-06-21t16:00:00z', 'mean_mean': 0.729324184536873}]})
mem_mean_points = list(mem_mean_list.get_points(measurement='mem'))
mem_mean = mem_mean_points[0]['mean_mean']
##print(mem_mean)
##print('开始查询mem使用率的中位数')
mem_median_list = self.client.query(
"select median(used) /median(total) from mem where time>=%s and time<=%s and host != 'qqlog_xxx_xxx' ;" % (
start_time, end_time))
##print(mem_median_list)
###mem_median_list的格式为resultset({'('mem', none)': [{'time': '2018-06-21t16:00:00z', 'median_median': 0.8698493636354012}]})
mem_median_points = list(mem_median_list.get_points(measurement='mem'))
mem_median = mem_median_points[0]['median_median']
##print('开始查询disk使用率的平均数')
disk_mean_list = self.client.query(
"select mean(used) /mean(total) from disk where time>=%s and time<=%s and host != 'qqlog_xxx_xxx';" % (
start_time, end_time))
##print (disk_mean_list)
###disk_mean_list的格式为esultset({'('disk', none)': [{'time': '2018-06-21t16:00:00z', 'mean_mean': 0.31204798557786284}]})
disk_mean_points = list(disk_mean_list.get_points(measurement='disk'))
disk_mean = disk_mean_points[0]['mean_mean']
##print(disk_mean)
##print('开始查询disk使用率的中位数')
disk_median_list = self.client.query(
"select median(used) /median(total) from disk where time>=%s and time<=%s and host != 'qqlog_xxx_xxx';" % (
start_time, end_time))
##print (disk_median_list)
###disk_median_list的格式resultset({'('disk', none)': [{'time': '2018-06-21t16:00:00z', 'median_median': 0.08009824336938143}]})
disk_median_points = list(disk_median_list.get_points(measurement='disk'))
##print(disk_median_points)
disk_median = disk_median_points[0]['median_median']
##print(disk_median)
### 将计算统计的结果放到mysql中,以便汇总发送report
sql_insert = "insert into weekly_dbperformance(cpu_mean,cpu_median,mem_mean,mem_median,disk_mean,disk_median) " \
"values('%s','%s','%s','%s','%s','%s')" % \
(cpu_mean,cpu_median,mem_mean,mem_median,disk_mean,disk_median)
cursor.execute(sql_insert)
mysqldb.commit()
def change_time(self, params):
"""
时间转换
:param params:
:return:
"""
item = dateutil.parser.parse(params).astimezone(pytz.timezone('asia/shanghai'))
result = str(item).split("+")[0]
response = time.strptime(result, '%y-%m-%d %h:%m:%s')
param = time.strftime('%y-%m-%d %h:%m:%s', response)
return param
# 连接 influxdb
# influxdb_ip influxdb所在主机
# influxdb_prot influxdb端口
db = dbapi(ip='xxx.110.119.xxx', port='?????')
###查询的时间范围
### typeerror: strptime() argument 0 must be str, not <class 'datetime.datetime'>
##e_time = datetime.datetime.now()
e_time = datetime.datetime.now().strftime('%y-%m-%d')
##s_time = e_time + datetime.timedelta(-7)
s_time = (datetime.datetime.now() + datetime.timedelta(-7)).strftime('%y-%m-%d')
print('打印查询范围----时间参数如下:')
print(e_time)
print(s_time)
db.get_use_dbperformance(s_time,e_time)
#print(disk_points)
注意:此份代码的运行环境是python 3.6.8;此外还要注意下influxdb的query返回值的处理;可执行文件可以通过crontab设置定时任务,周期性抓取数据。
3.2 统计db实例的慢查询
可执行文件为count_dbslow.py
从elasticsearch中读取慢查询的数据,主要是统计符合条件的个数。
需要说明的是某产品线下的数据库慢查询放到index命名一样。本例中mysql-slow-qqorders-*,是查询mysql-slow-qqorders-开通的所有慢查询的个数,qqorders是具体的产品线代码,*是日期的模糊匹配。
#coding:utf8
import os
import time
from datetime import date
### 导入模块 timedelta ,否则date.today()+ timedelta(days = -2) 报错: attributeerror: 'datetime.date' object has no attribute 'timedelta'
from datetime import timedelta
from os import walk
###导入模块的from datetime import datetime改成import datetime;否则在day = datetime.datetime.now()报错:attributeerror: type object 'datetime.datetime' has no attribute 'datetime'
##from datetime import datetime
import datetime
from elasticsearch import elasticsearch
from elasticsearch.helpers import bulk
import db_monitor_conn
mysqldb = db_monitor_conn.db
# use cursor
cursor = mysqldb.cursor()
###数据收集前,清除之前收集的数据
##sql_delete = "delete from weekly_dbslowqty "
##cursor.execute(sql_delete)
##mysqldb.commit()
class elasticobj:
def __init__(self, index_name,index_type,ip ="es实例所在的serverip"):
'''
:param index_name: 索引名称
:param index_type: 索引类型,默认为_doc
'''
self.index_name =index_name
self.index_type = index_type
# 无用户名密码状态
#self.es = elasticsearch([ip])
#用户名密码状态
self.es = elasticsearch([ip],http_auth=('es用*户*名', 'es用*户*密*码'),port=es端口号)
#### 获取数据量
def get_slowqty_by_indexname(self,dstart,dend):
doc = {
"query": {
"bool": {
"must": [
{"exists":{"field": "query_time"}},
{"range":{
"@timestamp": {
"gte": dstart.strftime('%y-%m-%d %h:%m:%s'),
"lte": dend.strftime('%y-%m-%d %h:%m:%s'),
"format": "yyyy-mm-dd hh:mm:ss",
"time_zone": "+08:00"
}
}}
],
"must_not": [
## 排除不符合条件的server,例如 排除 xxx.xxx.xxx.xxx
{"term": {"fields.db_host": "xxx.110.119.xxx"}}
]
}
}
}
_slowqty = self.es.count(index=self.index_name, doc_type=self.index_type, body=doc)
print(_slowqty)
#### _slowqty 的返回格式是字典类型,如下{'count': 2374, '_shards': {'total': 16, 'successful': 16, 'skipped': 0, 'failed': 0}}
slowqty = _slowqty['count']
print(slowqty)
#### 将数据保存到mysql中,以便发送报表
sql_insert = "insert into weekly_dbslowqty(qindex_name,qstartdate,qenddate,slowqty) " \
"values('%s','%s','%s','%s')" % \
(self.index_name,dstart,dend,slowqty)
cursor.execute(sql_insert)
mysqldb.commit()
obj =elasticobj("mysql-slow-qqorders-*","_doc",ip ="es 所在机器的 serverip")
###时间参数
##day = datetime.datetime.now()
##start = datetime.datetime.strptime('20180628 00:00:00','%y%m%d %h:%m:%s')
##end = datetime.datetime.strptime('20180629 00:00:00','%y%m%d %h:%m:%s')
##dstart = (datetime.datetime.now() + datetime.timedelta(-2))
##dend = (datetime.datetime.now() + datetime.timedelta(-1))
today = date.today()
dstart = (date.today()+ timedelta(days = -2)).strftime('%y-%m-%d')
dend = (date.today()+ timedelta(days = -1)).strftime('%y-%m-%d')
####print(dstart)
####print(dend)
###添加.strftime('%y-%m-%d'),,否则报错typeerror: strptime() argument 1 must be str, not datetime.date
dstart = datetime.datetime.strptime(dstart,'%y-%m-%d')
dend = datetime.datetime.strptime(dend,'%y-%m-%d')
print(dstart)
print(dend)
obj.get_slowqty_by_indexname(dstart,dend)
注意:此份代码的运行环境也是python 3.6.8
3.3 发送server资源性能周报
可执行文件为dbperformance_report_weekly.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import db_monitor_conn
import os
import time
import smtp_config_dbperformance
from email.mime.text import mimetext
from email.header import header
def send_mail(mail_msg):
# 调用send_mail函数
mail_body = """
<style type="text/css">
table.gridtable {
font-family: verdana,arial,sans-serif;
font-size:11px;
color:#333333;
border-width: 1px;
border-color: #666666;
border-collapse: collapse;
}
table.gridtable th {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #666666;
background-color: #dedede;
}
table.gridtable td {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #666666;
background-color: #ffffff;
}
</style>
<!-- table goes in the document body -->
<table class="gridtable">
<tr>
<th>cpu平均数</th><th>cpu中位数据</th><th>内存平均数</th><th>内存中位数据</th>
<th>disk平均数</th><th>disk中位数</th><th>统计时间</th>
</tr>
"""
mail_body = mail_body + mail_msg + "</table>"
message = mimetext(mail_body, 'html', 'utf-8')
subject = 'db服务器性能周报[资源性能]'
message['subject'] = header(subject, 'utf-8')
smtp_config_dbperformance.send_mail(message)
return
#定义邮件体变量
mail_msg = ""
# 获取数据库连接
db = db_monitor_conn.db
print(db)
# 使用cursor()方法获取操作游标
cursor = db.cursor()
# sql 查询语句
# 备份日报
sql_dbper_report = " select round(cpu_mean,2) as cpu_mean,round(cpu_median,2) as cpu_median ,round(mem_mean *100 ,2)as mem_mean , " \
" round(mem_median *100,2) as mem_median ,round(disk_mean * 100,2) as disk_mean,round(disk_median *100,2) as disk_median,date_format(datetime_created, '%y-%m-%d') as datetime_created " \
" from weekly_dbperformance " \
" where 1=1" \
" order by datetime_created limit 1 "
try:
# 执行sql语句
cursor.execute(sql_dbper_report)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
cpu_mean = str(row[0])
cpu_median = str(row[1])
mem_mean = str(row[2])
mem_median = str(row[3])
disk_mean = str(row[4])
disk_median = str(row[5])
rdatetime = str(row[6])
# 生成邮件内容 注意邮件列数和参数的个数一直(<type 'exceptions.exception'> not all arguments converted during string formatting)
mail_msg_single = """
<tr>
<td align="center">%s</td><td>%s</td><td align="right">%s</td>
<td>%s</td><td align="right">%s</td><td align="right">%s</td>
<td align="right">%s</td>
</tr> """ % \
(cpu_mean, cpu_median, mem_mean, mem_median, disk_mean, disk_median, rdatetime)
mail_msg = mail_msg + mail_msg_single
# 发送邮件
send_mail(mail_msg)
except exception as e:
print str(exception)
print str(e)
# 关闭游标
cursor.close()
# 关闭数据库连接
db.close()
注意:此份代码的运行环境是python 2.7.5
2.4 发送db 慢查询周报
可执行文件为dbslowlog_report_weekly.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import db_monitor_conn
import os
import time
import smtp_config_dbperformance
from email.mime.text import mimetext
from email.header import header
def send_mail(mail_msg):
# 调用send_mail函数
mail_body = """
<style type="text/css">
table.gridtable {
font-family: verdana,arial,sans-serif;
font-size:11px;
color:#333333;
border-width: 1px;
border-color: #666666;
border-collapse: collapse;
}
table.gridtable th {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #666666;
background-color: #dedede;
}
table.gridtable td {
border-width: 1px;
padding: 8px;
border-style: solid;
border-color: #666666;
background-color: #ffffff;
}
</style>
<!-- table goes in the document body -->
<table class="gridtable">
<tr>
<th>统计时间开始参数</th><th>时间结束参数</th><th>db慢查询个数</th>
</tr>
"""
mail_body = mail_body + mail_msg + "</table>"
message = mimetext(mail_body, 'html', 'utf-8')
subject = 'db服务器性能周报[db慢查询]'
message['subject'] = header(subject, 'utf-8')
smtp_config_dbperformance.send_mail(message)
return
#定义邮件体变量
mail_msg = ""
# 获取数据库连接
db = db_monitor_conn.db
print(db)
# 使用cursor()方法获取操作游标
cursor = db.cursor()
# sql 查询语句
# 备份日报
sql_dbslow_report = " select distinct qstartdate,qenddate,slowqty " \
" from weekly_dbslowqty " \
" where qindex_name ='mysql-slow-qqorders-*' and qstartdate >= date_sub(curdate(),interval 8 day) and qstartdate < date_sub(curdate(),interval 1 day) " \
" order by datetime_created asc "
try:
# 执行sql语句
cursor.execute(sql_dbslow_report)
# 获取所有记录列表
results = cursor.fetchall()
for row in results:
qstartdate = str(row[0])
qenddate = str(row[1])
slowqty = str(row[2])
# 生成邮件内容 注意邮件列数和参数的个数一直(<type 'exceptions.exception'> not all arguments converted during string formatting)
mail_msg_single = """
<tr>
<td align="center">%s</td><td align="right">%s</td>
<td align="right">%s</td>
</tr> """ % \
(qstartdate, qenddate, slowqty)
mail_msg = mail_msg + mail_msg_single
# 发送邮件
send_mail(mail_msg)
except exception as e:
print str(exception)
print str(e)
# 关闭游标
cursor.close()
# 关闭数据库连接
db.close()
注意:此份代码的运行环境也是python 2.7.5
3.5 其他模块
mysql的连接模块:db_monitor_conn
相应的代码可在《通过python将监控数据由influxdb写入到mysql》一文中查看,参阅db_conn.py的编写,在此不再赘述。
短信发送的模块:smtp_config_dbperformance
请参阅前面的分享《mysql数据归档小工具推荐及优化–mysql_archiver》,github地址: 下载的代码,有发送邮件的模块smtp_config.py,在此不再赘述。
四 实现
4.1 dbserver资源报告示样
下图是通过邮件的形式发送某业务线下面db server资源使用率的邮件。
4.2 慢查询报告示样
下图是通过邮件的形式发送某业务线下面所有db 实例的一周的sql慢查询的个数。
这是个简单的demo,项目规划是随着db资源的监控指标清晰、完善,数据丰富,整合到一个dashboard上。
五 题外话–das
阿里云的das(database autonomy service)是一种基于机器学习和专家经验实现数据库自感知、自修复、自优化、自运维及自安全的云服务,帮助用户消除数据库管理的复杂性及人工操作引发的服务故障,有效保障数据库服务的稳定、安全及高效,解决方案架构 如下图。
个人认为, das 要实现的目标(自感知、自修复、自优化、自运维及自安全)是我们dba的努力的方向。