Prometheus 提供了一些常用的 exporter,但是在进程监控上,我们需要一些更加详细的监控指标,例如程序的存活状态、内存、CPU、句柄数、线程数等等。
下面是一个示例,演示如何在机器上监控指定进程,并通过 HTTP 端点把指标暴露给 Prometheus 抓取。
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Prometheus exporter that publishes per-process metrics for game servers.

Every 60 seconds the script reads the pid files found under the channel
directories (``/data/<project>_*``), samples each process with psutil and
publishes the values as Prometheus gauges on HTTP port 8111.

The code is written to run unchanged on both Python 2 and Python 3.
"""
import fcntl
import glob
import logging
import os
import socket
import struct
import time

import psutil
import requests
from prometheus_client import Gauge, start_http_server

# NOTE(review): credentials are hard-coded here and only "project" is read by
# this script. Consider loading secrets from the environment or a config file
# instead of committing them to source.
DataBase = {
    "user": "prometheus",
    "password": "AgTKZu4RkOZvqKZA",
    "project": "ht",
    "host": "127.0.0.1",
}

# Metric name suffixes; stripping the leading "_" yields the key used in the
# per-process metrics dict built by Action.step_2.
monitor = ['_cpu', '_memory', '_threads', '_fs', '_status', '_ctime']

logger = logging.getLogger("LOG FORMAT")
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
logger.addHandler(ch)

_BYTES_PER_MB = 1024 * 1024


def _empty_metrics():
    """Return the all-zero metrics dict reported for a process that is down."""
    return {'cpu': 0, 'memory': 0, 'threads': 0, 'status': 0, 'fs': 0, 'ctime': 0}


def _read_process_metrics(pid):
    """Sample one process and return its metrics dict.

    :param pid: pid as read from a pid file (string or int); 0 means "no pid".
    :return: a metrics dict; all zeros when the pid is missing, malformed,
             or the process is not running / cannot be inspected.
    """
    try:
        pid = int(pid)
        if pid <= 0:
            return _empty_metrics()
        p = psutil.Process(pid)
        if not p.is_running():
            return _empty_metrics()
        return {
            'status': 1,
            'cpu': p.cpu_percent(interval=0.1),
            # Floor division keeps whole megabytes on both Python 2 and 3.
            'memory': p.memory_info().rss // _BYTES_PER_MB,
            'threads': p.num_threads(),
            'fs': p.num_fds(),
            'ctime': p.create_time(),
        }
    except (ValueError, TypeError, psutil.Error) as e:
        # Malformed pid, vanished process, or insufficient permissions.
        logger.exception(e)
        return _empty_metrics()


class GameServer(object):
    """Host-level information about the machine running the game servers."""

    def __init__(self):
        pass

    @property
    def channel(self):
        """
        :return: list of channel directories found under /data/. Each channel
                 directory is expected to hold its own login/game servers.
        """
        return glob.glob("/data/%s_*" % DataBase["project"])

    @property
    def mysql(self):
        """
        :return: resident memory of the MySQL process in whole megabytes,
                 or 0 when no pid file exists or the process is not running.
        """
        pid_files = glob.glob("/data/mysql_data/data/*.pid")
        if not pid_files:
            return 0
        try:
            with open(pid_files[0]) as f:
                pid = int(f.read().strip())
            p = psutil.Process(pid)
            if p.is_running():
                return p.memory_info().rss // _BYTES_PER_MB
        except (IOError, OSError, ValueError, psutil.Error) as e:
            logger.exception(e)
        return 0

    @property
    def local_ip(self):
        """
        :return: IPv4 address of interface eth0 as a dotted-quad string.
        :raises IOError/OSError: if the ioctl fails (e.g. eth0 is absent).
        """
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            packed = fcntl.ioctl(
                s.fileno(),
                0x8915,  # SIOCGIFADDR
                # Bytes literal so struct.pack works on Python 3 as well.
                struct.pack('256s', b'eth0'[:15]),
            )
        finally:
            # The socket is only needed for the ioctl; always release it.
            s.close()
        return socket.inet_ntoa(packed[20:24])

    @property
    def wan_ip(self):
        """
        :return: public IPv4 address reported by the Tencent Cloud metadata
                 service, or '0.0.0.0' when the lookup fails.
        """
        try:
            r = requests.get('http://metadata.tencentyun.com/meta-data/public-ipv4', timeout=5)
            # Do not treat an HTTP error page as an IP address.
            r.raise_for_status()
            # .text (not .content) so the label value is text on Python 3.
            data = r.text.strip()
        except requests.RequestException as e:
            logger.exception(e)
            data = '0.0.0.0'
        return data


class Action:
    """Collection steps: discover pids, sample processes, build gauges."""

    @staticmethod
    def step_1(channels):
        """
        Read the pid file of every server directory inside each channel.

        A server directory is any directory matching ``*_*`` inside a channel;
        its pid file is ``<server>/run/<server without underscores>.pid``.

        :param channels: iterable of channel directory paths.
        :return: dict mapping server directory name to its pid (string as read
                 from the file), or to 0 when the pid file is missing or
                 unreadable.
        """
        pids = dict()
        for channel in channels:
            # Build full paths instead of os.chdir() so the process-wide
            # working directory is never changed as a side effect.
            for server_dir in glob.glob(os.path.join(channel, "*_*")):
                if not os.path.isdir(server_dir):
                    continue
                server = os.path.basename(server_dir)
                pid_file = os.path.join(server_dir, "run", server.replace("_", "") + ".pid")
                if not os.path.exists(pid_file):
                    pids[server] = 0
                    continue
                try:
                    with open(pid_file, 'r') as f:
                        pids[server] = f.readline().strip()
                except (IOError, OSError) as e:
                    logger.exception(e)
                    pids[server] = 0
        return pids

    @staticmethod
    def step_2(m_data=None):
        """
        Sample the key metrics of every server process.

        :param m_data: dict of server name -> pid as produced by step_1.
        :return: dict of server name -> metrics dict with the keys cpu,
                 memory, threads, status, fs and ctime. Servers whose process
                 is missing, dead or unreadable get all-zero metrics, so a
                 stopped server is reported with status 0 rather than keeping
                 its previous values.
        """
        monitor_data = dict()
        for server, pid in (m_data or {}).items():
            monitor_data[server] = _read_process_metrics(pid)
        return monitor_data

    @staticmethod
    def create_series(role=None):
        """
        Create one Prometheus Gauge per entry in ``monitor`` for a role.

        :param role: role name (the part of a server directory name before
                     the first underscore).
        :return: dict mapping ``role + suffix`` to the Gauge instance.
        """
        func = dict()
        prefix = role.split("_")[0]
        for item in monitor:
            func[role + item] = Gauge(
                prefix + item, role + item,
                ['role', 'serverid', 'local_ip', 'wan_ip'])
        return func


def main():
    """Start the HTTP endpoint and refresh all gauges once a minute."""
    gs = GameServer()
    wan_ip = gs.wan_ip
    local_ip = gs.local_ip
    channels = gs.channel
    start_http_server(8111)

    series = dict()
    while True:
        try:
            s1 = Action.step_1(channels=channels)
            s2 = Action.step_2(m_data=s1)
            for role, data in s2.items():
                parts = role.split("_")
                key_role, key_id = parts[0], parts[1]
                # Create gauges lazily so roles that appear after startup
                # are exported too instead of failing on a missing series.
                if key_role + monitor[0] not in series:
                    series.update(Action.create_series(key_role))
                for m in monitor:
                    series[key_role + m].labels(
                        role=key_role, serverid=key_id,
                        local_ip=local_ip, wan_ip=wan_ip,
                    ).set(data.get(m.strip("_")))
        except Exception as e:
            # Top-level boundary: keep the exporter alive and try again on
            # the next cycle.
            logger.exception(e)
        # Sleep outside the try block so a failing cycle cannot turn the
        # loop into a busy retry; data is refreshed every 60 seconds.
        time.sleep(60)


if __name__ == '__main__':
    main()
启动程序后,使用 ip+端口(示例中为 8111)访问 web 页面,就可以得到你想要的数据了。
在 Prometheus 页面上,我们用采集到的数据检查一下。
可以看到,我们自己的exporter也采集到了数据
这样我们就能在 Grafana 里进行展现了。
好了,自定义的 exporter 就完成了,下次我们说说 Prometheus 如何和 Consul 搭配来完成我们的自动发现。