#!/usr/bin/env python
# vim: set ts=8 sw=4 sts=4 et ai:
# Copyright (C) 2011, OSSO B.V.
# Walter Doekes <walter at wjd dot nu>
#
# This little script can be used to take a sampling of queries that
# are run often and/or take a lot of time.
#
# Every time you run this, a single 'show processlist' is executed on
# the local mysqld. If any queries are seen, they're cleaned up a bit
# and printed to stdout. Run this as often as you feel comfortable
# with; if run often/long enough you'll get statistically significant
# data about which queries put the most load on your database.
#
# Example output sort:
# $ grep -v '^\[' output.txt | sed -e 's/^[^:]*: //' |
#     sed -e 's/[0-9]\{5,\}/<NUMBER>/g' | sort | uniq -c | sort -n |
#     tail -n5
#
import datetime, re, subprocess

WS_RE = re.compile(r'(\s{2,}|\t)')

def get_processlist():
    if False:
        f = open('mysql-slow-queries-sample.input', 'r')
        data = f.read()
        f.close()
    else:
        devnull = open('/dev/null', 'w')
        proc = subprocess.Popen("echo 'show full processlist;' | mysql --defaults-file=/etc/mysql/debian.cnf -E", stdout=subprocess.PIPE, stderr=devnull, shell=True)
        (data, void) = proc.communicate()
        ret = proc.wait()
        if ret != 0:
            raise Exception()
    return data

def processlist_to_dict(data):
    lines = data.strip().split('\n')
    if not lines:
        return
    line = lines.pop(0)

    while line.startswith('****'):
        line = lines.pop(0)
        in_info = False
        row = {}
        while not line.startswith('****'):
            if in_info:
                row['info'] = row['info'] + ' ' + WS_RE.sub(' ', line.strip())
            else:
                key, value = line.split(':', 1)
                key = key.strip().lower()
                row[key] = WS_RE.sub(' ', value.strip())
                if key == 'info':
                    in_info = True
            if not lines:
                break
            line = lines.pop(0)

        yield row

def relevant_rows(row_iter):
    for row in processlist_to_dict(get_processlist()):
        if row['info'] == 'NULL' or row['info'] == 'show full processlist':
            continue
        host = row['host'].split(':')[0]
        duration = row['time']
        db = row['db'].split(':')[0]
        query = row['info']
        yield {'host': host, 'duration': duration, 'db': db, 'query': query}
    
def main():
    rows = list(relevant_rows(processlist_to_dict(get_processlist())))
    if rows:
        print '[%s]' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),)
        for row in rows:
            print '%s (%s, %ss): %s' % (row['host'], row['db'], row['duration'], row['query'])


if __name__ == '__main__':
    main()
