#!/bin/sh
#
############################################################################
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You may have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
# USA.
#
# An on-line copy of the GNU General Public License can be found
# http://www.fsf.org/copyleft/gpl.html.
############################################################################
#
# log_processlist.sh (C) 2011 Simon J Mudd <sjmudd@pobox.com>
#
# This script maintains a number of servers statistics in /var/log/mysql_pl.
#
# It is intended to be run every minute from cron, and takes no parameters
# as indicated below:
#
# * * * * * /usr/local/bin/log_processlist.sh
#
# It must run as root and will refuse to run if you try as a different
# user.
#
# Access to the db instances is currently exclusively done by looking for
# and then connecting to instances using defaults files matching the
# the following pattern: /root/.my*.cnf.
#
# The script has been run successfully on MySQL versions 5.0, 5.1 and
# 5.5.
#
# Running this script minutely from cron gives you a minutely overview of
# what the server and instances are doing and can be invaluable for
# debugging problems. Output includes the following information:
# - ps output for the server
# - SHOW FULL PROCESSLIST
# - SHOW SLAVE STATUS ( if the instance is a slave )
# - SHOW ENGINE INNODB STATUS (if the instance uses InnoDB)
#
# Files are dropped into /var/log/mysql_pl/XXX where XXX is (Mon,Tue,Wed,
# ...).  Each file is named according to the time of day and instance
# 'name', e.g. 15_30xx.
# names:
#
# Config file-> Suffix
# .my.cnf    -> empty
# .my-xx.cnf -> xx
# .my-aa.cnf -> aa
#
# Exit codes:
# 0 - all is fine.
# 1 - not started by root, refusing to run.
# 2 - there is no mysql command line client to be found.
# 3 - the required instance is not running (cannot be found in the process list).
# 4 - SQL failure
#
# Suggestions for improvement are always welcome and should be directed
# to me at the above email.

[ -n "$DEBUG" ] && set -x
myname=$(basename $0)
lockfile=/tmp/$myname.lock
myuser=$(id -un)
myhome=$(getent passwd $myuser | cut -d: -f6)
myhostname=$(hostname -s)

# reference - get the day time in one go to avoid timing issues.
reference=$(date +%a_%H_%M)    # Mon_11_40  for Monday at 11:40 (unless locale is broken)
day_name=$(echo $reference | cut -d_ -f1)
current_time=$(echo $reference | cut -d_ -f2-3)

# We will drop stuff in $logdir/$day_name, and name it $logdir/$day_name/$current_time$suffix (with no dot)
logdir=/var/log/mysql_pl

# for cleaning up old format files.
warning_time=15_00

msg_info () {
	echo "$(date +'%b %d %H:%M:%S') $myhostname $myname[$$]: $*"
}

msg_verbose () {
	[ -n "$verbose" ] && msg_info "$*"
}

# add the uptime info
show_system_info () {
	local log_file_base=$1
	local log_file=$log_file_base.unix

	echo 'UPTIME:'         >  $log_file 2>&1
	uptime                 >> $log_file 2>&1
	echo 'UNIX_PROCESSES:' >> $log_file 2>&1
	ps auwwwx              >> $log_file 2>&1
	echo 'DISK SPACE'      >> $log_file 2>&1
	df -Th                 >> $log_file 2>&1
	echo 'MEMORY'          >> $log_file 2>&1
	free -m                >> $log_file 2>&1
}

try_sql () {
	local my_cnf=$1
	shift

	mysql --defaults-file=$my_cnf --silent --execute="$*" > /dev/null 2>&1
	return $?
}

# Run a mysql command and APPEND output to logfile
do_sql () {
	local logfile=$1
	local my_cnf=$2
	shift 2

	echo "SQL: $*" >> $logfile 2>&1
	mysql --defaults-file=$my_cnf --silent --execute="$*" >> $logfile 2>&1
	local rc=$?
	if [ "$rc" -ne 0 ]; then
		echo "$myname: MySQL generates error $rc using defaults-file $my_cnf when executing: $*"
		exit 4
	fi
}

# We now get the processlist and also the replication delay
get_instance_info () {
	local log_file_base=$1
	local my_cnf=$2

	# determine the suffix (which may be empty)
	local suffix=$(echo "$my_cnf" | sed -e 's,^.*/\.my,,' -e 's/\.cnf$//' -e 's/^-//')
	local log_file=${log_file_base}${suffix}

	# Cleanup output file. We delete it and if NO SQL access works it will stay deleted
	test -f $log_file && rm -f $log_file

	# Look at the config file and if it's trying to talk to a socket
	# see if the instance is up. If not don't bother to try to even
	# connect to it. Note: we can't check for the full instance path as it might not match:
	# The path MAY be relative or absolute and may not include
	# the implicit $datadir prefix.  Don't bother to try and
	# be clever.
	if grep -q socket= $my_cnf; then
		config_socket=$(grep socket $my_cnf | sed -e "s/#.*//" -e 's/^ *socket *= *//' | tail -1)
		# originally we tried to find a mysqld in the processlist
		# which matches out $config_socket, but it is much more
		# complicated than that. For example, we might be talking
		# with $config_socket=/path/to/data/mysql.sock
		# but the server runs with 
		#  --datadir=/path/to/data --socket=mysql.sock
		# and we will never find that out using grep.
		#
		# We try differently now:
		#   We attempt a connect, and if that works, all is fine.
		if [ -n "$config_socket" ]; then
			if ! mysql --defaults-file=$my_cnf -e 'SELECT 1' > /dev/null 2>&1; then
				return
			fi
		fi
	fi

	# Fetch processlist - use normal output so it's easier to filter the command
	# list from the shell when looking for long running queries.
	if try_sql $my_cnf 'SHOW FULL PROCESSLIST'; then
		msg_verbose "my_cnf: $my_cnf - Adding processlist info to $log_file"
		do_sql $log_file $my_cnf 'SHOW FULL PROCESSLIST'
	elif [ $current_time = $warning_time ]; then
		# To avoid spamming the world we only complain at 3 pm (arbitrary time)
		# and try to give a bit more info.

		cat <<-EOF
		WARNING: $myname

		MySQL generates error $rc using defaults-file $my_cnf when executing: SHOW FULL PROCESSLIST

		This is generally because the defaults file is missing an entry:
		socket=/path/to/data/mysql.sock

		Please check the config or remove the $my_cnf file.
		This warning will be repeated again at $(echo "$warning_time" | sed -e 's/_/:/').
		EOF
	fi

	# Check the replication delay.
	# - As we might not have the right grants we should first see if the command
	#   works and only if it works run it again and capture the output, this time
	#   to the end of the original config file.
	if try_sql $my_cnf 'SHOW SLAVE STATUS\G'; then
		msg_verbose "my_cnf: $my_cnf - Adding replication info to $log_file"
		do_sql $log_file $my_cnf 'SHOW SLAVE STATUS\G'
	fi

	# get the innodb status out but only if the engine is available
	HAVE_INNODB=$(mysql --defaults-file=$my_cnf -BN -e "SHOW VARIABLES LIKE 'have_innodb'" | awk '{ print $2 }')

	# Remove any old .innodb files for this instance.
	test -f $log_file.innodb && rm -f $log_file.innodb

	case $HAVE_INNODB in
	YES)	# put this in a try block as some remote connections may not have SUPER privilege to run this.
		if try_sql $my_cnf 'SHOW ENGINE INNODB STATUS\G'; then
			(
				log_file=$log_file.innodb
				msg_verbose "my_cnf: $my_cnf - Adding innodb info to $log_file"
				do_sql $log_file $my_cnf 'SHOW ENGINE INNODB STATUS\G'
			)
		fi
		;;
	*)	: ;; # do nothing
	esac
}

sanity_check () {
	local dir

	# This script currently assumes we have root privileges so check and give up if not.
	[ $myuser = root ] || {
		echo "$myname: This script is running as user $myuser and expects to be run as root."
		echo "- Please change the user or fix the script!"
		exit 1
	}
	msg_verbose "Checked that we are running as root"

	# If we are already running then exit
	cleanup_lockfile=0
	if [ -e $lockfile ]; then
		# Note: there's a race condition here between checking for the file, reading it and trying to get the pid.
		# It is possible that the file disappears in between so we have to take that into account.
		other_pid=$(sed -e 's/[ \t]//g' $lockfile 2>/dev/null)
		if [ $? = 0 ]; then
			if ps -p $other_pid >/dev/null; then
				msg_verbose "Another copy of this script appears to be running already with pid $other_pid. Exiting."
				exit 0
			fi
		fi
		# if the $? fails it means that the file has gone, so it should be ok to continue.
	fi

	# Keep the sysadmins happy.
	# - If no mysql binary can be found just exit.
	# mysqld on MySQL Community and Enterprise RPMS are normally in /usr/sbin
	# mysqld on RedHat system rpms are normally in /usr/libexec
	# mysqld on InfoBright rpms are normally in /usr/local/infobright/bin 
	mysqld=
	mysqld_dirs='/usr/sbin /usr/libexec /usr/local/infobright/bin'
	for dir in $mysqld_dirs; do
		test -x $dir/mysqld && mysqld=$dir/mysqld
	done
	if [ -z "$mysqld" ]; then
		msg_verbose "Can not find mysqld in $mysqld_paths. Exiting"
		exit 2
	else
		msg_verbose "Found mysqld: $mysqld"
	fi

	# - If no mysql binary is running assume that the server is down and don't
	#   bother to try and query it. This avoids cron errors which upset us all.
	local count=$(ps -ef | grep $mysqld | grep -v grep | wc -l)
	if [ $count = 0 ]; then
		msg_verbose "No instances of mysqld have been found"
		exit 3
	fi
	msg_verbose "Checked that we can see at least one instance of mysqld running"

	cleanup_lockfile=1
	echo $$ > $lockfile
	msg_verbose "Checked that no other copy of this script is running"
}

# clean up the lockfile if asked to do this.
cleanup () {
	local rc=$?

	# This file is used to check for instance info.
	if [ -n "$instance_file" ]; then
		msg_verbose "Cleaning up instance_file: $instance_file"
		rm -f $instance_file
	fi

	if [ "$cleanup_lockfile" = 1 ]; then
		msg_verbose "Cleaning up lock file: $lockfile"
		[ -e $lockfile ] && rm -f $lockfile
	fi

	exit $rc
}

create_target_dir () {
	# check that the required target dir exists and if not, make it.
	# also drop a sensible README in there.
	local logdir=$1
	local day_name=$2
	
	if [ ! -d $logdir ]; then
		msg_verbose "creating missing directory $logdir"
		mkdir $logdir || exit 1
		cat <<-EOF > $logdir/README
		README
		======

		This directory contains the output of command $command run from
		script $myname which SHOULD be running every minute. Files are created
		in the subdirectories Mon, Tue, Wed, ... with the names according to
		the time in format hh_mm and a possible suffix according to the
		instance name.

		The script reads the instance configuration from ~/.my.cnf or ~/.my-XXX.cnf
		and generates output for each instance.  ~/.my-dba.cnf files are ignored.
		EOF
	fi

	# Create the directory for each day if needed.
	if [ ! -d $logdir/$day_name ]; then
		msg_verbose "Creating missing directory $logdir/$day_name"
		mkdir $logdir/$day_name || exit 1
	fi
}

# Sometimes we may have more than one .my-XXX file pointing to the same
# instance. This checks the instance by asking the @@hostname, @@datadir.
# if this has been seen before then we return 0, otherise 1. The instance
# information is stored in $instance_file which is cleaned up when the
# script exits.

seen_instance () {
	local instance_file=$1
	local my_cnf=$2
	local rc=1

	# Don't ask about this stupid CHAR(58). It's to avoid quotes inside try_sql
	# char(58) is ':'
	if try_sql $my_cnf 'SELECT CONCAT(@@hostname,CHAR(58),@@datadir)'; then
		instance_info=$(mysql --defaults-file=$my_cnf -BNe 'SELECT CONCAT(@@hostname,CHAR(58),@@datadir)')
		grep -q $instance_info $instance_file
		rc=$?
		# We have not seen this instance before so add it to the list.
		if  [ $rc != 0 ] ; then
			msg_verbose "my_cnf: $my_cnf - NOT SEEN instance info $instance_info before"
			echo $instance_info >> $instance_file
		else
			msg_verbose "my_cnf: $my_cnf - HAVE SEEN instance info: $instance_info before, so ignoring"
		fi
	else
		msg_verbose "Unable to check for instance_info using defaults-file: $my_cnf"
	fi

	return $rc
}

verbose=
while getopts v flag; do
	case $flag in
	v)	verbose=1;;
	*)	msg_info "Usage: $myname [-v]"
	esac
done

shift $(($OPTIND - 1))

msg_verbose "Running in verbose mode"

# Just in case we're upgrading mysql or the client has disappeared check to see
# if it's available. If it disappears then exit silently.
which mysql >/dev/null 2>&1 || exit 0

trap cleanup 0 1 2 3 9 15

sanity_check

# Create any missing directories if needed.
create_target_dir $logdir $day_name

log_file_base=$logdir/$day_name/$current_time
old_log_file_base=$logdir/${day_name}_${current_time}
instance_file=$(mktemp -t $myname.XXXXXXXX)

# Get some more unix process info which is often handy.
# We now do this BEFORE getting the db output. If the db locks up we still have something to show.
show_system_info $log_file_base

# Iterate over all instances found (EXCEPT .my-dba.cnf) and generate the output.
for my_cnf in $(cd $myhome && ls -a | egrep '^\.my(|-[0-9a-z-]*)\.cnf$' | grep -v '^\.my-dba\.cnf$')
do
        # the text LOG_PROCESSLIST: NO will stop us looking at this config file
	if ! grep -iq 'LOG_PROCESSLIST:[[:space:]]*NO' $my_cnf; then
		if ! seen_instance $instance_file $myhome/$my_cnf; then
			get_instance_info $log_file_base $myhome/$my_cnf
		fi
	fi
done
