#! /usr/bin/env bash
#
# Cleanup tasks after Zeek termination: move the node's working directory
# to a tmp dir and create a new working directory, create a crash report if
# the node crashed, wait for this node's archive-log processes to finish,
# try to archive any remaining logs (and send an email if this fails), and
# finally (if the node didn't crash) remove the tmp dir if all logs were
# successfully archived.
#
# post-terminate <type> <dir> [<crashflag>]
#
# <type> is the node's type ("manager", "worker", etc.).
# <dir> is the node's working directory.
#
# If <crashflag> is not set, then ZeekControl has stopped Zeek normally.
# If <crashflag> is "crash", then ZeekControl has determined that Zeek crashed
# and this script will return information about the crash on stdout which is
# suitable for mailing to the user. If <crashflag> is "killed", then
# ZeekControl terminated Zeek forcefully (but intentionally) by SIGKILL while
# trying to stop Zeek.

# Send an email reporting that one or more logs could not be archived.
#
# Globals read:
#   mailarchivelogfail - if exactly "0", no email is sent
#   scriptdir          - directory containing the send-mail helper
#   nodename           - node name used in the mail subject
#   postdir            - directory named in the mail body
sendfailuremail()
{
    # Failure notifications are switched off when the option is "0".
    if [ "${mailarchivelogfail}" = "0" ]; then
        return
    fi

    # The helper path is quoted so that a script directory containing
    # whitespace is not split into several words.  The message body is fed
    # to send-mail on stdin; the heredoc text must stay at column 0.
    "${scriptdir}/send-mail" "archive log failure on node $nodename" <<_EOF_
Unable to archive one or more logs in directory:
${postdir}
Check the post-terminate.out file in that directory for any error messages.
_EOF_
}

# Validate the command line: post-terminate <type> <dir> [<crashflag>]
if [ $# -lt 2 ] || [ $# -gt 3 ]; then
    # "$*" joins all arguments into a single string for the message.
    echo "post-terminate: wrong usage: $*"
    exit 1
fi

nodetype=$1
dir=$2
# The node name is the last path component of the working directory.
nodename=$(basename "$dir")

if [ ! -d "$dir" ]; then
    echo "post-terminate: directory not found: $dir"
    exit 1
fi

# Translate the optional third argument into two flags.  Any other value
# (including a missing argument) leaves both flags at 0.
crash=0
killed=0
case "${3:-}" in
    crash)  crash=1 ;;
    killed) killed=1 ;;
esac

# Load the zeekctl configuration that lives next to this script.  This
# script relies on it for at least tmpdir and scriptsdir.
scriptdir=$(dirname "$0")
. "$scriptdir/zeekctl-config.sh"

if [ -z "${tmpdir}" ]; then
    echo "post-terminate: zeekctl option tmpdir not set"
    exit 1
fi

if [ ! -d "${tmpdir}" ]; then
    mkdir "${tmpdir}"
fi

# Two renderings of "now": one for the tmp directory name, and one in the
# YY-MM-DD_HH.MM.SS form that is later used as the end time of each log.
tmpdirtimestamp=$(date +%Y-%m-%d-%H-%M-%S)
postterminatetime=$(date +%y-%m-%d_%H.%M.%S)

# The PID of this script ($$) is part of the directory name.
postdir=${tmpdir}/post-terminate-$nodetype-$tmpdirtimestamp-$$

if [ "$crash" -eq 1 ]; then
    postdir=$postdir-crash
fi

# Move the old working directory out of the way; give up if that fails.
if ! mv "$dir" "$postdir"; then
    exit 1
fi

# Recreate an empty working directory for the node.
mkdir "$dir"

# Everything below operates on relative paths inside $postdir, so a failed
# cd must stop the script instead of continuing in the wrong directory.
if ! cd "$postdir"; then
    echo "post-terminate: cannot change to directory: $postdir"
    exit 1
fi

# Carry the .state directory over into the new working directory.
if [ -d .state ]; then
    mv .state "$dir"
fi

if [ "$crash" -eq 1 ]; then
    # Output the crash report and save it to disk in case the user doesn't
    # receive the email.
    "${scriptsdir}"/crash-diag -c "$postdir" > .crash-diag.out
    cat .crash-diag.out
fi

# The .startup file is read later to obtain a default start time for logs;
# without it there is nothing more this script can do.
if [ ! -f .startup ]; then
    echo "post-terminate: file not found: .startup"
    exit 1
fi

# Block until no archive-log PID files (.archive-log.*.tmp) remain in the
# current directory.
#
# A PID file is removed by this function once "ps -p" reports that no process
# with the recorded PID exists.  Empty PID files are left alone and checked
# again on the next pass.
#
# Globals written:
#   pidfiles - newline-separated list of PID files found by the last scan
#              (empty when the function returns)
wait_for_archivelog()
{
    local pfile pid

    # Gather list of all archive-log PID files.
    pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')

    # Wait for any archive-log processes to finish, so that we can either
    # launch new ones or remove this directory.
    while [ -n "$pidfiles" ]; do
        # Read the list line by line so that file names containing spaces
        # are handled as a single path.
        while IFS= read -r pfile; do
            # If PID file is empty, then check it again later.
            if [ -s "$pfile" ]; then
                # Take the first whitespace-delimited token as the PID.
                pid=
                read -r pid _ < "$pfile"

                # No such process exists, so remove PID file.
                if ! ps -p "$pid" > /dev/null 2>&1; then
                    rm -f -- "$pfile"
                fi
            fi
        done <<< "$pidfiles"

        sleep 1

        pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')
    done
}

# Try to extract a timestamp from a log file name (given without its ".log"
# extension) and adjust the base name accordingly.
#
# Arguments:
#   $1 - file name to inspect
#
# Globals written (only when a timestamp suffix is recognized):
#   basename - the file name with the timestamp suffix removed
#   strt     - the timestamp in YY-MM-DD_HH.MM.SS form
#
# If the name contains no recognized timestamp, both globals are left
# untouched, i.e. the caller's whole name remains the base name.
parse_filename()
{
    local filename=$1
    local ts

    # Suffix ".YYYY-MM-DD-HH-MM-SS" (this format is specified in
    # Log::default_rotation_date_format and is used by the ascii writer
    # script to rename a log immediately after Zeek rotates it).
    local re_long='^(.*)[.]([1-2][0-9][0-9][0-9]-[0-1][0-9]-[0-3][0-9]-[0-2][0-9]-[0-5][0-9]-[0-5][0-9])$'

    # Suffix "-YY-MM-DD_HH.MM.SS" (this format is hard-coded in Zeek, and is
    # the format used by Zeek when a log is rotated).
    local re_short='^(.*)-([0-9][0-9]-[0-1][0-9]-[0-3][0-9]_[0-2][0-9][.][0-5][0-9][.][0-5][0-9])$'

    # Matching is done inside the shell so that whitespace or glob characters
    # in the name are never subject to word splitting or pattern expansion.
    if [[ $filename =~ $re_long ]]; then
        basename=${BASH_REMATCH[1]}
        ts=${BASH_REMATCH[2]}
        # Convert time from YYYY-MM-DD-HH-MM-SS to YY-MM-DD_HH.MM.SS
        strt="${ts:2:2}-${ts:5:2}-${ts:8:2}_${ts:11:2}.${ts:14:2}.${ts:17:2}"
    elif [[ $filename =~ $re_short ]]; then
        basename=${BASH_REMATCH[1]}
        strt=${BASH_REMATCH[2]}
    fi
}

# Attempt to archive every *.log file in the current directory by invoking
# ${scriptsdir}/archive-log once per file.  Although stdout.log/stderr.log
# are not really Zeek logs, we try to archive them anyway, because they might
# contain useful info, especially if Zeek crashes.
#
# Globals read:
#   scriptsdir         - directory containing the archive-log helper
#   postterminatetime  - used as the end time of every log
# Globals written:
#   failed             - set to 1 if any archive-log invocation fails
#   basename, strt     - per-log scratch values shared with parse_filename
#   ZEEK_ARG_LOG_SUFFIX (exported) - contents of .log_suffix, if present
archivelogs()
{
    local startuptime logname end

    # The last line of .startup is the default start time of a log.
    startuptime=$(tail -n 1 .startup)

    # If there's a .log_suffix file, set ZEEK_ARG_LOG_SUFFIX so that
    # archive-log and make-archive-name know about it.
    if [ -f .log_suffix ]; then
        ZEEK_ARG_LOG_SUFFIX=$(cat .log_suffix)
        export ZEEK_ARG_LOG_SUFFIX
    fi

    for logname in *.log; do
        # When no file matches, the glob stays as the literal "*.log"; skip
        # it rather than reporting a bogus archive failure.
        [ -e "$logname" ] || continue

        # Get the base name (such as "conn") by removing the file extension.
        basename=${logname%.log}

        # Start time of log.
        strt=

        # If the filename contains a timestamp (i.e., a log that was rotated
        # but not archived), then try to get the start time from the log
        # filename.  If a timestamp is found, then the base name is also
        # updated to not include the timestamp.
        parse_filename "$basename"

        # Assume the end time of the log is the time this script is run,
        # because Zeek stopped running before this script started.
        end=$postterminatetime

        if [ -z "$strt" ]; then
            # We couldn't extract the start time from the log filename,
            # likely because it isn't there (or possibly it's in an
            # unrecognized format).
            strt=$startuptime

            if [ -f ".rotated.$basename" ]; then
                # The time obtained here is always >= the startup time of
                # Zeek, so it's usually a more accurate guess of this log's
                # start time.
                strt=$(cat ".rotated.$basename")

                # However, if archive-log archived a log with the same base
                # name as this log, and if it did so after this script
                # started, then the start time that we computed will be later
                # than the end time.  If so, then reset the start time to
                # equal the end time.
                if expr "$strt" ">" "$end" >/dev/null; then
                    strt=$end
                fi
            fi
        fi

        # Note: here we assume the log writer type is "ascii".
        # Arguments are quoted so each one reaches archive-log as exactly
        # one word.
        if ! "${scriptsdir}"/archive-log "$logname" "$basename" "$strt" "$end" 1 ascii; then
            failed=1
        fi
    done
}

# Run the archiving phase: wait for running archive-log processes, archive
# the remaining logs, send a failure email if needed, and remove $postdir
# when everything was archived and Zeek did not crash.
#
# Globals read:
#   crash   - 1 if Zeek crashed (the directory is then always kept)
#   postdir - directory to remove after successful archiving
# Globals written:
#   failed  - reset to 0 here; set to non-zero by archivelogs on failure
postterminate()
{
    # Wait until all running archive-log processes have terminated.
    wait_for_archivelog

    failed=0

    # Archive all logs.
    archivelogs

    # If one or more logs failed to be archived, then try to send an email.
    if [ "$failed" -ne 0 ]; then
        sendfailuremail
    fi

    # If Zeek crashed, then we don't need to do anything else, because we
    # don't want to remove the directory.  "return" is used instead of "exit"
    # so that the function also behaves when called in the foreground; when
    # run as a background job the outcome is the same.
    if [ "$crash" -eq 1 ]; then
        return 0
    fi

    # If no archive-log processes started from this script failed, then
    # remove the directory.  If the directory is not removed, then an email
    # was sent to notify the user to look in this directory for logs.
    if [ "$failed" -eq 0 ]; then
        # ":?" aborts instead of running "rm -rf" on an empty path.
        rm -rf -- "${postdir:?}"
    fi
}

# Execute the remaining part of this script in the background so that zeekctl
# doesn't need to wait for it to finish.  Stdout/stderr is redirected to a
# file to capture error messages.
postterminate >post-terminate.out 2>&1 &

# In some situations (such as testing), we may want the zeekctl stop command to
# wait for the post-terminate script to finish.
if [ "${stopwait}" = "1" ]; then
    wait
fi