zeek/auxil/zeekctl/bin/post-terminate
Patrick Kelley 8fd444092b initial
2025-05-07 15:35:15 -04:00

260 lines
8.1 KiB
Bash
Executable File

#! /usr/bin/env bash
#
# Cleanup tasks after Zeek termination: move the node's working directory
# to a tmp dir and create a new working directory, create a crash report if
# the node crashed, wait for this node's archive-log processes to finish,
# try to archive any remaining logs (and send an email if this fails), and
# finally (if the node didn't crash) remove the tmp dir if all logs were
# successfully archived.
#
# post-terminate <type> <dir> [<crashflag>]
#
# <type> is the node's type ("manager", "worker", etc.).
# <dir> is the node's working directory.
#
# If <crashflag> is not set, then ZeekControl has stopped Zeek normally.
# If <crashflag> is "crash", then ZeekControl has determined that Zeek crashed
# and this script will return information about the crash on stdout which is
# suitable for mailing to the user. If <crashflag> is "killed", then
# ZeekControl terminated Zeek forcefully (but intentionally) by SIGKILL while
# trying to stop Zeek.
# Notify the user by email that one or more logs could not be archived.
#
# Does nothing when mailarchivelogfail is "0".  Otherwise pipes a short
# message naming the post-terminate directory to the send-mail helper that
# lives next to this script.
#
# Reads the variables: mailarchivelogfail, scriptdir, nodename, postdir.
sendfailuremail()
{
    if [ "${mailarchivelogfail}" = "0" ]; then
        return
    fi

    # The helper path is quoted so that an install location containing
    # whitespace is still treated as a single command word.
    "$scriptdir"/send-mail "archive log failure on node $nodename" <<_EOF_
Unable to archive one or more logs in directory:
${postdir}
Check the post-terminate.out file in that directory for any error messages.
_EOF_
}
# Validate the command line: post-terminate <type> <dir> [<crashflag>]
if [ $# -lt 2 ] || [ $# -gt 3 ]; then
    echo "post-terminate: wrong usage: $*"
    exit 1
fi

nodetype=$1
dir=$2
# The node name is the last path component of the working directory.
nodename=$(basename "$dir")

if [ ! -d "$dir" ]; then
    echo "post-terminate: directory not found: $dir"
    exit 1
fi

# Translate the optional third argument into two flags.
crash=0
killed=0
if [ "$3" = "crash" ]; then
    crash=1
elif [ "$3" = "killed" ]; then
    killed=1
fi

# Source the zeekctl configuration that lives next to this script.  The
# code below expects it to provide at least tmpdir and scriptsdir.
scriptdir=$(dirname "$0")
. "$scriptdir"/zeekctl-config.sh

if [ -z "${tmpdir}" ]; then
    echo "post-terminate: zeekctl option tmpdir not set"
    exit 1
fi

if [ ! -d "${tmpdir}" ]; then
    mkdir "${tmpdir}"
fi

# Two renderings of the current time: one for the directory name, and one
# in the YY-MM-DD_HH.MM.SS form used later as the end time of each log.
tmpdirtimestamp=$(date +%Y-%m-%d-%H-%M-%S)
postterminatetime=$(date +%y-%m-%d_%H.%M.%S)

# The PID ($$) keeps the directory name unique when several nodes are
# processed within the same second.
postdir=${tmpdir}/post-terminate-$nodetype-$tmpdirtimestamp-$$
if [ $crash -eq 1 ]; then
    postdir=$postdir-crash
fi

# Move the old working directory out of the way and create a fresh one.
if ! mv "$dir" "$postdir"; then
    exit 1
fi
mkdir "$dir"

# Everything from here on uses paths relative to the post-terminate
# directory, so continuing after a failed cd would operate on whatever
# directory the script happened to be started from.
if ! cd "$postdir"; then
    echo "post-terminate: cannot change to directory: $postdir"
    exit 1
fi

# Carry the .state directory over into the new working directory.
if [ -d .state ]; then
    mv .state "$dir"
fi

if [ $crash -eq 1 ]; then
    # Output the crash report and save it to disk in case the user doesn't
    # receive the email.
    "${scriptsdir}"/crash-diag -c "$postdir" > .crash-diag.out
    cat .crash-diag.out
fi

# The .startup file is needed later to determine the start time of logs.
if [ ! -f .startup ]; then
    echo "post-terminate: file not found: .startup"
    exit 1
fi
# Block until no archive-log PID files (.archive-log.*.tmp) remain in the
# current directory.
#
# Once per second, every non-empty PID file is checked: if no process with
# the recorded PID exists, the file is removed.  Empty PID files are left
# alone and re-examined on the next pass.  Returns as soon as a scan finds
# no PID files at all.
wait_for_archivelog()
{
    local pidfiles pfile pid

    # Gather list of all archive-log PID files (one path per line).
    pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')

    # Wait for any archive-log processes to finish, so that the caller can
    # either launch new ones or remove this directory.
    while [ -n "$pidfiles" ]; do
        # Read the list line by line instead of word-splitting it, so that
        # a path containing whitespace or glob characters stays intact.
        while IFS= read -r pfile; do
            # If PID file is empty, then check it again later.
            if [ -s "$pfile" ]; then
                pid=$(cat "$pfile")
                # Remove the PID file when no such process exists.
                if ! ps -p "$pid" > /dev/null 2>&1; then
                    rm -f "$pfile"
                fi
            fi
        done <<< "$pidfiles"

        sleep 1
        pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')
    done
}
# Try to extract a timestamp from a log file name (given without its
# ".log" extension) and split it from the base name.
#
# Argument: $1 - file name to examine.
#
# On a match, two variables of the caller are updated:
#   basename - the name with the timestamp suffix removed
#   strt     - the timestamp in YY-MM-DD_HH.MM.SS form
# If the name carries no recognized timestamp, neither variable is touched,
# i.e. the whole name is assumed to be the base name.
#
# The matching is done with bash regular expressions on the quoted name, so
# whitespace and glob characters in the name are treated literally.
parse_filename()
{
    local name=$1

    # Suffix ".YYYY-MM-DD-HH-MM-SS" (this format is specified in
    # Log::default_rotation_date_format and is used by the ascii writer
    # script to rename a log immediately after Zeek rotates it).  Only the
    # last two digits of the year are captured.
    local date_re='^(.*)[.][1-2][0-9]([0-9][0-9])-([0-1][0-9])-([0-3][0-9])-([0-2][0-9])-([0-5][0-9])-([0-5][0-9])$'

    # Suffix "-YY-MM-DD_HH.MM.SS" (this format is hard-coded in Zeek, and is
    # the format used by Zeek when a log is rotated).
    local zeek_re='^(.*)-([0-9][0-9]-[0-1][0-9]-[0-3][0-9]_[0-2][0-9][.][0-5][0-9][.][0-5][0-9])$'

    if [[ $name =~ $date_re ]]; then
        basename=${BASH_REMATCH[1]}
        # Convert time from YYYY-MM-DD-HH-MM-SS to YY-MM-DD_HH.MM.SS
        strt="${BASH_REMATCH[2]}-${BASH_REMATCH[3]}-${BASH_REMATCH[4]}_${BASH_REMATCH[5]}.${BASH_REMATCH[6]}.${BASH_REMATCH[7]}"
    elif [[ $name =~ $zeek_re ]]; then
        basename=${BASH_REMATCH[1]}
        # The suffix is already in the target format.
        strt=${BASH_REMATCH[2]}
    fi
}
# Attempt to archive every *.log file in the current directory by running
# archive-log on each of them.
#
# Although stdout.log/stderr.log are not really Zeek logs, they are archived
# as well, because they might contain useful info, especially if Zeek
# crashes.
#
# Reads:  scriptsdir, postterminatetime, and the files .startup,
#         .log_suffix and .rotated.<basename> in the current directory.
# Writes: failed is set to 1 if any archive-log invocation fails (it is
#         never reset here); basename and strt are used as scratch
#         variables shared with parse_filename.
archivelogs()
{
    local logname end startuptime

    # The last line of .startup is used as the default start time of a log.
    startuptime=$(tail -n 1 .startup)

    # If there's a .log_suffix file, set ZEEK_ARG_LOG_SUFFIX so that
    # archive-log and make-archive-name know about it.
    if [ -f .log_suffix ]; then
        ZEEK_ARG_LOG_SUFFIX=$(cat .log_suffix)
        export ZEEK_ARG_LOG_SUFFIX
    fi

    for logname in *.log; do
        # When nothing matches, the shell leaves the pattern "*.log" in
        # place; skip it rather than report a bogus archive failure.
        [ -e "$logname" ] || [ -L "$logname" ] || continue

        # Get the base name (such as "conn") by removing the file extension.
        basename=$(basename "$logname" .log)

        # Start time of log.  If the file name contains a timestamp (i.e.,
        # a log that was rotated but not archived), parse_filename fills in
        # strt and strips the timestamp from basename.
        strt=
        parse_filename "$basename"

        # Assume the end time of the log is the time this script is run,
        # because Zeek stopped running before this script started.
        end=$postterminatetime

        if [ -z "$strt" ]; then
            # No start time in the file name (or it is in an unrecognized
            # format): fall back to the startup time.
            strt=$startuptime

            if [ -f ".rotated.$basename" ]; then
                # The time obtained here is always >= the startup time of
                # Zeek, so it's usually a more accurate guess of this log's
                # start time.
                strt=$(cat ".rotated.$basename")

                # However, if archive-log archived a log with the same base
                # name as this log, and if it did so after this script
                # started, then the start time is later than the end time.
                # If so, reset the start time to equal the end time.  The
                # timestamps are compared as strings.
                if [[ "$strt" > "$end" ]]; then
                    strt=$end
                fi
            fi
        fi

        # Note: here we assume the log writer type is "ascii".  Arguments
        # are quoted so that each one reaches archive-log as a single word.
        if ! "${scriptsdir}"/archive-log "$logname" "$basename" "$strt" "$end" 1 ascii; then
            failed=1
        fi
    done
}
# Archive whatever logs are left in the post-terminate directory and then
# clean up.
#
# Steps: wait for running archive-log processes, archive all logs, send a
# failure email if anything could not be archived, and finally remove the
# directory, but only when Zeek did not crash and every log was archived.
postterminate()
{
    # Nothing may be archived or deleted while archive-log is still busy.
    wait_for_archivelog

    # archivelogs flips this to a non-zero value on any failure.
    failed=0
    archivelogs

    # Tell the user when at least one log is still sitting in the directory.
    [ $failed -ne 0 ] && sendfailuremail

    # After a crash the directory is always kept, so we are done.
    if [ $crash -eq 1 ]; then
        exit 0
    fi

    # Keep the directory as well when archiving failed; the email sent above
    # points the user at it.
    [ $failed -eq 0 ] || return 0

    # Everything was archived: the directory is no longer needed.
    rm -rf "$postdir"
}
# Run the archiving and cleanup in the background so that zeekctl does not
# have to wait for it.  Its stdout and stderr go to post-terminate.out in
# the current directory so that error messages are not lost.
postterminate >post-terminate.out 2>&1 &

# When stopwait is "1" (useful, for example, in testing), block until the
# background job has finished before this script exits.
case "${stopwait}" in
    1) wait ;;
esac