#! /usr/bin/env bash
#
# Cleanup tasks after Zeek termination:  move the node's working directory
# to a tmp dir and create a new working directory, create a crash report if
# the node crashed, wait for this node's archive-log processes to finish,
# try to archive any remaining logs (and send an email if this fails), and
# finally (if the node didn't crash) remove the tmp dir if all logs were
# successfully archived.
#
# post-terminate <type> <dir> [<crashflag>]
#
# <type> is the node's type ("manager", "worker", etc.).
# <dir> is the node's working directory.
#
# If <crashflag> is not set, then ZeekControl has stopped Zeek normally.
# If <crashflag> is "crash", then ZeekControl has determined that Zeek crashed
# and this script will return information about the crash on stdout which is
# suitable for mailing to the user.  If <crashflag> is "killed", then
# ZeekControl terminated Zeek forcefully (but intentionally) by SIGKILL while
# trying to stop Zeek.
#
# The variables tmpdir, scriptsdir, mailarchivelogfail and stopwait are not
# assigned in this file; they are expected to come from the sourced
# zeekctl-config.sh (NOTE(review): not visible here -- confirm).

# Send an email reporting that one or more logs could not be archived.
# Does nothing when mailarchivelogfail is "0".
# Globals (read): mailarchivelogfail, scriptdir, nodename, postdir
sendfailuremail()
{
    if [ "${mailarchivelogfail}" = "0" ]; then
        return
    fi

    "$scriptdir"/send-mail "archive log failure on node $nodename" <<_EOF_
Unable to archive one or more logs in directory:
${postdir}
Check the post-terminate.out file in that directory for any error messages.
_EOF_
}

if [ $# -lt 2 ] || [ $# -gt 3 ]; then
    echo "post-terminate: wrong usage: $*"
    exit 1
fi

nodetype=$1
dir=$2
nodename=$(basename "$dir")

if [ ! -d "$dir" ]; then
    echo "post-terminate: directory not found: $dir"
    exit 1
fi

crash=0
killed=0
if [ "$3" = "crash" ]; then
    crash=1
elif [ "$3" = "killed" ]; then
    killed=1
fi

scriptdir=$(dirname "$0")
. "$scriptdir"/zeekctl-config.sh

if [ -z "${tmpdir}" ]; then
    echo "post-terminate: zeekctl option tmpdir not set"
    exit 1
fi

if [ ! -d "${tmpdir}" ]; then
    # -p: do not fail if another post-terminate instance creates the
    # directory between the test above and this mkdir.
    mkdir -p "${tmpdir}"
fi

tmpdirtimestamp=$(date +%Y-%m-%d-%H-%M-%S)
postterminatetime=$(date +%y-%m-%d_%H.%M.%S)
postdir=${tmpdir}/post-terminate-$nodetype-$tmpdirtimestamp-$$
if [ $crash -eq 1 ]; then
    postdir=$postdir-crash
fi

if ! mv "$dir" "$postdir"; then
    exit 1
fi

mkdir "$dir"

# Everything below uses paths relative to $postdir, so do not continue from
# some other directory if the cd fails.
if ! cd "$postdir"; then
    echo "post-terminate: cannot change to directory: $postdir"
    exit 1
fi

if [ -d .state ]; then
    mv .state "$dir"
fi

if [ $crash -eq 1 ]; then
    # Output the crash report and save it to disk in case the user doesn't
    # receive the email.
    "${scriptsdir}"/crash-diag -c "$postdir" > .crash-diag.out
    cat .crash-diag.out
fi

if [ ! -f .startup ]; then
    echo "post-terminate: file not found: .startup"
    exit 1
fi

# Block until no archive-log PID file (.archive-log.*.tmp) remains in the
# current directory.  A PID file is removed here once it is non-empty and
# "ps -p" reports no process for the PID it contains; the directory is
# re-scanned once per second.
wait_for_archivelog()
{
    local pidfiles pfile pid

    # Gather list of all archive-log PID files.
    pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')

    # Wait for any archive-log processes to finish, so that we can either
    # launch new ones (below) or remove this directory.
    while [ -n "$pidfiles" ]; do
        # Intentionally unquoted: split the find output into one word per
        # PID file.
        for pfile in $pidfiles ; do
            # If PID file is empty, then check it again later.
            if [ -s "$pfile" ]; then
                pid=
                read -r pid < "$pfile"
                # Check if a process with given PID exists
                if ! ps -p "$pid" > /dev/null 2>&1; then
                    # No such process exists, so remove PID file
                    rm -f "$pfile"
                fi
            fi
        done

        sleep 1
        pidfiles=$(find . -maxdepth 1 -type f -name '.archive-log.*.tmp')
    done
}

# Split a log's base name into name and start timestamp.
# Arguments: $1 - log file name without the ".log" extension
# Outputs (variables of the caller, left untouched when no timestamp suffix
# is recognized):
#   basename - $1 with the timestamp suffix removed
#   strt     - the timestamp in YY-MM-DD_HH.MM.SS format
parse_filename()
{
    local filename=$1
    local tmp ts

    # Try to extract a timestamp from the filename, and adjust the base name
    # accordingly.  If the filename doesn't contain any recognized timestamp
    # format, then just assume the whole thing is the base name.

    # Try to remove suffix ".YYYY-MM-DD-HH-MM-SS" (this format is specified in
    # Log::default_rotation_date_format and is used by the ascii writer script
    # to rename a log immediately after Zeek rotates it).
    tmp=$(printf '%s\n' "$filename" | sed 's/[.][1-2][0-9][0-9][0-9]-[0-1][0-9]-[0-3][0-9]-[0-2][0-9]-[0-5][0-9]-[0-5][0-9]$//')

    if [ "$filename" != "$tmp" ]; then
        basename=$tmp
        # Remove the base name and '.' to get the timestamp.  The base name
        # is quoted so that it is matched literally, not as a glob pattern.
        ts=${filename#"$basename".}
        # Convert time from YYYY-MM-DD-HH-MM-SS to YY-MM-DD_HH.MM.SS
        strt=$(printf '%s\n' "$ts" | awk -F '-' '{ printf("%s-%s-%s_%s.%s.%s",substr($1,3,2),$2,$3,$4,$5,$6) }')
    else
        # Try to remove suffix "-YY-MM-DD_HH.MM.SS" (this format is hard-coded
        # in Zeek, and is the format used by Zeek when a log is rotated).
        tmp=$(printf '%s\n' "$filename" | sed 's/-[0-9][0-9]-[0-1][0-9]-[0-3][0-9]_[0-2][0-9][.][0-5][0-9][.][0-5][0-9]$//')

        if [ "$filename" != "$tmp" ]; then
            basename=$tmp
            # Remove the base name and '-' to get the timestamp.
            strt=${filename#"$basename"-}
        fi
    fi
}

# Run archive-log on every *.log file in the current directory.
# Globals (read):    postterminatetime, scriptsdir
# Globals (written): failed (set to 1 when any archive-log call fails),
#                    ZEEK_ARG_LOG_SUFFIX (exported when .log_suffix exists)
archivelogs()
{
    # basename and strt are filled in by parse_filename, which sees these
    # locals through bash's dynamic scoping.
    local startuptime logname basename strt end

    startuptime=$(tail -n 1 .startup)

    # Attempt to archive all log files.  Although stdout.log/stderr.log are
    # not really Zeek logs, we try to archive them anyway, because they might
    # contain useful info, especially if Zeek crashes.

    # If there's a .log_suffix file, set ZEEK_ARG_LOG_SUFFIX so that
    # archive-log and make-archive-name know about it.
    if [ -f .log_suffix ]; then
        export ZEEK_ARG_LOG_SUFFIX=$(cat .log_suffix)
    fi

    for logname in *.log; do
        # When nothing matches, the shell leaves the literal pattern "*.log"
        # in place; skip it instead of reporting a bogus archive failure.
        [ -e "$logname" ] || [ -L "$logname" ] || continue

        # Get the base name (such as "conn") by removing the file extension.
        basename=${logname%.log}

        # Start time of log.
        strt=

        # If the filename contains a timestamp (i.e., a log that was rotated
        # but not archived), then try to get the start time from the log
        # filename.  If a timestamp is found, then the base name is also
        # updated to not include the timestamp.
        parse_filename "$basename"

        # Assume the end time of the log is the time this script is run,
        # because Zeek stopped running before this script started.
        end=$postterminatetime

        if [ -z "$strt" ]; then
            # We couldn't extract the start time from the log filename, likely
            # because it isn't there (or possibly it's in an unrecognized
            # format).
            strt=$startuptime

            if [ -f ".rotated.$basename" ]; then
                # The time obtained here is always >= the startup time of Zeek,
                # so it's usually a more accurate guess of this log's start
                # time.
                strt=$(cat ".rotated.$basename")

                # However, if archive-log archived a log with the same base
                # name as this log, and if it did so after this script started,
                # then the start time that we computed will be later than the
                # end time.  If so, then reset the start time to equal the end
                # time.
                if expr "$strt" ">" "$end" >/dev/null; then
                    strt=$end
                fi
            fi
        fi

        # Note: here we assume the log writer type is "ascii"
        if ! "${scriptsdir}"/archive-log "$logname" "$basename" "$strt" "$end" 1 ascii; then
            failed=1
        fi
    done
}

# Wait for running archive-log processes, archive the remaining logs, send a
# failure email if needed, and remove $postdir when Zeek did not crash and
# every log was archived.
# Globals (read): crash, postdir;  Globals (written): failed
postterminate()
{
    # Wait until all running archive-log processes have terminated.
    wait_for_archivelog

    failed=0

    # Archive all logs.
    archivelogs

    # If one or more logs failed to be archived, then try to send an email.
    if [ $failed -ne 0 ]; then
        sendfailuremail
    fi

    # If Zeek crashed, then we don't need to do anything else, because we don't
    # want to remove the directory.
    if [ $crash -eq 1 ]; then
        exit 0
    fi

    # If no archive-log processes started from this script failed, then remove
    # the directory.  If the directory is not removed, then an email was sent
    # to notify the user to look in this directory for logs.
    if [ $failed -eq 0 ]; then
        rm -rf "$postdir"
    fi
}

# Execute the remaining part of this script in the background so that zeekctl
# doesn't need to wait for it to finish.  Stdout/stderr is redirected to a
# file to capture error messages.
postterminate >post-terminate.out 2>&1 &

# In some situations (such as testing), we may want the zeekctl stop command to
# wait for the post-terminate script to finish.
if [ "${stopwait}" = "1" ]; then
    wait
fi