# Functions to control the nodes' operations.
import glob
import logging
import os
import shutil
import time
from collections import namedtuple
from ZeekControl import cmdresult, config, cron, events, execute, install, util
from ZeekControl import node as node_mod
# Build the Zeek parameters for the given node. Include scripts
# for live operation if live is true.
def _make_zeek_params(node, live):
args = []
if live and node.interface:
try:
# Interface name needs quotes so that shell doesn't interpret any
# potential metacharacters in the name.
args += ["-i", f"'{node.interface}'"]
except AttributeError:
pass
if config.Config.savetraces:
args += ["-w", "trace.pcap"]
args += ["-U", ".status"]
args += ["-p", "zeekctl"]
if live:
args += ["-p", "zeekctl-live"]
if node_mod.is_standalone(node):
args += ["-p", "standalone"]
for prefix in config.Config.prefixes.split(":"):
args += ["-p", f"{prefix}"]
args += ["-p", f"{node.name}"]
# The order of loaded scripts is as follows:
# 1) SitePolicyScripts (local.zeek by default) gives a common set of loaded
# scripts for all nodes.
# 2) The common configuration of zeekctl is loaded via the zeekctl package.
# 3) The distribution's default settings for node configuration are loaded
# from either the cluster framework or standalone scripts. At this point,
# anything in the distribution's default per-node configuration can be
# overridden, and any identifiers defined in local.zeek can be used
# (e.g. in defining a notice policy).
# 4) Autogenerated zeekctl scripts are loaded, which may contain
# settings that override the previously loaded scripts.
# (e.g. see Log::default_rotation_interval)
args += config.Config.sitepolicyscripts.split()
args += ["zeekctl"]
if node_mod.is_standalone(node):
args += ["zeekctl/standalone"]
else:
args += ["base/frameworks/cluster"]
args += ["zeekctl/auto"]
if getattr(node, "aux_scripts", None):
args += [node.aux_scripts]
if config.Config.zeekargs:
# Some args in zeekargs might contain spaces, so we cannot split it.
args += [config.Config.zeekargs]
return args
# Build the environment variables for the given node. Returns a list of
# alternating "-v" options and "KEY=VAL" strings if returnlist is true,
# otherwise a single space-separated string of "KEY=VAL" assignments.
def _make_env_params(node, returnlist=False):
envs = []
if not node_mod.is_standalone(node):
envs.append(f"CLUSTER_NODE={node.name}")
envs += [f"{key}={val}" for (key, val) in sorted(node.env_vars.items())]
if returnlist:
envlist = [("-v", i) for i in envs]
return [j for i in envlist for j in i]
return " ".join(envs)
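# Format a Unix timestamp using the configured time format.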
def fmttime(t):
return time.strftime(config.Config.timefmt, time.localtime(float(t)))
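# Implements the operations (start, stop, status, etc.) that zeekctl performs
# on the configured nodes.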
class Controller:
def __init__(self, config, ui, executor, pluginregistry):
self.config = config
self.ui = ui
self.executor = executor
self.pluginregistry = pluginregistry
# Create zeekctl-config.sh file so that shell script helpers have
# current config values.
install.make_zeekctl_config_sh(ui)
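# Start Zeek processes on the given nodes. Returns a CmdResult object.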
def start(self, nodes):
results = cmdresult.CmdResult()
loggers, manager, proxies, workers = node_mod.separate_types(nodes)
for n in nodes:
n.setExpectRunning(True)
# Start nodes. Do it in the order loggers, manager, proxies, workers.
if loggers:
self._start_nodes(loggers, results)
if not results.ok:
for n in manager + proxies + workers:
results.set_node_fail(n)
return results
if manager:
self._start_nodes(manager, results)
if not results.ok:
for n in proxies + workers:
results.set_node_fail(n)
return results
if proxies:
self._start_nodes(proxies, results)
if not results.ok:
for n in workers:
results.set_node_fail(n)
return results
if workers:
self._start_nodes(workers, results)
return results
# Starts the given nodes.
def _start_nodes(self, nodes, results):
self.ui.info(f"starting {node_mod.nodes_describe(nodes)} ...")
filtered = []
# Ignore nodes which are still running.
for node, isrunning in self._isrunning(nodes):
if not isrunning:
filtered += [node]
nodes = filtered
# Generate crash report for any crashed nodes.
crashed = [node for node in nodes if node.hasCrashed()]
if crashed:
self.ui.info(
"creating crash report for previously crashed nodes: {}".format(
", ".join([n.name for n in crashed])
)
)
self._make_crash_reports(crashed)
# Make working directories.
dirs = [(node, node.cwd()) for node in nodes]
nodes = []
for node, success, output in self.executor.mkdirs(dirs):
if success:
nodes += [node]
else:
self.ui.error(f"cannot create working directory for {node.name}")
results.set_node_fail(node)
# Start Zeek process.
cmds = []
for node in nodes:
envs = []
pin_cpu = node.pin_cpus
# If this node isn't using CPU pinning, then use a placeholder value.
if pin_cpu == "":
pin_cpu = -1
envs = _make_env_params(node, True)
cmds += [
(
node,
"start",
envs + [node.cwd(), str(pin_cpu)] + _make_zeek_params(node, True),
)
]
nodes = []
# Note: the shell is used to interpret the command because zeekargs
# might contain quoted arguments.
for node, success, output in self.executor.run_helper(cmds, shell=True):
if success:
if not output:
self.ui.error(f"failed to get PID of {node.name}")
results.set_node_fail(node)
continue
pidstr = output.splitlines()[0]
try:
pid = int(pidstr)
except ValueError:
self.ui.error(f"invalid PID for {node.name}: {pidstr}")
results.set_node_fail(node)
continue
nodes += [node]
node.setPID(pid)
else:
self.ui.error(f'cannot start {node.name}; check output of "diag"')
results.set_node_fail(node)
if output:
self.ui.error(output)
# Check whether processes did indeed start up.
hanging = []
running = []
for node, success in self._waitforzeeks(nodes, "RUNNING", 3, True):
if success:
running += [node]
else:
hanging += [node]
# It can happen that Zeek hangs in DNS lookups at startup
# which can take a while. At this point we already know
# that the process has been started (_waitforzeeks ensures that).
# If by now there is not a TERMINATED status, we assume that it
# is doing fine and will move on to RUNNING once DNS is done.
for node, success in self._waitforzeeks(hanging, "TERMINATED", 0, False):
if success:
self.ui.error(
f'{node.name} terminated immediately after starting; check output with "diag"'
)
node.clearPID()
results.set_node_fail(node)
else:
self.ui.info(f"({node.name} still initializing)")
running += [node]
for node in running:
self._log_action(node, "started")
results.set_node_success(node)
return results
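# Check which of the given nodes' Zeek processes are running. Returns a list
# of (node, isrunning) tuples. If setcrashed is true, a node whose recorded
# PID is no longer running is marked as crashed.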
def _isrunning(self, nodes, setcrashed=True):
results = []
cmds = []
for node in nodes:
pid = node.getPID()
if not pid:
results += [(node, False)]
continue
cmds += [(node, "check-pid", [str(pid)])]
for node, success, output in self.executor.run_helper(cmds):
# If we cannot run the helper script, then we ignore this node
# because the process might actually be running but we can't tell.
if not success:
self.ui.error(f"failed to run check-pid on node {node.name}")
continue
running = output.strip() == "running"
results += [(node, running)]
if not running:
if setcrashed:
# Grmpf. It crashed.
node.clearPID()
node.setCrashed()
return results
# Waits for the nodes' Zeek processes to reach the given status.
def _waitforzeeks(self, nodes, status, timeout, ensurerunning):
# If ensurerunning is true, process must still be running.
if ensurerunning:
running = self._isrunning(nodes)
else:
running = [(node, True) for node in nodes]
results = []
# Determine set of nodes still to check.
todo = {}
for node, isrunning in running:
if isrunning:
todo[node.name] = node
else:
results += [(node, False)]
while True:
# Determine whether process is still running. We need to do this
# before we get the state to avoid a race condition.
nodelist = sorted(todo.values(), key=node_mod.sortnode)
running = self._isrunning(nodelist, setcrashed=False)
# Check nodes' .status file
cmds = []
for node in nodelist:
cmds += [(node, "first-line", [f"{node.cwd()}/.status"])]
for node, success, output in self.executor.run_helper(cmds):
if not success or not output:
continue
fields = output.split()
if len(fields) == 2:
if status in fields[0]:
# Status reached. Cool.
del todo[node.name]
results += [(node, True)]
else:
# Something's wrong. We give up on that node.
del todo[node.name]
results += [(node, False)]
for node, isrunning in running:
if node.name in todo and not isrunning:
# Alright, a dead node's status will not change anymore.
del todo[node.name]
results += [(node, False)]
if not todo:
# All done.
break
# Wait a bit before we start over.
time.sleep(1)
# Timeout reached?
timeout -= 1
if timeout <= 0:
break
logging.debug("Waiting for %d node(s)...", len(todo))
for node in todo.values():
# These did time-out.
results += [(node, False)]
if todo:
logging.debug("Timeout while waiting for %d node(s)", len(todo))
return results
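# Append a node action (e.g. "started", "stopped") to the stats log.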
def _log_action(self, node, action):
if not self.config.statslogenable:
return
t = time.time()
with open(self.config.statslog, "a") as out:
out.write(f"{t} {node} action {action}\n")
# Run the post-terminate script in "crash" mode for the given nodes and
# mail a crash report for each.
def _make_crash_reports(self, nodes):
for n in nodes:
self.pluginregistry.zeekProcessDied(n)
msg_header_backtrace = "If you want to help us debug this problem, then please forward\nthis mail to reports@zeek.org\n"
msg_header_no_backtrace = "This crash report does not include a backtrace. In order for crash reports\nto be useful when Zeek crashes, a backtrace is needed.\n"
postterminate = os.path.join(self.config.scriptsdir, "post-terminate")
cmds = [
(node, postterminate, [node.type, node.cwd(), "crash"]) for node in nodes
]
for node, success, output in self.executor.run_cmds(cmds):
if success:
crashreport = output
# Note: here it is assumed that the crash-diag script outputs
# this string only when there's a backtrace.
has_backtrace = "Core file: " in crashreport
if has_backtrace:
msg = msg_header_backtrace + crashreport
else:
msg = msg_header_no_backtrace + crashreport
msuccess, moutput = self._sendmail(
f"Crash report from {node.name}", msg
)
if not msuccess:
self.ui.error(
f"error occurred while trying to send mail: {moutput}"
)
else:
self.ui.error(
f"error running post-terminate for {node.name}:\n{output}"
)
node.clearCrashed()
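# Send an email with the given subject and body via the send-mail script.
# Returns a (success, output) tuple.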
def _sendmail(self, subject, body):
if not self.config.sendmail:
return True, ""
cmd = "{} '{}'".format(
os.path.join(self.config.scriptsdir, "send-mail"), subject
)
return execute.run_localcmd(cmd, inputtext=body)
# Stop Zeek processes on nodes.
def stop(self, nodes):
results = cmdresult.CmdResult()
loggers, manager, proxies, workers = node_mod.separate_types(nodes)
for n in nodes:
n.setExpectRunning(False)
# Stop nodes. Do it in the order workers, proxies, manager, loggers
# (the reverse of "start").
if workers:
self._stop_nodes(workers, results)
if not results.ok:
for n in proxies + manager + loggers:
results.set_node_fail(n)
return results
if proxies:
self._stop_nodes(proxies, results)
if not results.ok:
for n in manager + loggers:
results.set_node_fail(n)
return results
if manager:
self._stop_nodes(manager, results)
if not results.ok:
for n in loggers:
results.set_node_fail(n)
return results
if loggers:
self._stop_nodes(loggers, results)
return results
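# Stops the given nodes.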
def _stop_nodes(self, nodes, results):
self.ui.info(f"stopping {node_mod.nodes_describe(nodes)} ...")
running = []
# Check which nodes are still running.
for node, isrunning in self._isrunning(nodes):
if isrunning:
running += [node]
else:
results.set_node_success(node)
# Generate crash report for any crashed nodes.
crashed = [node for node in nodes if node.hasCrashed()]
if crashed:
self.ui.info(
"creating crash report for previously crashed nodes: {}".format(
", ".join([n.name for n in crashed])
)
)
self._make_crash_reports(crashed)
# Helper function to stop nodes with given signal.
def stop(nodes, signal):
cmds = []
for node in nodes:
cmds += [(node, "stop", [str(node.getPID()), str(signal)])]
return self.executor.run_helper(cmds)
# Stop nodes.
for node, success, output in stop(running, 15):
if not success:
# Give up on this node. Most likely either we cannot connect
# to the host, or we don't have permission to kill the process.
self.ui.error(f"unable to stop {node.name}: {output}")
results.set_node_fail(node)
running.remove(node)
if running:
time.sleep(1)
# Check whether they terminated.
terminated = []
kill = []
for node, success in self._waitforzeeks(
running, "TERMINATED", self.config.stoptimeout, False
):
if not success:
# Check whether it crashed during shutdown ...
result = self._isrunning([node])
for node, isrunning in result:
if isrunning:
self.ui.info(f"{node.name} did not terminate ... killing ...")
kill += [node]
else:
# crashed flag is set by _isrunning().
self.ui.info(f"{node.name} crashed during shutdown")
if kill:
# Kill those which did not terminate gracefully.
stop(kill, 9)
# Give them a bit to disappear.
time.sleep(5)
# Check which are still running. We check all nodes to be on the safe
# side and give them a bit more time to finally disappear.
timeout = 10
todo = {}
for node in running:
todo[node.name] = node
while True:
nodelist = sorted(todo.values(), key=node_mod.sortnode)
running = self._isrunning(nodelist, setcrashed=False)
for node, isrunning in running:
if node.name in todo and not isrunning:
# Alright, it's gone.
del todo[node.name]
terminated += [node]
results.set_node_success(node)
if not todo:
# All done.
break
# Wait a bit before we start over.
if timeout <= 0:
break
time.sleep(1)
timeout -= 1
for node in todo.values():
results.set_node_fail(node)
# Do post-terminate cleanup for those which terminated gracefully.
cleanup = [node for node in terminated if not node.hasCrashed()]
cmds = []
postterminate = os.path.join(self.config.scriptsdir, "post-terminate")
for node in cleanup:
crashflag = "killed" if node in kill else ""
cmds += [(node, postterminate, [node.type, node.cwd(), crashflag])]
for node, success, output in self.executor.run_cmds(cmds):
if success:
self._log_action(node, "stopped")
else:
self.ui.error(
f"error running post-terminate for {node.name}:\n{output}"
)
self._log_action(node, "stopped (failed)")
node.clearPID()
node.clearCrashed()
return results
# Output status summary for nodes.
def status(self, nodes):
results = cmdresult.CmdResult()
showall = self.config.statuscmdshowall
if showall:
self.ui.info("Getting process status ...")
nodestatus = self._isrunning(nodes)
running = []
cmds = []
for node, isrunning in nodestatus:
if isrunning:
running += [node]
cmds += [
(
node,
"first-line",
[f"{node.cwd()}/.status", f"{node.cwd()}/.startup"],
)
]
statuses = {}
startups = {}
for n, success, output in self.executor.run_helper(cmds):
out = output.splitlines()
try:
val = out[0].split()[0].lower() if (success and out[0]) else "???"
except IndexError:
val = "???"
statuses[n.name] = val
try:
val = fmttime(out[1]) if (success and out[1]) else "???"
except (IndexError, ValueError):
val = "???"
startups[n.name] = val
if showall:
self.ui.info("Getting peer status ...")
peers = {}
nodes = [n for n in running if statuses[n.name] == "running"]
for node, success, args in self._query_peerstatus(nodes):
if success and args:
peers[node.name] = []
for f in args[0].split():
if not f.startswith("peer="):
continue
# Get everything after the '=' character.
val = f[5:]
if val:
peers[node.name] += [val]
for node, isrunning in nodestatus:
node_info = {
"name": node.name,
"type": node.type,
"host": node.host,
"status": "stopped",
"pid": None,
"started": None,
}
if showall:
node_info["peers"] = None
if isrunning:
node_info["status"] = statuses[node.name]
elif node.hasCrashed():
node_info["status"] = "crashed"
if isrunning:
node_info["pid"] = node.getPID()
if showall:
if node.name in peers:
node_info["peers"] = len(peers[node.name])
else:
node_info["peers"] = "???"
node_info["started"] = startups[node.name]
results.set_node_data(node, True, node_info)
return results
# Check the configuration for nodes without installing first.
def check(self, nodes):
return self._check_config(nodes, False, False)
# Print the loaded_scripts.log for either the installed scripts
# (if "check" is false), or the original scripts (if "check" is true).
def scripts(self, nodes, check):
return self._check_config(nodes, not check, True)
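# Check the Zeek configuration for the given nodes in a temporary working
# directory. If installed is true, use the installed policies rather than the
# original site policies; if list_scripts is true, request the list of loaded
# scripts instead of just a config check.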
def _check_config(self, nodes, installed, list_scripts):
results = cmdresult.CmdResult()
nodetmpdirs = [
(node, os.path.join(self.config.tmpdir, f"check-config-{node.name}"))
for node in nodes
]
nodes = []
for node, cwd in nodetmpdirs:
if os.path.isdir(cwd):
try:
shutil.rmtree(cwd)
except OSError as err:
self.ui.error(f"cannot remove directory {cwd}: {err}")
results.ok = False
return results
try:
os.makedirs(cwd)
except OSError as err:
self.ui.error(f"cannot create temporary directory: {err}")
results.ok = False
return results
nodes += [(node, cwd)]
cmds = []
for node, cwd in nodes:
env = _make_env_params(node)
installed_policies = "1" if installed else "0"
print_scripts = "1" if list_scripts else "0"
if not install.make_layout(cwd, self.ui, True):
results.ok = False
return results
if not install.make_local_networks(cwd, self.ui):
results.ok = False
return results
if not install.make_zeekctl_config_policy(
cwd, self.ui, self.pluginregistry
):
results.ok = False
return results
cmd = os.path.join(
self.config.scriptsdir, "check-config"
) + " {} {} {} {}".format(
installed_policies,
print_scripts,
cwd,
" ".join(_make_zeek_params(node, False)),
)
cmd += " zeekctl/check"
cmds += [((node, cwd), cmd, env, None)]
for (node, cwd), success, output in execute.run_localcmds(cmds):
results.set_node_output(node, success, output)
try:
shutil.rmtree(cwd)
except OSError:
# Don't bother reporting an error now.
pass
return results
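# Send a Control::peer_status_request event to each running node and collect
# the responses.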
def _query_peerstatus(self, nodes):
running = self._isrunning(nodes)
eventlist = []
for node, isrunning in running:
if isrunning:
eventlist += [
(
node,
"Control::peer_status_request",
[],
"Control::peer_status_response",
)
]
return events.send_events_parallel(eventlist, config.Config.controltopic)
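# Run the given shell command on each node's host.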
def execute_cmd(self, nodes, cmd):
results = cmdresult.CmdResult()
for node, success, out in self.executor.run_shell_cmds(
[(n, cmd) for n in nodes]
):
results.set_node_output(node, success, out)
return results
# Clean up the working directory for nodes (flushes state).
# If cleantmp is true, also wipes ${tmpdir}; this is done
# even when the node is still running.
def cleanup(self, nodes, cleantmp=False):
# Given a set of node names "orig" and command results "res", add
# all node names to "orig" that have a failed result in "res".
def addfailed(orig, res):
for node, status, output in res:
# If the status indicates failure, then add the node name.
if not status:
orig.add(node.name)
return orig
results = cmdresult.CmdResult()
result = self._isrunning(nodes)
running = [node for (node, on) in result if on]
notrunning = [node for (node, on) in result if not on]
for node in running:
self.ui.info(f" {node} is still running, not cleaning work directory")
results1 = self.executor.rmdirs([(n, n.cwd()) for n in notrunning])
results2 = self.executor.mkdirs([(n, n.cwd()) for n in notrunning])
failed = set()
failed = addfailed(failed, results1)
failed = addfailed(failed, results2)
for node in notrunning:
node.clearCrashed()
if cleantmp:
self.ui.info(f"cleaning {self.config.tmpdir} ...")
results3 = self.executor.rmdirs(
[(n, self.config.tmpdir) for n in running + notrunning]
)
results4 = self.executor.mkdirs(
[(n, self.config.tmpdir) for n in running + notrunning]
)
failed = addfailed(failed, results3)
failed = addfailed(failed, results4)
for node in nodes:
if node.name in failed:
results.set_node_fail(node)
else:
results.set_node_success(node)
return results
# Report diagnostics for nodes (e.g., stderr output).
def diag(self, nodes):
results = cmdresult.CmdResult()
crashdiag = os.path.join(self.config.scriptsdir, "crash-diag")
cmds = [(node, crashdiag, [node.cwd()]) for node in nodes]
for node, success, output in self.executor.run_cmds(cmds):
if not success:
errmsgs = f"error running crash-diag for {node.name}\n"
errmsgs += output
results.set_node_output(node, False, errmsgs)
continue
results.set_node_output(node, True, output)
return results
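# Report capstats output for the given nodes over the given measurement
# interval.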
def capstats(self, nodes, interval):
results = cmdresult.CmdResult()
if not self.config.capstatspath:
results.set_node_data(
nodes[0],
False,
{
"output": 'Error: cannot run capstats because zeekctl option "capstatspath" is not defined'
},
)
return results
for node, netif, success, vals in self.get_capstats_output(nodes, interval):
if not success:
vals = {"output": vals}
results.set_node_data(node, success, vals)
if not results.nodes:
results.ok = False
return results
# Gather capstats from interfaces.
#
# Returns a list of tuples of the form (node, netif, success, vals)
# where 'netif' is the network interface name used by capstats on
# the 'node', and 'success' is a boolean indicating whether or not
# we were able to get the data; in case there's no error, 'vals' maps
# tags to their values (otherwise, 'vals' is an error message).
#
# Tags are those as returned by capstats on the command-line.
#
# If there is more than one node, then the results will also contain
# one "pseudo-node" of the name "$total" with the sum of all individual
# values.
def get_capstats_output(self, nodes, interval):
results = []
# Construct a list of (node, interface) tuples, one tuple for each
# unique (host, interface) pair.
nodenetifs = []
hosts = {}
for node in nodes:
if not node.interface:
continue
netif = self._capstats_interface(node)
if not netif:
continue
if hosts.setdefault((node.addr, netif), node) == node:
nodenetifs.append((node, netif))
capstats = self.config.capstatspath
cmds = [
(node, capstats, ["-I", str(interval), "-n", "1", "-i", interface])
for (node, interface) in nodenetifs
]
outputs = self.executor.run_cmds(cmds)
totals = {}
for node, success, output in outputs:
netif = self._capstats_interface(node)
if output:
# Grab the first output line, because we might log this to
# stats.log later.
outputline = output.splitlines()[0]
if not success:
if output:
results += [
(
node,
netif,
False,
f"{node.name}: capstats failed ({outputline})",
)
]
else:
results += [
(node, netif, False, f"{node.name}: cannot execute capstats")
]
continue
if not output:
results += [(node, netif, False, f"{node.name}: no capstats output")]
continue
fields = outputline.split()[1:]
if not fields:
results += [
(
node,
netif,
False,
f"{node.name}: unexpected capstats output: {outputline}",
)
]
continue
vals = {}
try:
for field in fields:
key, val = field.split("=")
val = float(val)
vals[key] = val
if key in totals:
totals[key] += val
else:
totals[key] = val
except ValueError:
results += [
(
node,
netif,
False,
f"{node.name}: unexpected capstats output: {outputline}",
)
]
continue
results += [(node, netif, True, vals)]
# Add pseudo-node for totals when there is more than one result
if len(results) > 1:
results += [(node_mod.Node(self.config, "$total"), None, True, totals)]
return results
# Convert a Zeek network interface name to one that capstats can use.
def _capstats_interface(self, node):
netif = node.interface
if netif.startswith("dnacl") and netif.count("@") == 1:
# PF_RING+DNA with pfdnacluster_master is being used
# (e.g. interface name "dnacluster:21" gets changed to
# "dnacluster:21@1" by the zeekctl pf_ring plugin)
netif = netif.split("@", 1)[0]
elif "::" in netif:
# Interface name has packet source prefix (e.g. "af_packet::eth0"),
# so don't try to run capstats on this interface unless it is
# af_packet since we know that works.
if netif.startswith("af_packet"):
netif = netif.split("::")[1]
else:
netif = None
return netif
# Gets disk space on all volumes relevant to the zeekctl installation.
# Returns a CmdResult where each node's data is a dict that maps a filesystem
# name to a DiskInfo named tuple (fs, total, used, available, percent), or
# that contains a "FAIL" key with an error message if an error is encountered.
def df(self, nodes):
results = cmdresult.CmdResult()
DiskInfo = namedtuple(
"DiskInfo", ("fs", "total", "used", "available", "percent")
)
dirs = (
"logdir",
"bindir",
"helperdir",
"cfgdir",
"spooldir",
"policydir",
"libdir",
"libdir64",
"tmpdir",
"staticdir",
"scriptsdir",
)
df = {}
for node in nodes:
df[node.name] = {}
cmds = []
for node in nodes:
for key in dirs:
if key == "logdir" and not (
node_mod.is_logger(node)
or node_mod.is_manager(node)
or node_mod.is_standalone(node)
):
# Don't need to check this on nodes that don't write logs.
continue
path = self.config.config[key]
if key == "libdir" or key == "libdir64":
if not os.path.exists(path):
continue
cmds += [(node, "df", [path])]
for node, success, output in self.executor.run_helper(cmds):
if success:
fields = output.split()
if len(fields) != 4:
df[node.name]["FAIL"] = "wrong number of fields from df helper"
continue
fs = fields[0]
# Ignore NFS mounted volumes.
if not fs.startswith("/") and ":" in fs:
continue
try:
total = float(fields[1])
used = float(fields[2])
avail = float(fields[3])
except ValueError as err:
df[node.name]["FAIL"] = f"bad output from df helper: {err}"
continue
perc = used * 100.0 / (used + avail)
df[node.name][fs] = DiskInfo(fs, total, used, avail, perc)
else:
df[node.name]["FAIL"] = output if output else "no output"
for node in nodes:
success = "FAIL" not in df[node.name]
results.set_node_data(node, success, df[node.name])
return results
# Returns a list of tuples of the form (node, error, vals) where 'error' is
# an error message string, or None if there was no error. 'vals' is a
# dict which maps tags to their values. Tags are "pid", "vsize",
# "rss", "cpu", and "cmd".
def get_top_output(self, nodes):
results = []
running = self._isrunning(nodes)
# Get all the PIDs first.
pids = {}
for node, isrunning in running:
if isrunning:
pids[node.name] = node.getPID()
else:
results += [(node, "not running", {})]
continue
if not pids:
return results
cmds = []
hosts = {}
# Now run top once per host.
for node in nodes: # Do the loop again to keep the order.
if node.name not in pids:
continue
if node.host in hosts:
continue
hosts[node.host] = 1
cmds += [(node, "top", [])]
if not cmds:
return results
res = {}
for node, success, output in self.executor.run_helper(cmds):
res[node.host] = success, output
# Gather results for all the nodes that are running
for node in nodes:
if node.name not in pids:
continue
success, output = res[node.host]
if not success:
# The error msg gets written to stats.log, so we only want
# the first line.
errmsg = output.splitlines()[0] if output else ""
results += [(node, f"top failed: {errmsg}", {})]
continue
if not output:
results += [(node, "no output from top", {})]
continue
# Get the zeek process info, which is a list of fields from
# the "top" helper.
procinfo = []
try:
for line in output.splitlines():
if int(line.split()[0]) == pids[node.name]:
procinfo = line.split()
break
except (IndexError, ValueError) as err:
results += [(node, f"bad output from top: {err}", {})]
continue
if not procinfo:
# It's possible that the process is no longer there.
results += [(node, "not running", {})]
continue
vals = {}
try:
pid = int(procinfo[0])
vals["pid"] = pid
vals["vsize"] = int(
float(procinfo[1])
) # May be something like 2.17684e+9
vals["rss"] = int(float(procinfo[2]))
vals["cpu"] = procinfo[3]
vals["cmd"] = " ".join(procinfo[4:])
except (IndexError, ValueError) as err:
results += [(node, f"unexpected top output: {err}", {})]
continue
results += [(node, None, vals)]
return results
# Produce a top-like output for node's processes.
def top(self, nodes):
results = cmdresult.CmdResult()
for node, error, vals in self.get_top_output(nodes):
top_info = {
"name": node.name,
"type": node.type,
"host": node.host,
"pid": None,
"vsize": None,
"rss": None,
"cpu": None,
"cmd": None,
"error": None,
}
if error:
top_info["error"] = error
results.set_node_data(node, False, {"procs": top_info})
continue
top_info2 = top_info.copy()
top_info2.update(vals)
results.set_node_data(node, True, {"procs": top_info2})
return results
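# Query the value of the given Zeek script identifier on each running node.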
def print_id(self, nodes, id):
results = cmdresult.CmdResult()
running = self._isrunning(nodes)
eventlist = []
for node, isrunning in running:
if isrunning:
eventlist += [
(
node,
"Control::id_value_request",
[id],
"Control::id_value_response",
)
]
if not eventlist:
results.set_node_output(nodes[0], False, "no running instances of Zeek")
return results
for node, success, args in events.send_events_parallel(
eventlist, config.Config.controltopic
):
if success:
out = "\n".join(args)
else:
out = args
results.set_node_output(node, success, out)
return results
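# Send a Control::net_stats_request event to each running node and collect
# the responses.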
def _query_netstats(self, nodes):
running = self._isrunning(nodes)
eventlist = []
for node, isrunning in running:
if isrunning:
eventlist += [
(
node,
"Control::net_stats_request",
[],
"Control::net_stats_response",
)
]
return events.send_events_parallel(eventlist, config.Config.controltopic)
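# Output peer status for the given nodes.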
def peerstatus(self, nodes):
results = cmdresult.CmdResult()
for node, success, args in self._query_peerstatus(nodes):
if success:
if args:
out = args[0]
else:
out = ""
else:
out = args
results.set_node_output(node, success, out)
if not results.nodes:
results.set_node_output(nodes[0], False, "no running instances of Zeek")
return results
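# Output network statistics for the given nodes.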
def netstats(self, nodes):
results = cmdresult.CmdResult()
for node, success, args in self._query_netstats(nodes):
if success:
if args:
out = args[0].strip()
else:
out = ""
else:
out = args
results.set_node_output(node, success, out)
if not results.nodes:
results.set_node_output(nodes[0], False, "no running instances of Zeek")
return results
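# Run Zeek on the given trace file using the zeekctl configuration, with any
# additional Zeek options and scripts that were given.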
def process(self, trace, zeek_options, zeek_scripts):
results = cmdresult.CmdResult()
if not os.path.isfile(trace):
self.ui.error(f"trace file not found: {trace}")
results.ok = False
return results
if self.config.standalone:
node = self.config.nodes()[0]
else:
node = self.config.workers()[0]
cwd = os.path.join(self.config.tmpdir, "testing")
if os.path.isdir(cwd):
try:
shutil.rmtree(cwd)
except OSError as err:
self.ui.error(f"cannot remove directory: {err}")
results.ok = False
return results
try:
os.makedirs(cwd)
except OSError as err:
self.ui.error(f"cannot create directory: {err}")
results.ok = False
return results
env = _make_env_params(node)
zeek_args = " ".join(zeek_options + _make_zeek_params(node, False))
zeek_args += " zeekctl/process-trace"
if zeek_scripts:
zeek_args += " " + " ".join(zeek_scripts)
cmd = (
os.path.join(self.config.scriptsdir, "run-zeek-on-trace")
+ f" {0} {cwd} {trace} {zeek_args}"
)
self.ui.info(cmd)
success, output = execute.run_localcmd(cmd, env=env)
if not success:
results.ok = False
self.ui.info(output)
self.ui.info(f"### Zeek output in {cwd}")
return results
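# Install Zeek policies and zeekctl configuration, and sync them to remote
# hosts unless local_only is true.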
def install(self, local_only):
results = cmdresult.CmdResult()
try:
self.config.record_zeek_version()
except config.ConfigurationError as err:
self.ui.error(f"{err}")
results.ok = False
return results
manager = self.config.manager()
# Delete previously installed policy files so that we don't mix things up.
policies = [
self.config.policydirsiteinstall,
self.config.policydirsiteinstallauto,
]
for dirpath in policies:
if os.path.isdir(dirpath):
self.ui.info(f"removing old policies in {dirpath} ...")
try:
shutil.rmtree(dirpath)
except OSError as err:
self.ui.error(f"failed to remove directory {dirpath}: {err}")
results.ok = False
return results
self.ui.info("creating policy directories ...")
for dirpath in policies:
try:
os.makedirs(dirpath)
except OSError as err:
self.ui.error(f"failed to create directory: {err}")
results.ok = False
return results
# Install local site policy.
if self.config.sitepolicypath:
self.ui.info("installing site policies ...")
dst = self.config.policydirsiteinstall
for dir in self.config.sitepolicypath.split(":"):
dirpath = self.config.subst(dir)
for pathname in glob.glob(os.path.join(dirpath, "*")):
if not execute.install(pathname, dst, self.ui):
results.ok = False
return results
if not install.make_layout(self.config.policydirsiteinstallauto, self.ui):
results.ok = False
return results
self.ui.info("generating local-networks.zeek ...")
if not install.make_local_networks(
self.config.policydirsiteinstallauto, self.ui
):
results.ok = False
return results
self.ui.info("generating zeekctl-config.zeek ...")
if not install.make_zeekctl_config_policy(
self.config.policydirsiteinstallauto, self.ui, self.pluginregistry
):
results.ok = False
return results
loggers = self.config.loggers()
if loggers:
# Just use the first logger that is defined.
node_cwd = loggers[0].cwd()
else:
node_cwd = manager.cwd()
current = self.config.subst(os.path.join(self.config.logdir, "current"))
try:
util.force_symlink(node_cwd, current)
except OSError as err:
results.ok = False
self.ui.error(f"failed to update symlink '{current}': {err}")
return results
self.ui.info("generating zeekctl-config.sh ...")
if not install.make_zeekctl_config_sh(self.ui):
results.ok = False
return results
if local_only:
return results
# Make sure we install each remote host only once.
nodes = self.config.hosts(exclude_local=True)
# If there are no remote hosts, then we're done.
if not nodes:
# Save current configuration state.
self.config.update_cfg_hash()
return results
# Sync to clients.
self.ui.info("updating nodes ...")
dirs = []
if not self.config.havenfs:
# Non-NFS, need to explicitly synchronize.
syncs = install.get_syncs()
else:
# NFS. We only need to take care of the spool/log directories.
# We need this only on the manager.
dirs.append((manager, self.config.logdir))
syncs = install.get_nfssyncs()
syncs = [
(dir, mirror)
for (dir, mirror, optional) in syncs
if not optional or os.path.exists(self.config.subst(dir))
]
createdirs = [self.config.subst(dir) for (dir, mirror) in syncs if not mirror]
for n in nodes:
for dir in createdirs:
dirs.append((n, dir))
for node, success, output in self.executor.mkdirs(dirs):
if not success:
self.ui.error(f"cannot create a directory on node {node.name}")
if output:
self.ui.error(output)
results.ok = False
return results
paths = [self.config.subst(dir) for (dir, mirror) in syncs if mirror]
if not execute.sync(nodes, paths, self.ui):
results.ok = False
return results
# Save current configuration state.
self.config.update_cfg_hash()
return results
# Triggers all activity which is to be done regularly via cron.
def cron(self, watch):
if not self.config.cronenabled:
logging.debug("cron is disabled")
return
# Check if "zeekctl install" has been run.
if not self.config.is_zeekctl_installed():
# Don't output anything here, otherwise the cron job may generate
# emails before the user has a chance to do "zeekctl install".
return
cronui = cron.CronUI()
tasks = cron.CronTasks(
cronui, self.config, self, self.executor, self.pluginregistry
)
cronui.buffer_output()
if watch:
# Check if node state matches expected state, and start/stop if
# necessary.
startlist = []
stoplist = []
for node, isrunning in self._isrunning(self.config.nodes()):
expectrunning = node.getExpectRunning()
if not isrunning and expectrunning:
startlist.append(node)
elif isrunning and not expectrunning:
stoplist.append(node)
if startlist:
self.start(startlist)
if stoplist:
self.stop(stoplist)
# Check for dead hosts.
tasks.check_hosts()
# Generate statistics.
tasks.log_stats(5)
# Check available disk space.
tasks.check_disk_space()
# Expire old log files.
tasks.expire_logs()
# Expire old crash directories.
tasks.expire_crash()
# Update the HTTP stats directory.
tasks.update_http_stats()
# Run external command if we have one.
tasks.run_cron_cmd()
# Mail potential output.
output = cronui.get_buffered_output()
if output:
success, out = self._sendmail("cron: " + output.splitlines()[0], output)
if not success:
self.ui.error(f"zeekctl cron failed to send mail: {out}")
self.ui.info(f"Output of zeekctl cron:\n{output}")
logging.debug("cron done")