# Functions to control the nodes' operations. import glob import logging import os import shutil import time from collections import namedtuple from ZeekControl import cmdresult, config, cron, events, execute, install, util from ZeekControl import node as node_mod # Waits for the nodes' Zeek processes to reach the given status. # Build the Zeek parameters for the given node. Include # script for live operation if live is true. def _make_zeek_params(node, live): args = [] if live and node.interface: try: # Interface name needs quotes so that shell doesn't interpret any # potential metacharacters in the name. args += ["-i", f"'{node.interface}'"] except AttributeError: pass if config.Config.savetraces: args += ["-w", "trace.pcap"] args += ["-U", ".status"] args += ["-p", "zeekctl"] if live: args += ["-p", "zeekctl-live"] if node_mod.is_standalone(node): args += ["-p", "standalone"] for prefix in config.Config.prefixes.split(":"): args += ["-p", f"{prefix}"] args += ["-p", f"{node.name}"] # The order of loaded scripts is as follows: # 1) SitePolicyScripts (local.zeek by default) gives a common set of loaded # scripts for all nodes. # 2) The common configuration of zeekctl is loaded via the zeekctl package. # 3) The distribution's default settings for node configuration are loaded # from either the cluster framework or standalone scripts. At this point # anything in the distribution's default per-node is overridable and any # identifiers in local.zeek are able to be used (e.g. in defining # a notice policy). # 4) Autogenerated zeekctl scripts are loaded, which may contain # settings that override the previously loaded scripts. # (e.g. see Log::default_rotation_interval) args += config.Config.sitepolicyscripts.split() args += ["zeekctl"] if node_mod.is_standalone(node): args += ["zeekctl/standalone"] else: args += ["base/frameworks/cluster"] args += ["zeekctl/auto"] if getattr(node, "aux_scripts", None): args += [node.aux_scripts] if config.Config.zeekargs: # Some args in zeekargs might contain spaces, so we cannot split it. args += [config.Config.zeekargs] return args # Build the environment variables for the given node. def _make_env_params(node, returnlist=False): envs = [] if not node_mod.is_standalone(node): envs.append(f"CLUSTER_NODE={node.name}") envs += [f"{key}={val}" for (key, val) in sorted(node.env_vars.items())] if returnlist: envlist = [("-v", i) for i in envs] return [j for i in envlist for j in i] return " ".join(envs) def fmttime(t): return time.strftime(config.Config.timefmt, time.localtime(float(t))) class Controller: def __init__(self, config, ui, executor, pluginregistry): self.config = config self.ui = ui self.executor = executor self.pluginregistry = pluginregistry # Create zeekctl-config.sh file so that shell script helpers have # current config values. install.make_zeekctl_config_sh(ui) def start(self, nodes): results = cmdresult.CmdResult() loggers, manager, proxies, workers = node_mod.separate_types(nodes) for n in nodes: n.setExpectRunning(True) # Start nodes. Do it in the order loggers, manager, proxies, workers. if loggers: self._start_nodes(loggers, results) if not results.ok: for n in manager + proxies + workers: results.set_node_fail(n) return results if manager: self._start_nodes(manager, results) if not results.ok: for n in proxies + workers: results.set_node_fail(n) return results if proxies: self._start_nodes(proxies, results) if not results.ok: for n in workers: results.set_node_fail(n) return results if workers: self._start_nodes(workers, results) return results # Starts the given nodes. def _start_nodes(self, nodes, results): self.ui.info(f"starting {node_mod.nodes_describe(nodes)} ...") filtered = [] # Ignore nodes which are still running. for node, isrunning in self._isrunning(nodes): if not isrunning: filtered += [node] nodes = filtered # Generate crash report for any crashed nodes. crashed = [node for node in nodes if node.hasCrashed()] if crashed: self.ui.info( "creating crash report for previously crashed nodes: {}".format( ", ".join([n.name for n in crashed]) ) ) self._make_crash_reports(crashed) # Make working directories. dirs = [(node, node.cwd()) for node in nodes] nodes = [] for node, success, output in self.executor.mkdirs(dirs): if success: nodes += [node] else: self.ui.error(f"cannot create working directory for {node.name}") results.set_node_fail(node) # Start Zeek process. cmds = [] for node in nodes: envs = [] pin_cpu = node.pin_cpus # If this node isn't using CPU pinning, then use a placeholder value if pin_cpu == "": pin_cpu = -1 envs = _make_env_params(node, True) cmds += [ ( node, "start", envs + [node.cwd(), str(pin_cpu)] + _make_zeek_params(node, True), ) ] nodes = [] # Note: the shell is used to interpret the command because zeekargs # might contain quoted arguments. for node, success, output in self.executor.run_helper(cmds, shell=True): if success: if not output: self.ui.error(f"failed to get PID of {node.name}") results.set_node_fail(node) continue pidstr = output.splitlines()[0] try: pid = int(pidstr) except ValueError: self.ui.error(f"invalid PID for {node.name}: {pidstr}") results.set_node_fail(node) continue nodes += [node] node.setPID(pid) else: self.ui.error(f'cannot start {node.name}; check output of "diag"') results.set_node_fail(node) if output: self.ui.error(output) # Check whether processes did indeed start up. hanging = [] running = [] for node, success in self._waitforzeeks(nodes, "RUNNING", 3, True): if success: running += [node] else: hanging += [node] # It can happen that Zeek hangs in DNS lookups at startup # which can take a while. At this point we already know # that the process has been started (_waitforzeeks ensures that). # If by now there is not a TERMINATED status, we assume that it # is doing fine and will move on to RUNNING once DNS is done. for node, success in self._waitforzeeks(hanging, "TERMINATED", 0, False): if success: self.ui.error( f'{node.name} terminated immediately after starting; check output with "diag"' ) node.clearPID() results.set_node_fail(node) else: self.ui.info(f"({node.name} still initializing)") running += [node] for node in running: self._log_action(node, "started") results.set_node_success(node) return results def _isrunning(self, nodes, setcrashed=True): results = [] cmds = [] for node in nodes: pid = node.getPID() if not pid: results += [(node, False)] continue cmds += [(node, "check-pid", [str(pid)])] for node, success, output in self.executor.run_helper(cmds): # If we cannot run the helper script, then we ignore this node # because the process might actually be running but we can't tell. if not success: self.ui.error(f"failed to run check-pid on node {node.name}") continue running = output.strip() == "running" results += [(node, running)] if not running: if setcrashed: # Grmpf. It crashed. node.clearPID() node.setCrashed() return results def _waitforzeeks(self, nodes, status, timeout, ensurerunning): # If ensurerunning is true, process must still be running. if ensurerunning: running = self._isrunning(nodes) else: running = [(node, True) for node in nodes] results = [] # Determine set of nodes still to check. todo = {} for node, isrunning in running: if isrunning: todo[node.name] = node else: results += [(node, False)] while True: # Determine whether process is still running. We need to do this # before we get the state to avoid a race condition. nodelist = sorted(todo.values(), key=node_mod.sortnode) running = self._isrunning(nodelist, setcrashed=False) # Check nodes' .status file cmds = [] for node in nodelist: cmds += [(node, "first-line", [f"{node.cwd()}/.status"])] for node, success, output in self.executor.run_helper(cmds): if not success or not output: continue fields = output.split() if len(fields) == 2: if status in fields[0]: # Status reached. Cool. del todo[node.name] results += [(node, True)] else: # Something's wrong. We give up on that node. del todo[node.name] results += [(node, False)] for node, isrunning in running: if node.name in todo and not isrunning: # Alright, a dead node's status will not change anymore. del todo[node.name] results += [(node, False)] if not todo: # All done. break # Wait a bit before we start over. time.sleep(1) # Timeout reached? timeout -= 1 if timeout <= 0: break logging.debug("Waiting for %d node(s)...", len(todo)) for node in todo.values(): # These did time-out. results += [(node, False)] if todo: logging.debug("Timeout while waiting for %d node(s)", len(todo)) return results def _log_action(self, node, action): if not self.config.statslogenable: return t = time.time() with open(self.config.statslog, "a") as out: out.write(f"{t} {node} action {action}\n") # Do a "post-terminate crash" for the given nodes. def _make_crash_reports(self, nodes): for n in nodes: self.pluginregistry.zeekProcessDied(n) msg_header_backtrace = "If you want to help us debug this problem, then please forward\nthis mail to reports@zeek.org\n" msg_header_no_backtrace = "This crash report does not include a backtrace. In order for crash reports\nto be useful when Zeek crashes, a backtrace is needed.\n" postterminate = os.path.join(self.config.scriptsdir, "post-terminate") cmds = [ (node, postterminate, [node.type, node.cwd(), "crash"]) for node in nodes ] for node, success, output in self.executor.run_cmds(cmds): if success: crashreport = output # Note: here it is assumed that the crash-diag script outputs # this string only when there's a backtrace. has_backtrace = "Core file: " in crashreport if has_backtrace: msg = msg_header_backtrace + crashreport else: msg = msg_header_no_backtrace + crashreport msuccess, moutput = self._sendmail( f"Crash report from {node.name}", msg ) if not msuccess: self.ui.error( f"error occurred while trying to send mail: {moutput}" ) else: self.ui.error( f"error running post-terminate for {node.name}:\n{output}" ) node.clearCrashed() def _sendmail(self, subject, body): if not self.config.sendmail: return True, "" cmd = "{} '{}'".format( os.path.join(self.config.scriptsdir, "send-mail"), subject ) return execute.run_localcmd(cmd, inputtext=body) # Stop Zeek processes on nodes. def stop(self, nodes): results = cmdresult.CmdResult() loggers, manager, proxies, workers = node_mod.separate_types(nodes) for n in nodes: n.setExpectRunning(False) # Stop nodes. Do it in the order workers, proxies, manager, loggers # (the reverse of "start"). if workers: self._stop_nodes(workers, results) if not results.ok: for n in proxies + manager + loggers: results.set_node_fail(n) return results if proxies: self._stop_nodes(proxies, results) if not results.ok: for n in manager + loggers: results.set_node_fail(n) return results if manager: self._stop_nodes(manager, results) if not results.ok: for n in loggers: results.set_node_fail(n) return results if loggers: self._stop_nodes(loggers, results) return results def _stop_nodes(self, nodes, results): self.ui.info(f"stopping {node_mod.nodes_describe(nodes)} ...") running = [] # Check which nodes are still running. for node, isrunning in self._isrunning(nodes): if isrunning: running += [node] else: results.set_node_success(node) # Generate crash report for any crashed nodes. crashed = [node for node in nodes if node.hasCrashed()] if crashed: self.ui.info( "creating crash report for previously crashed nodes: {}".format( ", ".join([n.name for n in crashed]) ) ) self._make_crash_reports(crashed) # Helper function to stop nodes with given signal. def stop(nodes, signal): cmds = [] for node in nodes: cmds += [(node, "stop", [str(node.getPID()), str(signal)])] return self.executor.run_helper(cmds) # Stop nodes. for node, success, output in stop(running, 15): if not success: # Give up on this node. Most likely either we cannot connect # to the host, or we don't have permission to kill the process. self.ui.error(f"unable to stop {node.name}: {output}") results.set_node_fail(node) running.remove(node) if running: time.sleep(1) # Check whether they terminated. terminated = [] kill = [] for node, success in self._waitforzeeks( running, "TERMINATED", self.config.stoptimeout, False ): if not success: # Check whether it crashed during shutdown ... result = self._isrunning([node]) for node, isrunning in result: if isrunning: self.ui.info(f"{node.name} did not terminate ... killing ...") kill += [node] else: # crashed flag is set by _isrunning(). self.ui.info(f"{node.name} crashed during shutdown") if kill: # Kill those which did not terminate gracefully. stop(kill, 9) # Give them a bit to disappear. time.sleep(5) # Check which are still running. We check all nodes to be on the safe # side and give them a bit more time to finally disappear. timeout = 10 todo = {} for node in running: todo[node.name] = node while True: nodelist = sorted(todo.values(), key=node_mod.sortnode) running = self._isrunning(nodelist, setcrashed=False) for node, isrunning in running: if node.name in todo and not isrunning: # Alright, it's gone. del todo[node.name] terminated += [node] results.set_node_success(node) if not todo: # All done. break # Wait a bit before we start over. if timeout <= 0: break time.sleep(1) timeout -= 1 for node in todo.values(): results.set_node_fail(node) # Do post-terminate cleanup for those which terminated gracefully. cleanup = [node for node in terminated if not node.hasCrashed()] cmds = [] postterminate = os.path.join(self.config.scriptsdir, "post-terminate") for node in cleanup: crashflag = "killed" if node in kill else "" cmds += [(node, postterminate, [node.type, node.cwd(), crashflag])] for node, success, output in self.executor.run_cmds(cmds): if success: self._log_action(node, "stopped") else: self.ui.error( f"error running post-terminate for {node.name}:\n{output}" ) self._log_action(node, "stopped (failed)") node.clearPID() node.clearCrashed() return results # Output status summary for nodes. def status(self, nodes): results = cmdresult.CmdResult() showall = self.config.statuscmdshowall if showall: self.ui.info("Getting process status ...") nodestatus = self._isrunning(nodes) running = [] cmds = [] for node, isrunning in nodestatus: if isrunning: running += [node] cmds += [ ( node, "first-line", [f"{node.cwd()}/.status", f"{node.cwd()}/.startup"], ) ] statuses = {} startups = {} for n, success, output in self.executor.run_helper(cmds): out = output.splitlines() try: val = out[0].split()[0].lower() if (success and out[0]) else "???" except IndexError: val = "???" statuses[n.name] = val try: val = fmttime(out[1]) if (success and out[1]) else "???" except (IndexError, ValueError): val = "???" startups[n.name] = val if showall: self.ui.info("Getting peer status ...") peers = {} nodes = [n for n in running if statuses[n.name] == "running"] for node, success, args in self._query_peerstatus(nodes): if success and args: peers[node.name] = [] for f in args[0].split(): if not f.startswith("peer="): continue # Get everything after the '=' character. val = f[5:] if val: peers[node.name] += [val] for node, isrunning in nodestatus: node_info = { "name": node.name, "type": node.type, "host": node.host, "status": "stopped", "pid": None, "started": None, } if showall: node_info["peers"] = None if isrunning: node_info["status"] = statuses[node.name] elif node.hasCrashed(): node_info["status"] = "crashed" if isrunning: node_info["pid"] = node.getPID() if showall: if node.name in peers: node_info["peers"] = len(peers[node.name]) else: node_info["peers"] = "???" node_info["started"] = startups[node.name] results.set_node_data(node, True, node_info) return results # Check the configuration for nodes without installing first. def check(self, nodes): return self._check_config(nodes, False, False) # Print the loaded_scripts.log for either the installed scripts # (if "check" is false), or the original scripts (if "check" is true). def scripts(self, nodes, check): return self._check_config(nodes, not check, True) def _check_config(self, nodes, installed, list_scripts): results = cmdresult.CmdResult() nodetmpdirs = [ (node, os.path.join(self.config.tmpdir, f"check-config-{node.name}")) for node in nodes ] nodes = [] for node, cwd in nodetmpdirs: if os.path.isdir(cwd): try: shutil.rmtree(cwd) except OSError as err: self.ui.error(f"cannot remove directory {cwd}: {err}") results.ok = False return results try: os.makedirs(cwd) except OSError as err: self.ui.error(f"cannot create temporary directory: {err}") results.ok = False return results nodes += [(node, cwd)] cmds = [] for node, cwd in nodes: env = _make_env_params(node) installed_policies = "1" if installed else "0" print_scripts = "1" if list_scripts else "0" if not install.make_layout(cwd, self.ui, True): results.ok = False return results if not install.make_local_networks(cwd, self.ui): results.ok = False return results if not install.make_zeekctl_config_policy( cwd, self.ui, self.pluginregistry ): results.ok = False return results cmd = os.path.join( self.config.scriptsdir, "check-config" ) + " {} {} {} {}".format( installed_policies, print_scripts, cwd, " ".join(_make_zeek_params(node, False)), ) cmd += " zeekctl/check" cmds += [((node, cwd), cmd, env, None)] for (node, cwd), success, output in execute.run_localcmds(cmds): results.set_node_output(node, success, output) try: shutil.rmtree(cwd) except OSError: # Don't bother reporting an error now. pass return results def _query_peerstatus(self, nodes): running = self._isrunning(nodes) eventlist = [] for node, isrunning in running: if isrunning: eventlist += [ ( node, "Control::peer_status_request", [], "Control::peer_status_response", ) ] return events.send_events_parallel(eventlist, config.Config.controltopic) def execute_cmd(self, nodes, cmd): results = cmdresult.CmdResult() for node, success, out in self.executor.run_shell_cmds( [(n, cmd) for n in nodes] ): results.set_node_output(node, success, out) return results # Clean up the working directory for nodes (flushes state). # If cleantmp is true, also wipes ${tmpdir}; this is done # even when the node is still running. def cleanup(self, nodes, cleantmp=False): # Given a set of node names "orig" and command results "res", add # all node names to "orig" that have a failed result in "res". def addfailed(orig, res): for node, status, output in res: # if status is Fail, then add the node name if not status: orig.add(node.name) return orig results = cmdresult.CmdResult() result = self._isrunning(nodes) running = [node for (node, on) in result if on] notrunning = [node for (node, on) in result if not on] for node in running: self.ui.info(f" {node} is still running, not cleaning work directory") results1 = self.executor.rmdirs([(n, n.cwd()) for n in notrunning]) results2 = self.executor.mkdirs([(n, n.cwd()) for n in notrunning]) failed = set() failed = addfailed(failed, results1) failed = addfailed(failed, results2) for node in notrunning: node.clearCrashed() if cleantmp: self.ui.info(f"cleaning {self.config.tmpdir} ...") results3 = self.executor.rmdirs( [(n, self.config.tmpdir) for n in running + notrunning] ) results4 = self.executor.mkdirs( [(n, self.config.tmpdir) for n in running + notrunning] ) failed = addfailed(failed, results3) failed = addfailed(failed, results4) for node in nodes: if node.name in failed: results.set_node_fail(node) else: results.set_node_success(node) return results # Report diagnostics for nodes (e.g., stderr output). def diag(self, nodes): results = cmdresult.CmdResult() crashdiag = os.path.join(self.config.scriptsdir, "crash-diag") cmds = [(node, crashdiag, [node.cwd()]) for node in nodes] for node, success, output in self.executor.run_cmds(cmds): if not success: errmsgs = f"error running crash-diag for {node.name}\n" errmsgs += output results.set_node_output(node, False, errmsgs) continue results.set_node_output(node, True, output) return results def capstats(self, nodes, interval): results = cmdresult.CmdResult() if not self.config.capstatspath: results.set_node_data( nodes[0], False, { "output": 'Error: cannot run capstats because zeekctl option "capstatspath" is not defined' }, ) return results for node, netif, success, vals in self.get_capstats_output(nodes, interval): if not success: vals = {"output": vals} results.set_node_data(node, success, vals) if not results.nodes: results.ok = False return results # Gather capstats from interfaces. # # Returns a list of tuples of the form (node, netif, success, vals) # where 'netif' is the network interface name used by capstats on # the 'node', and 'success' is a boolean indicating whether or not # we were able to get the data; in case there's no error, 'vals' maps # tags to their values (otherwise, 'vals' is an error message). # # Tags are those as returned by capstats on the command-line. # # If there is more than one node, then the results will also contain # one "pseudo-node" of the name "$total" with the sum of all individual # values. def get_capstats_output(self, nodes, interval): results = [] # Construct a list of (node, interface) tuples, one tuple for each # unique (host, interface) pair. nodenetifs = [] hosts = {} for node in nodes: if not node.interface: continue netif = self._capstats_interface(node) if not netif: continue if hosts.setdefault((node.addr, netif), node) == node: nodenetifs.append((node, netif)) capstats = self.config.capstatspath cmds = [ (node, capstats, ["-I", str(interval), "-n", "1", "-i", interface]) for (node, interface) in nodenetifs ] outputs = self.executor.run_cmds(cmds) totals = {} for node, success, output in outputs: netif = self._capstats_interface(node) if output: # Grab the first output line, because we might log this to # stats.log later. outputline = output.splitlines()[0] if not success: if output: results += [ ( node, netif, False, f"{node.name}: capstats failed ({outputline})", ) ] else: results += [ (node, netif, False, f"{node.name}: cannot execute capstats") ] continue if not output: results += [(node, netif, False, f"{node.name}: no capstats output")] continue fields = outputline.split()[1:] if not fields: results += [ ( node, netif, False, f"{node.name}: unexpected capstats output: {outputline}", ) ] continue vals = {} try: for field in fields: key, val = field.split("=") val = float(val) vals[key] = val if key in totals: totals[key] += val else: totals[key] = val except ValueError: results += [ ( node, netif, False, f"{node.name}: unexpected capstats output: {outputline}", ) ] continue results += [(node, netif, True, vals)] # Add pseudo-node for totals when there is more than one result if len(results) > 1: results += [(node_mod.Node(self.config, "$total"), None, True, totals)] return results # Convert a Zeek network interface name to one that capstats can use. def _capstats_interface(self, node): netif = node.interface if netif.startswith("dnacl") and netif.count("@") == 1: # PF_RING+DNA with pfdnacluster_master is being used # (e.g. interface name "dnacluster:21" gets changed to # "dnacluster:21@1" by the zeekctl pf_ring plugin) netif = netif.split("@", 1)[0] elif "::" in netif: # Interface name has packet source prefix (e.g. "af_packet::eth0"), # so don't try to run capstats on this interface unless it is # af_packet since we know that works. if netif.startswith("af_packet"): netif = netif.split("::")[1] else: netif = None return netif # Gets disk space on all volumes relevant to zeekctl installation. # Returns a list of the form: [ (host, diskinfo), ...] # where diskinfo is a list of the form DiskInfo named tuple objects (fs, # total, used, avail, percent) or ["FAIL", ] if an error # is encountered. def df(self, nodes): results = cmdresult.CmdResult() DiskInfo = namedtuple( "DiskInfo", ("fs", "total", "used", "available", "percent") ) dirs = ( "logdir", "bindir", "helperdir", "cfgdir", "spooldir", "policydir", "libdir", "libdir64", "tmpdir", "staticdir", "scriptsdir", ) df = {} for node in nodes: df[node.name] = {} cmds = [] for node in nodes: for key in dirs: if key == "logdir" and not ( node_mod.is_logger(node) or node_mod.is_manager(node) or node_mod.is_standalone(node) ): # Don't need to check this on nodes that don't write logs. continue path = self.config.config[key] if key == "libdir" or key == "libdir64": if not os.path.exists(path): continue cmds += [(node, "df", [path])] for node, success, output in self.executor.run_helper(cmds): if success: fields = output.split() if len(fields) != 4: df[node.name]["FAIL"] = "wrong number of fields from df helper" continue fs = fields[0] # Ignore NFS mounted volumes. if not fs.startswith("/") and ":" in fs: continue try: total = float(fields[1]) used = float(fields[2]) avail = float(fields[3]) except ValueError as err: df[node.name]["FAIL"] = f"bad output from df helper: {err}" continue perc = used * 100.0 / (used + avail) df[node.name][fs] = DiskInfo(fs, total, used, avail, perc) else: df[node.name]["FAIL"] = output if output else "no output" for node in nodes: success = "FAIL" not in df[node.name] results.set_node_data(node, success, df[node.name]) return results # Returns a list of tuples of the form (node, error, vals) where 'error' is # an error message string, or None if there was no error. 'vals' is a # dict which maps tags to their values. Tags are "pid", "vsize", # "rss", "cpu", and "cmd". def get_top_output(self, nodes): results = [] running = self._isrunning(nodes) # Get all the PIDs first. pids = {} for node, isrunning in running: if isrunning: pids[node.name] = node.getPID() else: results += [(node, "not running", {})] continue if not pids: return results cmds = [] hosts = {} # Now run top once per host. for node in nodes: # Do the loop again to keep the order. if node.name not in pids: continue if node.host in hosts: continue hosts[node.host] = 1 cmds += [(node, "top", [])] if not cmds: return results res = {} for node, success, output in self.executor.run_helper(cmds): res[node.host] = success, output # Gather results for all the nodes that are running for node in nodes: if node.name not in pids: continue success, output = res[node.host] if not success: # The error msg gets written to stats.log, so we only want # the first line. errmsg = output.splitlines()[0] if output else "" results += [(node, f"top failed: {errmsg}", {})] continue if not output: results += [(node, "no output from top", {})] continue # Get the zeek process info, which is a list of fields from # the "top" helper. procinfo = [] try: for line in output.splitlines(): if int(line.split()[0]) == pids[node.name]: procinfo = line.split() break except (IndexError, ValueError) as err: results += [(node, f"bad output from top: {err}", {})] continue if not procinfo: # It's possible that the process is no longer there. results += [(node, "not running", {})] continue vals = {} try: pid = int(procinfo[0]) vals["pid"] = pid vals["vsize"] = int( float(procinfo[1]) ) # May be something like 2.17684e+9 vals["rss"] = int(float(procinfo[2])) vals["cpu"] = procinfo[3] vals["cmd"] = " ".join(procinfo[4:]) except (IndexError, ValueError) as err: results += [(node, f"unexpected top output: {err}", {})] continue results += [(node, None, vals)] return results # Produce a top-like output for node's processes. def top(self, nodes): results = cmdresult.CmdResult() for node, error, vals in self.get_top_output(nodes): top_info = { "name": node.name, "type": node.type, "host": node.host, "pid": None, "vsize": None, "rss": None, "cpu": None, "cmd": None, "error": None, } if error: top_info["error"] = error results.set_node_data(node, False, {"procs": top_info}) continue top_info2 = top_info.copy() top_info2.update(vals) results.set_node_data(node, True, {"procs": top_info2}) return results def print_id(self, nodes, id): results = cmdresult.CmdResult() running = self._isrunning(nodes) eventlist = [] for node, isrunning in running: if isrunning: eventlist += [ ( node, "Control::id_value_request", [id], "Control::id_value_response", ) ] if not eventlist: results.set_node_output(nodes[0], False, "no running instances of Zeek") return results for node, success, args in events.send_events_parallel( eventlist, config.Config.controltopic ): if success: out = "\n".join(args) else: out = args results.set_node_output(node, success, out) return results def _query_netstats(self, nodes): running = self._isrunning(nodes) eventlist = [] for node, isrunning in running: if isrunning: eventlist += [ ( node, "Control::net_stats_request", [], "Control::net_stats_response", ) ] return events.send_events_parallel(eventlist, config.Config.controltopic) def peerstatus(self, nodes): results = cmdresult.CmdResult() for node, success, args in self._query_peerstatus(nodes): if success: if args: out = args[0] else: out = "" else: out = args results.set_node_output(node, success, out) if not results.nodes: results.set_node_output(nodes[0], False, "no running instances of Zeek") return results def netstats(self, nodes): results = cmdresult.CmdResult() for node, success, args in self._query_netstats(nodes): if success: if args: out = args[0].strip() else: out = "" else: out = args results.set_node_output(node, success, out) if not results.nodes: results.set_node_output(nodes[0], False, "no running instances of Zeek") return results def process(self, trace, zeek_options, zeek_scripts): results = cmdresult.CmdResult() if not os.path.isfile(trace): self.ui.error(f"trace file not found: {trace}") results.ok = False return results if self.config.standalone: node = self.config.nodes()[0] else: node = self.config.workers()[0] cwd = os.path.join(self.config.tmpdir, "testing") if os.path.isdir(cwd): try: shutil.rmtree(cwd) except OSError as err: self.ui.error(f"cannot remove directory: {err}") results.ok = False return results try: os.makedirs(cwd) except OSError as err: self.ui.error(f"cannot create directory: {err}") results.ok = False return results env = _make_env_params(node) zeek_args = " ".join(zeek_options + _make_zeek_params(node, False)) zeek_args += " zeekctl/process-trace" if zeek_scripts: zeek_args += " " + " ".join(zeek_scripts) cmd = ( os.path.join(self.config.scriptsdir, "run-zeek-on-trace") + f" {0} {cwd} {trace} {zeek_args}" ) self.ui.info(cmd) success, output = execute.run_localcmd(cmd, env=env) if not success: results.ok = False self.ui.info(output) self.ui.info(f"### Zeek output in {cwd}") return results def install(self, local_only): results = cmdresult.CmdResult() try: self.config.record_zeek_version() except config.ConfigurationError as err: self.ui.error(f"{err}") results.ok = False return results manager = self.config.manager() # Delete previously installed policy files to not mix things up. policies = [ self.config.policydirsiteinstall, self.config.policydirsiteinstallauto, ] for dirpath in policies: if os.path.isdir(dirpath): self.ui.info(f"removing old policies in {dirpath} ...") try: shutil.rmtree(dirpath) except OSError as err: self.ui.error(f"failed to remove directory {dirpath}: {err}") results.ok = False return results self.ui.info("creating policy directories ...") for dirpath in policies: try: os.makedirs(dirpath) except OSError as err: self.ui.error(f"failed to create directory: {err}") results.ok = False return results # Install local site policy. if self.config.sitepolicypath: self.ui.info("installing site policies ...") dst = self.config.policydirsiteinstall for dir in self.config.sitepolicypath.split(":"): dirpath = self.config.subst(dir) for pathname in glob.glob(os.path.join(dirpath, "*")): if not execute.install(pathname, dst, self.ui): results.ok = False return results if not install.make_layout(self.config.policydirsiteinstallauto, self.ui): results.ok = False return results self.ui.info("generating local-networks.zeek ...") if not install.make_local_networks( self.config.policydirsiteinstallauto, self.ui ): results.ok = False return results self.ui.info("generating zeekctl-config.zeek ...") if not install.make_zeekctl_config_policy( self.config.policydirsiteinstallauto, self.ui, self.pluginregistry ): results.ok = False return results loggers = self.config.loggers() if loggers: # Just use the first logger that is defined. node_cwd = loggers[0].cwd() else: node_cwd = manager.cwd() current = self.config.subst(os.path.join(self.config.logdir, "current")) try: util.force_symlink(node_cwd, current) except OSError as err: results.ok = False self.ui.error(f"failed to update symlink '{current}': {err}") return results self.ui.info("generating zeekctl-config.sh ...") if not install.make_zeekctl_config_sh(self.ui): results.ok = False return results if local_only: return results # Make sure we install each remote host only once. nodes = self.config.hosts(exclude_local=True) # If there are no remote hosts, then we're done. if not nodes: # Save current configuration state. self.config.update_cfg_hash() return results # Sync to clients. self.ui.info("updating nodes ...") dirs = [] if not self.config.havenfs: # Non-NFS, need to explicitly synchronize. syncs = install.get_syncs() else: # NFS. We only need to take care of the spool/log directories. # We need this only on the manager. dirs.append((manager, self.config.logdir)) syncs = install.get_nfssyncs() syncs = [ (dir, mirror) for (dir, mirror, optional) in syncs if not optional or os.path.exists(self.config.subst(dir)) ] createdirs = [self.config.subst(dir) for (dir, mirror) in syncs if not mirror] for n in nodes: for dir in createdirs: dirs.append((n, dir)) for node, success, output in self.executor.mkdirs(dirs): if not success: self.ui.error(f"cannot create a directory on node {node.name}") if output: self.ui.error(output) results.ok = False return results paths = [self.config.subst(dir) for (dir, mirror) in syncs if mirror] if not execute.sync(nodes, paths, self.ui): results.ok = False return results # Save current configuration state. self.config.update_cfg_hash() return results # Triggers all activity which is to be done regularly via cron. def cron(self, watch): if not self.config.cronenabled: logging.debug("cron is disabled") return # Check if "zeekctl install" has been run. if not self.config.is_zeekctl_installed(): # Don't output anything here, otherwise the cron job may generate # emails before the user has a chance to do "zeekctl install". return cronui = cron.CronUI() tasks = cron.CronTasks( cronui, self.config, self, self.executor, self.pluginregistry ) cronui.buffer_output() if watch: # Check if node state matches expected state, and start/stop if # necessary. startlist = [] stoplist = [] for node, isrunning in self._isrunning(self.config.nodes()): expectrunning = node.getExpectRunning() if not isrunning and expectrunning: startlist.append(node) elif isrunning and not expectrunning: stoplist.append(node) if startlist: self.start(startlist) if stoplist: self.stop(stoplist) # Check for dead hosts. tasks.check_hosts() # Generate statistics. tasks.log_stats(5) # Check available disk space. tasks.check_disk_space() # Expire old log files. tasks.expire_logs() # Expire old crash directories. tasks.expire_crash() # Update the HTTP stats directory. tasks.update_http_stats() # Run external command if we have one. tasks.run_cron_cmd() # Mail potential output. output = cronui.get_buffered_output() if output: success, out = self._sendmail("cron: " + output.splitlines()[0], output) if not success: self.ui.error(f"zeekctl cron failed to send mail: {out}") self.ui.info(f"Output of zeekctl cron:\n{output}") logging.debug("cron done")