zeek/auxil/zeekctl/bin/zeekctl.in
Patrick Kelley 8fd444092b initial
2025-05-07 15:35:15 -04:00

862 lines
29 KiB
Python
Executable File

#! /usr/bin/env python3
#
# The ZeekControl interactive shell.
import logging
import os.path
import sys
import time
# Make the ZeekControl Python modules importable before the package imports
# that follow. The "@...@" tokens are template placeholders (this file is a
# ".in" template) -- presumably substituted at build/install time; confirm.
for path in (
    "@PREFIX@/lib/zeekctl",
    "@PY_MOD_INSTALL_DIR@",
    "@PY_MOD_INSTALL_DIR@/zeekctl",
):
    if not os.path.isdir(path):
        # Skip locations that do not exist on this system.
        continue
    # Each hit goes to the front, so later entries end up ahead of earlier ones.
    sys.path.insert(0, path)
from ZeekControl import util, utilcurses, version, zeekcmd
from ZeekControl.exceptions import CommandSyntaxError, ZeekControlError
from ZeekControl.zeekctl import ZeekCtl
# Main command loop.
class ZeekCtlCmdLoop(zeekcmd.ExitValueCmd):
prompt = "[ZeekControl] > "
def __init__(self, zeekctl_class=ZeekCtl, interactive=False, cmd=""):
    """Create the command loop and the ZeekCtl object it drives.

    zeekctl_class -- callable invoked as ``zeekctl_class(ui=self)`` to build
                     the controller; this shell acts as its user interface.
    interactive   -- stored on the instance; other methods read it to decide
                     between interactive and one-shot behaviour.
    cmd           -- name of the command about to be run (may be empty).
    """
    zeekcmd.ExitValueCmd.__init__(self)
    self.zeekctl = zeekctl_class(ui=self)
    self.interactive = interactive

    # Warn user to do zeekctl install, if needed. The check is skipped for
    # cron (to avoid annoying emails) and while install or deploy is running.
    skip_install_warning = cmd in ("cron", "install", "deploy")
    if not skip_install_warning:
        self.zeekctl.warn_zeekctl_install()
def finish(self):
    """Forward the shutdown request to the wrapped ZeekCtl object."""
    controller = self.zeekctl
    controller.finish()
def info(self, text):
    """Print *text* on standard output and log it at INFO level."""
    print(text, file=sys.stdout)
    logging.info(text)
def warn(self, text):
    """Report *text* through info(), prefixed with "Warning: "."""
    message = f"Warning: {text}"
    self.info(message)
def error(self, text):
    """Print *text* on standard error with an "Error: " prefix and log it.

    The log entry is written at INFO level and without the prefix.
    """
    shown = f"Error: {text}"
    print(shown, file=sys.stderr)
    logging.info(text)
def err(self, text):
    """Print *text* unmodified on standard error and log it at INFO level."""
    stream = sys.stderr
    print(text, file=stream)
    logging.info(text)
def default(self, line):
    """Offer a command that has no do_* method to the plugins.

    The first word of *line* is the command name; the remaining words are
    re-joined with single spaces and passed along as its arguments. If no
    plugin knows the command, an error is reported and, when the shell is
    not interactive, the help summary is printed as well.

    Returns the ``ok`` attribute of the plugin command result.
    """
    words = line.split()
    cmd = words[0]
    cmdargs = " ".join(words[1:])

    results = self.zeekctl.plugincmd(cmd, cmdargs)
    if results.unknowncmd:
        self.error(f"unknown command '{cmd}'")
        if not self.interactive:
            self.do_help(None)

    return results.ok
def emptyline(self):
    """Do nothing when an empty line is entered."""
    # NOTE(review): the stock cmd.Cmd behaviour for an empty line is to repeat
    # the previous command; presumably this override exists to prevent that --
    # confirm that ExitValueCmd derives from cmd.Cmd.
    return None
def precmd(self, line):
    """Log the command line at DEBUG level and hand it back unchanged."""
    logging.debug(line)
    return line
def postcmd(self, stop, line):
    """Log "done" at DEBUG level and pass the *stop* flag through unchanged."""
    logging.debug("done")
    return stop
def do_EOF(self, args):
    # End of input: flag the shell as stopping and return True.
    # NOTE(review): _stopping is presumably read by ExitValueCmd -- confirm.
    # (Deliberately no docstring, so no documentation entry is added for EOF.)
    stopping = True
    self._stopping = stopping
    return stopping
def do_exit(self, args):
    """Terminates the shell."""
    # Flag the shell as stopping and return True; *args* is ignored.
    stopping = True
    self._stopping = stopping
    return stopping
def do_quit(self, args):
    """Terminates the shell."""
    # Same effect as "exit": flag the shell as stopping and return True.
    stopping = True
    self._stopping = stopping
    return stopping
def do_nodes(self, args):
    """Prints a list of all configured nodes.
    Note that the env_vars attribute includes the set of environment
    variables from the 'env_vars' option in both 'node.cfg' and
    'zeekctl.cfg' and also those set by any plugins."""
    # Any argument at all is a syntax error for this command.
    if args:
        raise CommandSyntaxError("the nodes command does not take any arguments")

    results = self.zeekctl.nodes()
    # Only the data part of each (node, success, data) entry is printed.
    for _node, _success, details in results.get_node_data():
        self.info(details["description"])

    return results.ok
def do_config(self, args):
    """Prints all configuration options with their current values."""
    # Any argument at all is a syntax error for this command.
    if args:
        raise CommandSyntaxError("the config command does not take any arguments")

    results = self.zeekctl.get_config()
    # One "key = value" line per pair, in the order the result provides them.
    for option, value in results.keyval:
        self.info(f"{option} = {value}")

    return results.ok
def do_install(self, args):
    """- [--local]
    Reinstalls on all nodes, including all configuration files and
    local policy scripts.
    The ``--local`` option is intended for testing or debugging. It
    causes only the local host to be installed (i.e., no changes pushed
    out to any other hosts in the Zeek cluster). Normally all nodes
    should be reinstalled at the same time, as any inconsistencies between
    them will lead to strange effects.
    This command must be executed after *all* changes to any part of
    the ZeekControl configuration or after upgrading to a new version
    of Zeek or ZeekControl, otherwise the modifications will not take effect.
    Before executing ``install``, it is recommended to verify the
    configuration with check_. Note that when using the deploy command
    there is no need to first use the install command, because deploy
    automatically runs install before restarting the nodes."""
    tokens = args.split()

    # "--local" is the only accepted word; the first other word is rejected.
    for token in tokens:
        if token != "--local":
            raise CommandSyntaxError(
                f"invalid argument for the install command: {token}"
            )

    # Every remaining token is "--local", so any token at all enables it.
    local = bool(tokens)
    return self.zeekctl.install(local).ok
def do_start(self, args):
    """- [<nodes>]
    Starts the given nodes, or all nodes if none are specified. Nodes
    already running are left untouched.
    """
    # The raw argument string is handed over as the node list.
    return self.zeekctl.start(node_list=args).ok
def do_stop(self, args):
    """- [<nodes>]
    Stops the given nodes, or all nodes if none are specified. Nodes that
    are in the "crashed" state are reset to the "stopped" state, and
    nodes that are "stopped" are left untouched.
    """
    # The raw argument string is handed over as the node list.
    return self.zeekctl.stop(node_list=args).ok
def do_restart(self, args):
    """- [--clean] [<nodes>]
    Restarts the given nodes, or all nodes if none are specified. The
    effect is the same as first executing stop_ followed
    by a start_, giving the same nodes in both cases.
    If ``--clean`` is given, the installation is reset into a clean state
    before restarting. More precisely, a ``restart --clean`` turns into
    the command sequence stop_, cleanup_, check_, install_, and
    start_.
    """
    option = "--clean"
    clean = False

    # Accept the option only as a complete leading word. A bare prefix test
    # would also match e.g. "--cleanup" and silently turn it into a clean
    # restart of a node called "up".
    remainder = args[len(option):]
    if args.startswith(option) and (not remainder or remainder[0].isspace()):
        clean = True
        # Whatever follows the option is the node list.
        args = remainder

    results = self.zeekctl.restart(clean=clean, node_list=args)
    return results.ok
def do_deploy(self, args):
    """
    Checks for errors in Zeek policy scripts, then does an install followed
    by a restart on all nodes. This command should be run after any
    changes to Zeek policy scripts or the zeekctl configuration, and after
    Zeek is upgraded or even just recompiled.
    This command is equivalent to running the check_, install_, and
    restart_ commands, in that order.
    """
    # Any argument at all is a syntax error for this command.
    if args:
        raise CommandSyntaxError("the deploy command does not take any arguments")

    return self.zeekctl.deploy().ok
def do_status(self, args):
    """- [<nodes>]
    Prints the current status of the given nodes.
    For each node, the information shown includes the node's name and type,
    the host where the node will run, the status, the PID, and the
    date/time when the node was started. The status column will usually
    show a status of either "stopped" or "running". A status of
    "crashed" means that ZeekControl verified that Zeek is no longer
    running, but was expected to be running."""
    results = self.zeekctl.status(node_list=args)

    # Column widths and the optional "peers" column are decided from the
    # first node's data only.
    typewidth, hostwidth = 7, 16
    showall = False
    first_pass = results.get_node_data()
    if first_pass:
        first_info = first_pass[0][2]
        if first_info["type"] == "standalone":
            # In standalone mode, we need a wider "type" column.
            typewidth, hostwidth = 10, 13
        showall = "peers" in first_info

    if showall:
        colfmt = "{name:<12} {type:<{0}} {host:<{1}} {status:<9} {pid:<6} {peers:<6} {started}"
    else:
        colfmt = "{name:<12} {type:<{0}} {host:<{1}} {status:<9} {pid:<6} {started}"
    # Nodes without a PID get a shortened row that ends at the status column.
    colfmtstopped = "{name:<12} {type:<{0}} {host:<{1}} {status}"

    # The header row reuses the row format with title-cased column names.
    titles = {}
    for column in ("name", "type", "host", "status", "pid", "peers", "started"):
        titles[column] = column.title()
    self.info(colfmt.format(typewidth, hostwidth, **titles))

    # Return status code of True only if all nodes are running.
    all_running = True
    for entry in results.get_node_data():
        node_info = entry[2]
        rowfmt = colfmt if node_info["pid"] else colfmtstopped
        self.info(rowfmt.format(typewidth, hostwidth, **node_info))
        if node_info["status"] != "running":
            all_running = False

    return all_running
def _do_top_once(self, args):
    """Collect one snapshot of per-node process information.

    Returns a tuple ``(ok, lines)``: the ``ok`` attribute of the top result
    and the display lines (a header followed by one line per node entry).
    """
    results = self.zeekctl.top(args)

    # Column widths are chosen from the first entry's process info.
    typewidth, hostwidth = 7, 16
    first_pass = results.get_node_data()
    if first_pass:
        first_proc = first_pass[0][2]["procs"]
        if first_proc["type"] == "standalone":
            # In standalone mode, we need a wider "type" column.
            typewidth, hostwidth = 10, 13

    header = "{:<12s} {:<{}s} {:<{}s} {:<7s} {:<6s} {:<4s} {:<5s} {:s}".format(
        "Name",
        "Type",
        typewidth,
        "Host",
        hostwidth,
        "Pid",
        "VSize",
        "Rss",
        "Cpu",
        "Cmd",
    )
    lines = [header]

    for entry in results.get_node_data():
        procinfo = entry[2]["procs"]
        fields = [
            "{:<12s}".format(procinfo["name"]),
            "{:<{}s}".format(procinfo["type"], typewidth),
            "{:<{}s}".format(procinfo["host"], hostwidth),
        ]
        if procinfo["error"]:
            # On error the message takes the place of all process columns.
            fields.append("<{:s}>".format(procinfo["error"]))
        else:
            fields.extend(
                [
                    "{:<7s}".format(str(procinfo["pid"])),
                    "{:<6s}".format(util.number_unit_str(procinfo["vsize"])),
                    "{:<4s}".format(util.number_unit_str(procinfo["rss"])),
                    "{:>3s}% ".format(procinfo["cpu"]),
                    "{:s}".format(procinfo["cmd"]),
                ]
            )
        lines.append(" ".join(fields))

    return (results.ok, lines)
def do_top(self, args):
    """- [<nodes>]
    For each of the nodes, prints the status of the Zeek process in
    a *top*-like format, including CPU usage and memory consumption. If
    executed interactively, the display is updated frequently
    until key ``q`` is pressed. If invoked non-interactively, the
    status is printed only once."""
    # Non-interactive: print a single snapshot and return its status.
    if not self.interactive:
        success, lines = self._do_top_once(args)
        for line in lines:
            self.info(line)
        return success

    # Defined up front so that quitting before the first refresh (when "q"
    # is the very first key read) still returns a value instead of failing
    # with an unbound local.
    success = True

    utilcurses.enterCurses()
    try:
        utilcurses.clearScreen()
        count = 0
        while utilcurses.getCh() != "q":
            # Keys are polled every 0.1s; the display is refreshed on every
            # tenth pass.
            if count % 10 == 0:
                success, lines = self._do_top_once(args)
                utilcurses.clearScreen()
                utilcurses.printLines(lines)
            time.sleep(0.1)
            count += 1
    finally:
        # Always leave curses mode, including when a refresh raises or the
        # user interrupts, so the terminal is not left in curses state.
        utilcurses.leaveCurses()

    return success
def do_diag(self, args):
    """- [<nodes>]
    If a node has terminated unexpectedly, this command prints a (somewhat
    cryptic) summary of its final state including excerpts of any
    stdout/stderr output, resource usage, and also a stack backtrace if a
    core dump is found. The same information is sent out via mail when a
    node is found to have crashed (the "crash report"). While the
    information is mainly intended for debugging, it can also help to find
    misconfigurations (which are usually, but not always, caught by the
    check_ command)."""
    results = self.zeekctl.diag(node_list=args)

    # Each node gets a "[node]" heading followed by its output; the per-node
    # success flag is not consulted here.
    for node, _success, report in results.get_node_output():
        heading = f"[{node}]"
        self.info(heading)
        self.info(report)

    return results.ok
def do_cron(self, args):
    """- [enable|disable|?] | [--no-watch]
    This command has two modes of operation. Without arguments (or just
    ``--no-watch``), it performs a set of maintenance tasks, including
    the logging of various statistical information, expiring old log
    files, checking for dead hosts, and restarting nodes which terminated
    unexpectedly (the latter can be suppressed with the ``--no-watch``
    option if no auto-restart is desired). This mode is intended to be
    executed regularly via *cron*, as described in the installation
    instructions. While not intended for interactive use, no harm will be
    caused by executing the command manually: all the maintenance tasks
    will then just be performed one more time.
    The second mode is for interactive usage and determines if the regular
    tasks are indeed performed when ``zeekctl cron`` is executed. In other
    words, even with ``zeekctl cron`` in your crontab, you can still
    temporarily disable it by running ``cron disable``, and
    then later reenable with ``cron enable``. This can be helpful while
    working, e.g., on the ZeekControl configuration and ``cron`` would
    interfere with that. ``cron ?`` can be used to query the current state.
    """
    # First mode: run the maintenance tasks. Watching stays on unless the
    # argument is exactly "--no-watch".
    if not args or args == "--no-watch":
        self.zeekctl.cron(args != "--no-watch")
        return True

    # Second mode: switch or query the cron state.
    if args == "enable":
        self.zeekctl.setcronenabled(True)
    elif args == "disable":
        self.zeekctl.setcronenabled(False)
    elif args == "?":
        cron_state = "enabled" if self.zeekctl.cronenabled() else "disabled"
        self.info("cron " + cron_state)
    else:
        # Anything else is reported as an error and is the only False result.
        self.error("invalid cron argument")
        return False

    return True
def do_check(self, args):
    """- [<nodes>]
    Verifies a modified configuration in terms of syntactical correctness
    (most importantly correct syntax in policy scripts).
    Note that this command checks the site-specific policy files as found
    in SitePolicyPath_ rather than the ones installed by the install_
    command. Therefore, new errors in a policy script can be detected
    before affecting currently running nodes, even when they need to be
    restarted.
    This command should be executed for each configuration change *before*
    using install_ to put the change into place. However, when using the
    deploy command there is no need to first run check, because deploy
    automatically runs check before installing the policy scripts."""
    results = self.zeekctl.check(node_list=args)

    for node, passed, output in results.get_node_output():
        if not passed:
            # Failures print a verdict plus the node's output on stderr.
            self.info(f"{node} scripts failed.")
            self.err(output)
            continue
        self.info(f"{node} scripts are ok.")

    return results.ok
def do_cleanup(self, args):
    """- [--all] [<nodes>]
    Clears the nodes' spool directories, but only for nodes that are not
    running. This implies that their persistent state is flushed. Nodes
    that were crashed are reset into the "stopped" state.
    If ``--all`` is specified, this command also removes the content of
    the node's TmpDir_, in particular deleting any data
    potentially saved there for reference from previous crashes.
    Generally, if you want to reset the installation back into a clean
    state, you can first stop_ all nodes, then execute
    ``cleanup --all``, then install_, and finally start_ all nodes
    again."""
    option = "--all"
    cleantmp = False

    # Accept the option only as a complete leading word. A bare prefix test
    # would also match e.g. "--allworkers" and silently turn it into
    # "--all workers", additionally wiping TmpDir.
    remainder = args[len(option):]
    if args.startswith(option) and (not remainder or remainder[0].isspace()):
        cleantmp = True
        # Whatever follows the option is the node list.
        args = remainder

    self.info("cleaning up nodes ...")
    results = self.zeekctl.cleanup(cleantmp=cleantmp, node_list=args)
    return results.ok
def do_capstats(self, args):
    """- [<nodes>] [<interval>]
    Determines the current load on the network interfaces monitored by
    each of the given worker nodes. The load is measured over the
    specified interval (in seconds), or by default over 10 seconds. This
    command uses the :doc:`capstats<../../components/capstats/README>`
    tool, which is installed along with ``zeekctl``."""
    interval = 10
    words = args.split()

    # A trailing integer is the interval (at least 1); if the last word is
    # not an integer it stays part of the node list.
    if words:
        try:
            interval = max(1, int(words[-1]))
        except ValueError:
            pass
        else:
            words = words[:-1]
    node_list = " ".join(words)

    def format_row(tag, vals):
        # Missing "kpps"/"mbps" values are shown as empty columns.
        return "{:<21s} {:<10s} {:s}".format(
            tag, str(vals.get("kpps", "")), str(vals.get("mbps", ""))
        )

    results = self.zeekctl.capstats(interval=interval, node_list=node_list)
    nodedata = results.get_node_data()

    if not nodedata:
        self.error(
            "No network interfaces suitable for use with capstats were found."
        )
        return results.ok

    # Table header followed by a 40-character rule.
    self.info(
        "{:<21s} {:<10s} {:<10s} ({:d}s average)\n{:s}".format(
            "Interface", "kpps", "mbps", interval, "-" * 40
        )
    )

    totals = None
    for node, success, vals in nodedata:
        if not success:
            self.err(vals["output"])
            continue
        if str(node) == "$total":
            # The "$total" entry is held back and printed after all rows.
            totals = vals
        else:
            self.info(format_row(f"{node.host}/{node.interface}", vals))

    if totals:
        self.info("")
        self.info(format_row("Total", totals))

    return results.ok
def do_df(self, args):
    """- [<nodes>]
    Reports the amount of disk space available on the nodes. Shows only
    paths relevant to the zeekctl installation."""
    results = self.zeekctl.df(node_list=args)

    # Header: two blank leading columns, then the three value columns.
    header = "{:>27s} {:>15s} {:<5s} {:<5s} {:<5s}".format(
        "", "", "total", "avail", "capacity"
    )
    self.info(header)

    for node, _success, dfs in results.get_node_data():
        for key, diskinfo in sorted(dfs.items()):
            if key != "FAIL":
                nodehost = f"{node.name}/{node.host}"
                self.info(
                    f"{nodehost:>28s} {diskinfo.fs:>15s} {util.number_unit_str(diskinfo.total):<5s} {util.number_unit_str(diskinfo.available):<5s} {diskinfo.percent:<5.1f}%"
                )
            else:
                # Under the "FAIL" key the value is reported as the failure
                # text instead of being formatted as disk information.
                self.error(f"df helper failed on {node}: {diskinfo}")

    return results.ok
def do_print(self, args):
    """- <id> [<nodes>]
    Reports the *current* live value of the given Zeek script ID on all of
    the specified nodes (which obviously must be running). This can for
    example be useful to (1) check that policy scripts are working as
    expected, or (2) confirm that configuration changes have in fact been
    applied. Note that IDs defined inside a Zeek namespace must be
    prefixed with ``<namespace>::`` (e.g.,
    ``print Log::enable_remote_logging``)."""
    words = args.split()
    if not words:
        # Raised directly rather than from inside an IndexError handler, so
        # the syntax error carries no unrelated exception context.
        raise CommandSyntaxError("no id given to print")

    # First word is the script ID; the rest is the node list.
    zeek_id = words[0]
    node_list = " ".join(words[1:])

    results = self.zeekctl.print_id(id=zeek_id, node_list=node_list)

    for node, success, msg in results.get_node_output():
        if success:
            # A successful message is "<name>\n<value>". partition() keeps
            # that split but yields an empty value instead of raising
            # IndexError when the message contains no newline.
            name, _sep, value = msg.partition("\n")
            self.info(f"{node:>12s} {name} = {value}")
        else:
            self.err(f"{node:>12s} <error: {msg}>")

    return results.ok
def do_peerstatus(self, args):
    """- [<nodes>]
    Primarily for debugging, ``peerstatus`` reports statistics about the
    network connections cluster nodes are using to communicate with other
    nodes."""
    results = self.zeekctl.peerstatus(node_list=args)

    for node, success, msg in results.get_node_output():
        if not success:
            # Failures go to stderr on a single line.
            self.err(f"{node:>11s} <error: {msg}>")
            continue
        # Successes print the node name, then the message on the next line.
        self.info(f"{node:>11s}\n{msg}")

    return results.ok
def do_netstats(self, args):
    """- [<nodes>]
    Queries each of the nodes for their current counts of captured and
    dropped packets."""
    results = self.zeekctl.netstats(node_list=args)

    for node, success, msg in results.get_node_output():
        if not success:
            # Failures go to stderr, with the message wrapped in <error: ...>.
            self.err(f"{node:>11s}: <error: {msg}>")
            continue
        self.info(f"{node:>11s}: {msg}")

    return results.ok
def do_exec(self, args):
    """- <command line>
    Executes the given Unix shell command line on all hosts configured to
    run at least one Zeek instance. This is handy to quickly perform an
    action across all systems."""
    results = self.zeekctl.execute(cmd=args)

    for node, success, output in results.get_node_output():
        # Every output line is shown on its own line behind a "> " marker.
        quoted = "\n> ".join(output.splitlines())
        if success:
            error = " "
        else:
            error = "error"
        self.info(f"[{node.name}/{node.host}] {error}\n> {quoted}")

    return results.ok
def do_scripts(self, args):
    """- [-c] [<nodes>]
    Primarily for debugging Zeek configurations, the ``scripts``
    command lists all the Zeek scripts loaded by each of the nodes in the
    order they will be parsed by the node at startup. The pathnames
    of each script are indented such that it is possible to determine
    from where a script was loaded based on the amount of indentation.
    If ``-c`` is given, the command operates as check_ does: it reads
    the policy files from their *original* location, not the copies
    installed by install_. The latter option is useful to check a
    not yet installed configuration."""
    check = False
    words = args.split()

    # Consume leading words that start with "-"; only "-c" is accepted.
    while words and words[0].startswith("-"):
        opt = words.pop(0)
        if opt != "-c":
            raise CommandSyntaxError(
                f"invalid argument for the scripts command: {opt}"
            )
        # Check non-installed policies.
        check = True

    node_list = " ".join(words)
    results = self.zeekctl.scripts(check=check, node_list=node_list)

    for node, success, output in results.get_node_output():
        if not success:
            self.info(f"{node} scripts failed.")
            self.err(output)
            continue
        self.info(f"{node} scripts are ok.")
        for line in output.splitlines():
            self.info(f" {line}")

    return results.ok
def do_process(self, args):
    """- <trace> [options] [-- <scripts>]
    Runs Zeek offline on a given trace file using the same configuration as
    when running live. It does, however, use the potentially
    not-yet-installed policy files in SitePolicyPath_ and disables log
    rotation. Additional Zeek command line flags and scripts can
    be given (each argument after a ``--`` argument is interpreted as
    a script).
    Upon completion, the command prints a path where the log files can be
    found. Subsequent runs of this command may delete these logs.
    In cluster mode, Zeek is run with *both* manager and worker scripts
    loaded into a single instance. While that doesn't fully reproduce the
    live setup, it is often sufficient for debugging analysis scripts.
    """
    words = args.split()
    if not words:
        raise CommandSyntaxError(
            "the process command requires the pathname of a trace file"
        )

    # The first word is always the trace file, whatever it looks like.
    trace = words[0]

    options = []
    scripts = []
    # Words are collected as options until a "--" switches to scripts.
    bucket = options
    for arg in words[1:]:
        if arg != "--":
            bucket.append(arg)
            continue
        if bucket is scripts:
            raise CommandSyntaxError(
                'cannot parse the arguments of the process command (too many "--")'
            )
        bucket = scripts

    results = self.zeekctl.process(trace, options, scripts)
    return results.ok
def completedefault(self, text, line, begidx, endidx):
    """Offer node and node-group names as completions for *text*.

    Completions are produced only when the first word of *line* is a command
    that takes a "<nodes>" argument; otherwise the result is an empty list.
    *begidx* and *endidx* are accepted but not used.
    """
    # Commands that take a "<nodes>" argument.
    # NOTE(review): "update" has no do_update in this class -- presumably it
    # is provided elsewhere (e.g. by a plugin); confirm.
    nodes_cmds = frozenset(
        (
            "capstats",
            "check",
            "cleanup",
            "df",
            "diag",
            "netstats",
            "print",
            "restart",
            "start",
            "status",
            "stop",
            "top",
            "update",
            "peerstatus",
            "scripts",
        )
    )

    words = line.split()
    if not words:
        return []
    if words[0] not in nodes_cmds:
        return []

    # Groups are listed ahead of individual node names.
    candidates = self.zeekctl.node_groups() + self.zeekctl.node_names()
    return [name for name in candidates if name.startswith(text)]
def do_help(self, args):
    """Prints a brief summary of all commands understood by the shell."""
    # One help line per plugin command; *args* itself is not used.
    plugin_lines = []
    for name, cmd_args, descr in self.zeekctl.plugins.allCustomCommands():
        usage = f"{name} {cmd_args}" if cmd_args else name
        plugin_lines.append(f" {usage:<32s} - {descr}\n")

    # The plugin section, heading included, appears only if there are any
    # plugin commands.
    plugin_help = ""
    if plugin_lines:
        plugin_help = "\nCommands provided by plugins:\n\n" + "".join(plugin_lines)

    self.info(
        f"""
ZeekControl Version {version.VERSION}
capstats [<nodes>] [<secs>] - Report interface statistics with capstats
check [<nodes>] - Check configuration before installing it
cleanup [--all] [<nodes>] - Delete working dirs (flush state) on nodes
config - Print zeekctl configuration
cron [--no-watch] - Perform jobs intended to run from cron
cron enable|disable|? - Enable/disable "cron" jobs
deploy - Check, install, and restart
df [<nodes>] - Print nodes' current disk usage
diag [<nodes>] - Output diagnostics for nodes
exec <shell cmd> - Execute shell command on all hosts
exit - Exit shell
install - Update zeekctl installation/configuration
netstats [<nodes>] - Print nodes' current packet counters
nodes - Print node configuration
peerstatus [<nodes>] - Print status of nodes' remote connections
print <id> [<nodes>] - Print values of script variable at nodes
process <trace> [<op>] [-- <sc>] - Run Zeek with options and scripts on trace
quit - Exit shell
restart [--clean] [<nodes>] - Stop and then restart processing
scripts [-c] [<nodes>] - List the Zeek scripts the nodes will load
start [<nodes>] - Start processing
status [<nodes>] - Summarize node status
stop [<nodes>] - Stop processing
top [<nodes>] - Show Zeek processes ala top
{plugin_help}"""
    )
def main():
    """Run zeekctl as a one-shot command or as an interactive shell.

    With command-line arguments, they are joined into a single command line
    and executed once; without any, the interactive command loop is started.

    Returns a value suitable for sys.exit(): 0 for the informational
    options, 1 if the shell could not be constructed, otherwise the negation
    of the command result (so a successful command yields a falsy value).
    """
    argv = sys.argv

    # Undocumented option to print the documentation.
    if len(argv) == 3 and argv[1] == "--print-doc":
        from ZeekControl import printdoc

        printdoc.print_zeekctl_docs(argv[2], ZeekCtlCmdLoop)
        return 0

    if len(argv) == 2 and argv[1] == "--version":
        print(f"ZeekControl version {version.VERSION}")
        return 0

    interactive = len(argv) <= 1

    # The command name decides whether the "install needed" warning is shown.
    # Take it from the first argument whenever there is one, so that commands
    # given with arguments (e.g. "cron --no-watch", "install --local") skip
    # the warning just like their bare forms do.
    cmd = "" if interactive else argv[1]

    try:
        loop = ZeekCtlCmdLoop(ZeekCtl, interactive, cmd)
    except ZeekControlError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    try:
        if interactive:
            cmdsuccess = loop.cmdloop(
                f'\nWelcome to ZeekControl {version.VERSION}\n\nType "help" for help.\n'
            )
        else:
            cmdline = " ".join(argv[1:])
            loop.precmd(cmdline)
            cmdsuccess = loop.onecmd(cmdline)
            # Only reached when the command did not raise.
            loop.postcmd(False, cmdline)
    except ZeekControlError as e:
        cmdsuccess = False
        print(f"Error: {e}", file=sys.stderr)
    except KeyboardInterrupt:
        cmdsuccess = False
    finally:
        # Always give the controller the chance to shut down cleanly.
        loop.finish()

    return not cmdsuccess
# Run the shell only when this file is executed as a script; main()'s return
# value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())