Patrick Kelley 8fd444092b initial
2025-05-07 15:35:15 -04:00

359 lines
14 KiB
Python

"""This module provides Broker-based communication with a Zeek cluster controller."""
import ssl
import time
import websocket
from .brokertypes import (
DataMessage,
HandshakeAckMessage,
HandshakeMessage,
ZeekEvent,
)
from .config import CONFIG
from .consts import CONTROLLER_TOPIC
from .events import Registry
from .logs import LOG
from .ssl import get_websocket_sslopt
from .utils import make_uuid
class Error(Exception):
    """Base class for every exception raised through use of Controller objects."""
class ConfigError(Error):
    """Raised when the WebSocket object cannot be configured."""
class UsageError(Error):
    """Raised when operations on a Controller object happen in an invalid order."""
class Controller:
    """A class managing a connection to the Zeek cluster controller."""

    def __init__(
        self,
        controller_host=None,
        controller_port=None,
        controller_topic=CONTROLLER_TOPIC,
    ):
        """Controller connection constructor.

        Host and port fall back to the "controller" section of the
        configuration when not provided. The websocket object is created
        here but not connected; see connect().

        Args:
            controller_host (str): controller host name or address.

            controller_port (int): controller port, in the range 1-65535.

            controller_topic (str): the topic to subscribe and publish to.

        Raises:
            ConfigError: in case of trouble with the connection settings,
                including a configured port that is not a valid integer.
        """
        self.controller_host = controller_host or CONFIG.get("controller", "host")
        self.controller_topic = controller_topic
        self.controller_broker_id = None  # Defined in Handshake ACK message

        # A malformed port in the configuration surfaces as ValueError from
        # getint(). Report it as ConfigError like every other settings
        # problem, so callers need to handle only one exception type.
        try:
            self.controller_port = controller_port or CONFIG.getint(
                "controller",
                "port",
            )
        except ValueError as err:
            raise ConfigError(f"invalid controller port setting: {err}") from err

        try:
            if self.controller_port < 1 or self.controller_port > 65535:
                raise ValueError(
                    f"controller port number {self.controller_port} outside valid range",
                )

            disable_ssl = CONFIG.getboolean("ssl", "disable")
            proto = "ws" if disable_ssl else "wss"
            remote = f"{self.controller_host}:{self.controller_port}"
            self.wsock_url = f"{proto}://{remote}/v1/messages/json"

            sslopt = None if disable_ssl else get_websocket_sslopt()
            self.wsock = websocket.WebSocket(sslopt=sslopt)
        except (ValueError, OSError, ssl.SSLError) as err:
            raise ConfigError(
                f"cannot configure connection to "
                f"{self.controller_host}:{self.controller_port}: {err}",
            ) from err

    def connect(self):
        """Connect to the configured controller.

        This takes the controller coordinates from the zeek-client configuration
        (or the arguments passed to the constructor, if any) and establishes a
        fully peered connection. "Fully peered" here means that the object first
        establishes the websocket connection, potentially wrapped in TLS as per
        the TLS-specific configuration settings, and then conducts the
        Broker-level handshake. The latter establishes the Controller's Broker
        ID and our topic subscriptions.

        Returns True if peering completes successfully, False otherwise, with
        according messages written to the log.
        """
        LOG.info(
            "connecting to controller %s:%s",
            self.controller_host,
            self.controller_port,
        )

        attempts = CONFIG.getint("client", "peering_attempts")
        retry_delay = CONFIG.getfloat("client", "peering_retry_delay_secs")
        handshake = HandshakeMessage([self.controller_topic])

        # We accommodate problems during connect() and the Broker handshake,
        # attempting these a total of client.peering_attempts times. That is,
        # if we use 10 attempts and connect() takes 3 attempts, 7 attempts
        # remain for the handshake. Since the kinds of problems that may arise
        # in either stage in the (web)socket operations overlap substantially,
        # we use a single function that checks them all:
        def wsock_operation(op, stage):
            # The attempt budget is shared across both stages, hence nonlocal.
            nonlocal attempts

            while attempts > 0:
                try:
                    attempts -= 1
                    return op()
                except websocket.WebSocketTimeoutException:
                    # Timeouts are retried until the budget runs out.
                    time.sleep(retry_delay)
                    continue
                except websocket.WebSocketException as err:
                    LOG.error(
                        "websocket error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False
                except ConnectionRefusedError:
                    # We don't consider these fatal since they can happen
                    # naturally during tests and other automated setups where
                    # it's beneficial to keep trying. Also, this is a subclass
                    # of OSError, so needs to come before it:
                    LOG.debug(
                        "connection refused for controller %s:%s",
                        self.controller_host,
                        self.controller_port,
                    )
                    time.sleep(retry_delay)
                    continue
                except ssl.SSLError as err:
                    # Same here, likewise a subclass of OSError:
                    LOG.error(
                        "socket TLS error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False
                except OSError as err:
                    # From socket.py docs: "Errors related to socket or address
                    # semantics raise OSError or one of its subclasses".
                    LOG.error(
                        "socket error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False
                except Exception as err:
                    LOG.exception(
                        "unexpected error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False

            # Only reached once the attempt budget is used up (or was never
            # positive to begin with); either way the operation did not succeed.
            if attempts == 0:
                LOG.error(
                    "websocket connection to %s:%s timed out in %s",
                    self.controller_host,
                    self.controller_port,
                    stage,
                )
            return False

        def connect_op():
            # The connect timeout doubles as the per-attempt retry interval.
            self.wsock.connect(self.wsock_url, timeout=retry_delay)
            self.wsock.send(handshake.serialize())
            return True

        def handshake_op():
            rawdata = self.wsock.recv()
            try:
                msg = HandshakeAckMessage.unserialize(rawdata)
            except TypeError as err:
                LOG.error(
                    "protocol data error with controller %s:%s: %s, raw data: %s",
                    self.controller_host,
                    self.controller_port,
                    err,
                    rawdata,
                )
                return False

            # A set Broker ID is what publish()/receive() treat as "peered".
            self.controller_broker_id = msg.endpoint
            LOG.info(
                "peered with controller %s:%s",
                self.controller_host,
                self.controller_port,
            )
            return True

        if not wsock_operation(connect_op, "connect()"):
            return False
        if not wsock_operation(handshake_op, "handshake"):
            return False

        return True

    def publish(self, event):
        """Publishes the given event to the controller topic.

        Raises UsageError when invoked without an earlier connect().

        Args:
            event (zeekclient.event.Event): the event to publish.
        """
        if self.controller_broker_id is None:
            raise UsageError("cannot publish without established peering")

        msg = DataMessage(self.controller_topic, event.to_brokertype())
        self.wsock.send(msg.serialize())

    def receive(self, timeout_secs=None, filter_pred=None):
        """Receive an event from the controller's event subscriber.

        Raises UsageError when invoked without an earlier connect().

        Args:
            timeout_secs (int): number of seconds before we time out.
                A falsy value (None or 0) selects the configured
                client.request_timeout_secs. Any other value is handed
                to the websocket's settimeout() as is. The websocket's
                previous timeout is restored before returning.

            filter_pred: a predicate function for filtering out unacceptable
                events. The function takes a received event as only input,
                returning True if the event is acceptable for returning to the
                `receive()` caller, and False otherwise. When not provided,
                any received event is acceptable. When the predicate returns
                false, the wait for a suitable event continues, subject to the
                same overall timeout.

        Returns:
            A tuple of (1) an instance of one of the Event classes defined for
            the client, or None if timeout_secs passed before anything arrived,
            and (2) a string indicating any occurring errors. The string is
            empty when no error occurs.
        """
        if self.controller_broker_id is None:
            raise UsageError("cannot receive without established peering")

        timeout = timeout_secs or CONFIG.getint("client", "request_timeout_secs")
        old_timeout = self.wsock.gettimeout()

        try:
            self.wsock.settimeout(timeout)
            remote = f"{self.controller_host}:{self.controller_port}"

            while True:
                # Reading the event proceeds in three steps:
                # (1) read data from the websocket
                # (2) ensure it's a data message
                # (3) try to extract data message payload as event
                try:
                    msg = DataMessage.unserialize(self.wsock.recv())
                except TypeError as err:
                    return (
                        None,
                        f"protocol data error with controller {remote}: {err}",
                    )
                except websocket.WebSocketTimeoutException:
                    return (
                        None,
                        f"websocket connection to {remote} timed out",
                    )
                except Exception as err:
                    LOG.exception("unexpected error")
                    return (
                        None,
                        f"unexpected error with controller {remote}: {err}",
                    )

                try:
                    # Events are a specially laid-out vector of vectors:
                    # https://docs.zeek.org/projects/broker/en/current/web-socket.html#encoding-of-zeek-events
                    evt = ZeekEvent.from_vector(msg.data)
                    # Turn Broker-level event data into a zeekclient.event.Event:
                    res = Registry.make_event(evt.name, *evt.args)
                    if res is not None and (filter_pred is None or filter_pred(res)):
                        return res, ""
                except TypeError:
                    return None, (
                        f"protocol data error with controller {remote}: "
                        f"invalid event data, {repr(msg.data)}"
                    )

                # This wasn't the event type we wanted, try again.
        finally:
            # Leave the websocket's timeout as we found it, whatever happened.
            self.wsock.settimeout(old_timeout)

    def transact(self, request_type, response_type, *request_args, reqid=None):
        """Pairs publishing a request event with receiving its response event.

        This is a wrapper around :meth:`.Controller.publish()` with subsequent
        :meth:`.Controller.receive()`, with automatic provision of a request ID
        in the request event, and validation of a matching request ID in the
        response. Mismatching response events are ignored, and lack of a
        suitable event in the timeout period leads to an empty result with
        according error message, just like :meth:`.Controller.receive()`.

        The function works only with request and response event types that take
        a "reqid" string as first argument. The function verifies this lightly,
        just by looking at the name of the first argument. Event types without
        any arguments fail this check as well. See `zeekclient.events` for
        suitable event types.

        Raises UsageError when invoked without an earlier connect().

        Args:
            request_type (zeekclient.event.Event class): the request event type.

            response_type (zeekclient.event.Event class): the response event type.

            request_args: any event arguments in addition to the initial "reqid" string.

            reqid (str): the request ID to use in the request event, and expect
                in the response event. When omitted, the function produces its
                own ID.

        Returns:
            The same as Controller.receive(): tuple of an event instance
            and a string indicating any error.
        """
        # Verify that the first arguments of the event types are actually a
        # request ID -- we just look at the name. Slicing instead of indexing
        # keeps argument-less event types from raising IndexError here; they
        # get the regular type-error result instead.
        for evt_type in (request_type, response_type):
            if tuple(evt_type.ARG_NAMES[:1]) != ("reqid",):
                return (
                    None,
                    f"type error: event type {evt_type.__name__} does not have request ID",
                )

        if reqid is None:
            reqid = make_uuid()

        evt = request_type(reqid, *request_args)
        self.publish(evt)

        def is_response(evt):
            # Accept only events of the response type that carry our request ID.
            try:
                return isinstance(evt, response_type) and evt.reqid.to_py() == reqid
            except AttributeError:
                return False

        return self.receive(filter_pred=is_response)