359 lines
14 KiB
Python
359 lines
14 KiB
Python
"""This module provides Broker-based communication with a Zeek cluster controller."""
|
|
|
|
import ssl
|
|
import time
|
|
|
|
import websocket
|
|
|
|
from .brokertypes import (
|
|
DataMessage,
|
|
HandshakeAckMessage,
|
|
HandshakeMessage,
|
|
ZeekEvent,
|
|
)
|
|
from .config import CONFIG
|
|
from .consts import CONTROLLER_TOPIC
|
|
from .events import Registry
|
|
from .logs import LOG
|
|
from .ssl import get_websocket_sslopt
|
|
from .utils import make_uuid
|
|
|
|
|
|
class Error(Exception):
    """Base class for all exceptions arising from use of Controller objects."""
|
|
|
|
|
|
class ConfigError(Error):
    """Raised when the WebSocket object cannot be configured."""
|
|
|
|
|
|
class UsageError(Error):
    """Raised when operations on a Controller object occur in an invalid order."""
|
|
|
|
|
|
class Controller:
    """A class managing a connection to the Zeek cluster controller.

    The object wraps a websocket to the controller. After a successful
    connect(), events can be sent with publish(), read with receive(), or
    exchanged as request/response pairs with transact().
    """

    def __init__(
        self,
        controller_host=None,
        controller_port=None,
        controller_topic=CONTROLLER_TOPIC,
    ):
        """Controller connection constructor.

        This prepares the websocket object and its URL, but does not connect.

        Args:
            controller_host: the controller's host. When omitted (or falsy),
                the "controller.host" configuration value is used.

            controller_port: the controller's port. When omitted (or falsy),
                the "controller.port" configuration value is used.

            controller_topic: the topic to subscribe and publish to.

        Raises:
            ConfigError: in case of trouble with the connection settings,
                such as a port number outside of 1-65535 or a problem
                establishing the TLS options.
        """
        self.controller_host = controller_host or CONFIG.get("controller", "host")
        self.controller_port = controller_port or CONFIG.getint("controller", "port")
        self.controller_topic = controller_topic
        self.controller_broker_id = None  # Defined in Handshake ACK message

        try:
            if not 1 <= self.controller_port <= 65535:
                raise ValueError(
                    f"controller port number {self.controller_port} outside valid range",
                )

            disable_ssl = CONFIG.getboolean("ssl", "disable")

            proto = "ws" if disable_ssl else "wss"
            remote = f"{self.controller_host}:{self.controller_port}"
            self.wsock_url = f"{proto}://{remote}/v1/messages/json"

            sslopt = None if disable_ssl else get_websocket_sslopt()
            self.wsock = websocket.WebSocket(sslopt=sslopt)
        except (ValueError, OSError, ssl.SSLError) as err:
            raise ConfigError(
                f"cannot configure connection to "
                f"{self.controller_host}:{self.controller_port}: {err}",
            ) from err

    def connect(self):
        """Connect to the configured controller.

        This takes the controller coordinates from the zeek-client configuration
        (or the arguments passed to the constructor, if any) and establishes a
        fully peered connection. "Fully peered" here means that the object first
        establishes the websocket connection, potentially wrapped in TLS as per
        the TLS-specific configuration settings, and then conducts the
        Broker-level handshake. The latter establishes the Controller's Broker
        ID and our topic subscriptions.

        Returns True if peering completes successfully, False otherwise, with
        according messages written to the log.
        """
        LOG.info(
            "connecting to controller %s:%s",
            self.controller_host,
            self.controller_port,
        )

        attempts = CONFIG.getint("client", "peering_attempts")
        retry_delay = CONFIG.getfloat("client", "peering_retry_delay_secs")

        handshake = HandshakeMessage([self.controller_topic])

        # We accommodate problems during connect() and the Broker handshake,
        # attempting these a total of client.peering_attempts times. That is,
        # if we use 10 attempts and connect() takes 3 attempts, 7 attempts
        # remain for the handshake. Since the kinds of problems that may arise
        # in either stage in the (web)socket operations overlap substantially,
        # we use a single function that checks them all:
        def wsock_operation(op, stage):
            """Run op() with retries drawn from the shared attempts budget.

            Timeouts and refused connections are retried after retry_delay
            seconds; any other exception is logged and ends the operation.
            Returns op()'s result, or False on failure.
            """
            nonlocal attempts

            while attempts > 0:
                try:
                    attempts -= 1
                    return op()
                except websocket.WebSocketTimeoutException:
                    time.sleep(retry_delay)
                    continue
                except websocket.WebSocketException as err:
                    LOG.error(
                        "websocket error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False
                except ConnectionRefusedError:
                    # We don't consider these fatal since they can happen
                    # naturally during tests and other automated setups where
                    # it's beneficial to keep trying. Also, this is a subclass
                    # of OSError, so needs to come before it:
                    LOG.debug(
                        "connection refused for controller %s:%s",
                        self.controller_host,
                        self.controller_port,
                    )
                    time.sleep(retry_delay)
                    continue
                except ssl.SSLError as err:
                    # Same here, likewise a subclass of OSError:
                    LOG.error(
                        "socket TLS error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False
                except OSError as err:
                    # From socket.py docs: "Errors related to socket or address
                    # semantics raise OSError or one of its subclasses".
                    LOG.error(
                        "socket error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False
                except Exception as err:
                    LOG.exception(
                        "unexpected error in %s with controller %s:%s: %s",
                        stage,
                        self.controller_host,
                        self.controller_port,
                        err,
                    )
                    return False

            # Falling out of the loop means the retry budget is used up.
            if attempts == 0:
                LOG.error(
                    "websocket connection to %s:%s timed out in %s",
                    self.controller_host,
                    self.controller_port,
                    stage,
                )
            return False

        def connect_op():
            """Open the websocket and send our Broker handshake."""
            self.wsock.connect(self.wsock_url, timeout=retry_delay)
            self.wsock.send(handshake.serialize())
            return True

        def handshake_op():
            """Read the handshake ACK and record the controller's Broker ID."""
            rawdata = self.wsock.recv()
            try:
                msg = HandshakeAckMessage.unserialize(rawdata)
            except TypeError as err:
                LOG.error(
                    "protocol data error with controller %s:%s: %s, raw data: %s",
                    self.controller_host,
                    self.controller_port,
                    err,
                    rawdata,
                )
                return False

            self.controller_broker_id = msg.endpoint
            LOG.info(
                "peered with controller %s:%s",
                self.controller_host,
                self.controller_port,
            )
            return True

        if not wsock_operation(connect_op, "connect()"):
            return False
        if not wsock_operation(handshake_op, "handshake"):
            return False

        return True

    def publish(self, event):
        """Publishes the given event to the controller topic.

        Raises UsageError when invoked without an earlier connect().

        Args:
            event (zeekclient.event.Event): the event to publish.
        """
        if self.controller_broker_id is None:
            raise UsageError("cannot publish without established peering")

        msg = DataMessage(self.controller_topic, event.to_brokertype())
        self.wsock.send(msg.serialize())

    def receive(self, timeout_secs=None, filter_pred=None):
        """Receive an event from the controller's event subscriber.

        Raises UsageError when invoked without an earlier connect().

        Args:
            timeout_secs (int): number of seconds before we time out. When
                omitted (None) or 0, client.request_timeout_secs applies.
                Negative values mean no timeout, i.e. the call blocks until
                a suitable event arrives or an error occurs.

            filter_pred: a predicate function for filtering out unacceptable
                events. The function takes a received event as only input,
                returning True if the event is acceptable for returning to the
                `receive()` caller, and False otherwise. When not provided,
                any received event is acceptable. When the predicate returns
                false, the wait for a suitable event continues, subject to the
                same overall timeout.

        Returns:
            A tuple of (1) an instance of one of the Event classes defined for
            the client, or None if timeout_secs passed before anything arrived,
            and (2) a string indicating any occurring errors. The string is
            empty when no error occurs.
        """
        if self.controller_broker_id is None:
            raise UsageError("cannot receive without established peering")

        timeout = timeout_secs or CONFIG.getint("client", "request_timeout_secs")
        if timeout is not None and timeout < 0:
            # The socket layer rejects negative timeouts; None requests the
            # documented "no timeout" behavior instead.
            timeout = None

        # A single deadline covers the whole call, so events rejected by
        # filter_pred do not restart the clock.
        deadline = None if timeout is None else time.monotonic() + timeout

        remote = f"{self.controller_host}:{self.controller_port}"
        timeout_msg = f"websocket connection to {remote} timed out"
        old_timeout = self.wsock.gettimeout()

        try:
            if deadline is None:
                self.wsock.settimeout(None)

            while True:
                if deadline is not None:
                    remaining = deadline - time.monotonic()
                    if remaining <= 0:
                        return None, timeout_msg
                    self.wsock.settimeout(remaining)

                # Reading the event proceeds in three steps:
                # (1) read data from the websocket
                # (2) ensure it's a data message
                # (3) try to extract data message payload as event
                try:
                    msg = DataMessage.unserialize(self.wsock.recv())
                except TypeError as err:
                    return (
                        None,
                        f"protocol data error with controller {remote}: {err}",
                    )
                except websocket.WebSocketTimeoutException:
                    return None, timeout_msg
                except Exception as err:
                    LOG.exception("unexpected error")
                    return (
                        None,
                        f"unexpected error with controller {remote}: {err}",
                    )

                try:
                    # Events are a specially laid-out vector of vectors:
                    # https://docs.zeek.org/projects/broker/en/current/web-socket.html#encoding-of-zeek-events
                    evt = ZeekEvent.from_vector(msg.data)

                    # Turn Broker-level event data into a zeekclient.event.Event:
                    res = Registry.make_event(evt.name, *evt.args)
                    if res is not None and (filter_pred is None or filter_pred(res)):
                        return res, ""
                except TypeError:
                    return None, (
                        f"protocol data error with controller {remote}: "
                        f"invalid event data, {msg.data!r}"
                    )

                # This wasn't the event type we wanted, try again.
        finally:
            # Leave the socket's timeout the way we found it.
            self.wsock.settimeout(old_timeout)

    def transact(self, request_type, response_type, *request_args, reqid=None):
        """Pairs publishing a request event with receiving its response event.

        This is a wrapper around :meth:`.Controller.publish()` with subsequent
        :meth:`.Controller.receive()`, with automatic provision of a request ID
        in the request event, and validation of a matching request ID in the
        response. Mismatching response events are ignored, and lack of a
        suitable event in the timeout period leads to an empty result with
        according error message, just like :meth:`.Controller.receive()`.

        The function works only with request and response event types that take
        a "reqid" string as first argument. The function verifies this lightly,
        just by looking at the name of the first argument. Event types without
        any arguments fail this check as well. See `zeekclient.events` for
        suitable event types.

        Raises UsageError when invoked without an earlier connect().

        Args:
            request_type (zeekclient.event.Event class): the request event type.

            response_type (zeekclient.event.Event class): the response event type.

            request_args: any event arguments in addition to the initial "reqid" string.

            reqid (str): the request ID to use in the request event, and expect
                in the response event. When omitted, the function produces its
                own ID.

        Returns:
            The same as Controller.receive(): tuple of an event instance
            and a string indicating any error.
        """
        # Verify that the first arguments of the event types are actually a
        # request ID -- we just look at the name. An empty argument list is
        # reported the same way instead of failing with an IndexError:
        for evt_type in (request_type, response_type):
            arg_names = evt_type.ARG_NAMES
            if not arg_names or arg_names[0] != "reqid":
                return (
                    None,
                    f"type error: event type {evt_type.__name__} "
                    f"does not have request ID",
                )

        if reqid is None:
            reqid = make_uuid()

        evt = request_type(reqid, *request_args)
        self.publish(evt)

        def is_response(evt):
            """Accept only response_type events that carry our request ID."""
            try:
                return isinstance(evt, response_type) and evt.reqid.to_py() == reqid
            except AttributeError:
                return False

        return self.receive(filter_pred=is_response)
|