#!/usr/bin/python3

# Copyright (c) 2026, Oracle and/or its affiliates.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, see <https://www.gnu.org/licenses/>.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.

# This script is run when the kmon service starts.

"""
kmon_server

Long-running daemon for kmon service that manages kernel monitors driven by dtrace scripts.

Responsibilities:
  1) Read configuration and instantiate configured kernel monitors.
  2) Start monitors that are enabled by default on service startup.
  3) Create and listen on a Unix domain socket for:
       - administrative commands from kmonadm
       - event notifications from dtrace/kmon_notify
  4) Log monitor events and failures (fallback to syslog).

IPC / Protocol (Unix domain socket):
  - Socket path: /var/run/oled/kmon/kmon_socket
  - Incoming message type is determined by the first key:
      * "COMMAND=..." (admin request from kmonadm)
      * "EVENT=..."   (event notification from dtrace/kmon_notify)
  - Admin response format:
      "<code>!!!<message>"
    where code is an integer status and message is human-readable string.

Event files:
  - Runtime directory for event files: /var/run/oled/kmon/events
  - Each monitor creates its event notification file with restrictive permissions.

Monitor execution model:
  - Each monitor runs its dtrace script in a dedicated background thread.
  - 'active' indicates the monitor is enabled/managed, not necessarily that dtrace
    is currently running; after an event triggers, the dtrace script exits and is
    restarted after the configured interval.

Operational notes:
  - The server loop is intended to run until the service is stopped.
"""

import re
import os
import threading
import time
import subprocess
import logging
import logging.handlers
import socket
import sys
import fcntl
from pathlib import Path

# Path of the Unix domain socket the server listens on; it carries both admin
# commands (kmonadm) and event notifications (kmon_notify)
SOCKET_PATH = "/var/run/oled/kmon/kmon_socket"

# Runtime directory that holds the per-monitor event notification files
EVENTS_BASE_PATH = "/var/run/oled/kmon/events"

# kernel monitor
class KMON:
    """
    Represents one configured kernel monitor.

    A monitor is defined by:
      - name: stable identifier (also used for display and event file naming)
      - enabled: whether it should be activated automatically on service start
      - interval: minimum restart delay; after an event triggers and the dtrace script exits,
                  the monitor sleeps for 'interval' seconds before starting dtrace again
      - dpath: full path to the dtrace script to execute

    State model:
      - kmon_active: monitor is activated (set by activate(), cleared when the
                     worker thread ends or deactivate() completes)
      - dtrace_pid:  PID of the currently running dtrace process (None otherwise)
      - kmon_derror: indicates an error occurred when running dtrace
      - command_terminating: set when a stop request is in progress to coordinate shutdown
    """

    # seconds between two checks of the dtrace process / the stop request
    _POLL_SECONDS = 0.5

    def __init__(self, name):
        # name, works as ID, of the monitor
        self.kmon_config_name = name

        # flag from configuration indicating if we should make this monitor
        # active by default (on service start)
        self.kmon_config_enabled = False

        # interval in seconds from configuration.
        # the dtrace event for this monitor can be triggered at most once within this interval.
        # implementation: after the dtrace event is triggered, the dtrace script exits with 0.
        # It will be restarted after this specified time.
        # default value is 120 seconds
        self.kmon_config_interval = 120

        # the full path from configuration of the dtrace script
        self.kmon_config_dpath = None

        # status indicating if this monitor is currently active.
        # "active" doesn't mean the dtrace is currently running.
        # check "interval" for more
        self.kmon_active = False

        # error flag (True or False) indicating if an error happened with the dtrace script
        self.kmon_derror = False

        # background thread running activate_thread_func (None until first activation)
        self.activate_thread = None

        # flag indicating if deactivate is in progress
        self.command_terminating = False

        # dtrace process ID
        self.dtrace_pid = None

    def set_enabled(self, enabled: bool):
        """mark if this monitor is enabled"""
        self.kmon_config_enabled = enabled

    def set_interval(self, interval: int):
        """Set interval for this monitor"""
        self.kmon_config_interval = interval

    def set_dpath(self, dpath: str):
        """set the full path of the dtrace to run"""
        self.kmon_config_dpath = dpath

    def _run_dtrace(self, cmd):
        """
        Run the dtrace script once and supervise it until it ends.

        Returns:
          True  -- dtrace exited with 0 (event captured, restart after interval)
          False -- dtrace was killed because a stop was requested, or it exited
                   with a non-zero code (kmon_derror is set in that case)

        Raises:
          OSError if the script cannot be launched or supervised.
        """
        # stdout is never consumed, so it is discarded instead of piped: an
        # undrained pipe would block the script once the pipe buffer is full.
        # NOTE(review): stderr is still piped and only read after exit; a script
        # writing a lot to stderr while running could block the same way.
        with subprocess.Popen(cmd, stdout=subprocess.DEVNULL,
                              stderr=subprocess.PIPE) as proc:
            self.dtrace_pid = proc.pid
            try:
                while True:
                    returncode = proc.poll()
                    # monitor (dtrace) is running
                    if returncode is None:
                        # kill the dtrace as kmonadm requested
                        if self.command_terminating:
                            proc.kill()
                            proc.wait()
                            return False
                        time.sleep(self._POLL_SECONDS)
                        continue
                    # error happened to dtrace (or it's killed)
                    if returncode != 0:
                        _, err = proc.communicate()
                        # errors="replace": undecodable bytes must not kill the thread
                        LOGGING.log_failed_cmd(cmd[0], err.decode(errors="replace"))
                        self.kmon_derror = True
                        return False
                    # returncode == 0, exit after received kernel event
                    return True
            finally:
                self.dtrace_pid = None

    def _wait_interval(self):
        """Sleep kmon_config_interval seconds, returning early on a stop request."""
        end = time.monotonic() + self.kmon_config_interval
        while time.monotonic() < end and not self.command_terminating:
            time.sleep(self._POLL_SECONDS)

    def activate_thread_func(self):
        """
        Background worker for an activated monitor.

        Behavior:
          - Repeatedly launches the configured dtrace script.
          - While dtrace is running, periodically checks:
              * whether a stop command is requesting termination
              * whether the process has exited
          - Exit conditions:
              * stop requested -> kill dtrace (if running) and end the thread;
                no new dtrace is launched once a stop has been requested
              * dtrace cannot be launched or exits with non-zero -> record error
                state and stop the monitor
              * dtrace exits with 0 -> treat as "event captured", then restart after interval
          - kmon_active and dtrace_pid are always reset when the thread ends.
        """
        self.kmon_active = True
        cmd = [f"{self.kmon_config_dpath}"]
        try:
            while not self.command_terminating:
                try:
                    event_captured = self._run_dtrace(cmd)
                except OSError:
                    LOGGING.log_failed_cmd(cmd[0])
                    self.kmon_derror = True
                    break
                if not event_captured:
                    # stop requested or dtrace failed
                    break
                self._wait_interval()
        finally:
            self.dtrace_pid = None
            self.kmon_active = False

    # called for
    # 1. respond to "start" command from kmonadm
    # 2. service start
    # return (result_code, message)
    def activate(self):
        """
        Activate this monitor.

        Side effects (only when the monitor is actually started):
          - Creates/overwrites the per-monitor event file under EVENTS_BASE_PATH
            with restrictive permissions.
          - Spawns a background thread that runs and supervises the dtrace process.

        Returns:
          (code:int, message:str); code is 0 when the worker thread was started,
          1 when the monitor is already active, is being deactivated, or its
          event file cannot be created.
        """

        # already active
        if self.kmon_active:
            return (1, f"Kernel monitor [{self.kmon_config_name}] is currently active")

        # in progress of termination
        # this shouldn't happen if we don't run multiple kmonadm at same time
        if self.command_terminating:
            return (1,
                    (f"Kernel monitor [{self.kmon_config_name}] is currently deactivating, "
                     "please try again"
                    )
                   )

        # a previous worker may still be on its way out (it clears kmon_active
        # as its last step); wait for it so it cannot clobber the new state
        previous = self.activate_thread
        if previous is not None and previous.is_alive():
            previous.join()

        # create the file for notification
        filename = f"{EVENTS_BASE_PATH}/{self.kmon_config_name.upper()}"
        try:
            fd = os.open(filename, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        except OSError as err:
            return (1,
                    (f"Kernel monitor [{self.kmon_config_name}] cannot create event file "
                     f"{filename}: {err}"
                    )
                   )
        os.close(fd)

        # clear previous error status if any
        self.kmon_derror = False

        # mark active before the thread runs so that an immediately following
        # start/stop/list request sees the right state
        self.kmon_active = True

        # start a new thread to run dtrace
        self.activate_thread = threading.Thread(target=self.activate_thread_func, args=())
        self.activate_thread.start()
        return (0,
                (f"Kernel monitor [{self.kmon_config_name}] attempted, use 'kmonadm list' "
                 "command to check status"
                )
               )

    # responds to "stop" command from kmonadm
    # return (result_code, message)
    def deactivate(self):
        """
        Deactivate this monitor.

        Behavior:
          - If not active, returns an error code/message.
          - Otherwise signals the background thread to terminate, waits for it to exit,
            and updates state accordingly.

        Returns:
          (code:int, message:str)
        """

        # not active
        if not self.kmon_active:
            return (1, f"Kernel monitor [{self.kmon_config_name}] is not active")

        self.command_terminating = True
        try:
            if self.activate_thread is not None:
                self.activate_thread.join()
        finally:
            # never leave the monitor stuck in "deactivating"
            self.command_terminating = False
        self.kmon_active = False
        return (0, f"Kernel monitor [{self.kmon_config_name}] is deactivated")

    # called on service start
    def service_start(self):
        """start this monitor when service start (only if enabled in configuration)"""
        # not enabled, we are done
        if not self.kmon_config_enabled:
            return

        code, msg = self.activate()
        if code != 0:
            # nobody receives the return value on service start, so log it
            LOGGING.log_error(msg)

# for configuration
class Section:
    """One named section of the configuration file and its key/value items."""

    def __init__(self, name: str):
        # items maps option name -> parsed value; it starts out empty
        self.name, self.items = name, {}

# for configuration
class ReadConfig:
    """
    Parser for the kmon configuration file.

    The file is read once at construction time. Every "[KMON]" section
    (section names are compared upper-cased) is collected in kmon_sections;
    key/value lines of other sections are parsed but not kept.
    """

    SECTION_PATTERN = re.compile(r"^\s*\[([^\]]+)\]\s*$")
    KEYVAL_PATTERN = re.compile(r"^\s*([^:#=]+)\s*[:=]\s*(.*?)\s*$")

    def __init__(self, filename: str):
        self.filename = filename
        self.kmon_sections = []
        self.kcomponent_sections = []
        self._load()

    @staticmethod
    def _convert(option, setting):
        """Turn the raw text of an option into its typed value."""
        # INTERVAL is an integer; int() raises ValueError on malformed text
        if option == "INTERVAL":
            return int(setting)
        # ENABLE is true only for the (case-insensitive) text "true"
        if option == "ENABLE":
            return setting.upper() == "TRUE"
        return setting

    # load kernel monitors
    def _load(self):
        """Read self.filename line by line and fill kmon_sections."""
        active = None

        with open(self.filename, "r", encoding="utf-8") as handle:
            for raw in handle:
                text = raw.strip()

                # blank lines and comment lines carry no information
                if text == "" or text.startswith("#"):
                    continue

                # "[name]" opens a new section
                header = self.SECTION_PATTERN.match(text)
                if header:
                    title = header.group(1).upper()
                    active = Section(title)
                    if title == "KMON":
                        self.kmon_sections.append(active)
                    continue

                # "key = value" / "key: value"; ignored outside of any section
                pair = self.KEYVAL_PATTERN.match(text)
                if not pair or not active:
                    continue
                option = pair.group(1).strip().upper()
                setting = pair.group(2).strip()
                active.items[option] = self._convert(option, setting)

# kernel monitors
k_monitors = []
# setup kernel monitors from configuration
def setup_kernel_monitors(config):
    """
    Create one KMON instance per configured KMON section and append it to k_monitors.

    Parameters:
      config -- object whose 'kmon_sections' is an iterable of sections; each
                section exposes its parsed options through an 'items' mapping.

    NAME and MONITOR are required (a missing one raises KeyError).
    ENABLE and INTERVAL are optional; when absent the monitor keeps the
    defaults assigned by KMON itself.
    """
    for section in config.kmon_sections:
        options = section.items
        kmon = KMON(options["NAME"])
        if "ENABLE" in options:
            kmon.set_enabled(options["ENABLE"])
        if "INTERVAL" in options:
            kmon.set_interval(options["INTERVAL"])
        kmon.set_dpath(options["MONITOR"])
        k_monitors.append(kmon)

# write messages to /var/log/messages
class LOGGING:
    """
    Logging helper for the kmon server.

    All messages are emitted at INFO level through a logger that is created
    on first use and sends the bare message text to syslog via /dev/log.
    """
    syslogger = None

    @classmethod
    def log(cls, msg):
        """Send msg to syslog, setting up the syslog logger on first use."""
        if cls.syslogger is None:
            # one-time setup: root logger at INFO with a message-only syslog handler
            root = logging.getLogger()
            root.setLevel(logging.INFO)
            handler = logging.handlers.SysLogHandler(address='/dev/log')
            handler.setFormatter(logging.Formatter('%(message)s'))
            root.addHandler(handler)
            cls.syslogger = root
        cls.syslogger.info(msg)

    @classmethod
    def log_failed_cmd(cls, cmd:str, msg = None):
        """
        Report that running a command failed.

        Parameters:
          cmd -- the command line, str
          msg -- the error message if provided, str or None; appended to the
                 log line only when it is non-empty
        """
        detail = f": {msg}" if msg else ""
        cls.log(f"KMON: running {cmd} failed{detail}")

    @classmethod
    def log_error(cls, msg:str):
        """Report an error message."""
        cls.log(f"KMON: error: {msg}")

    @classmethod
    def log_event(cls, event:str):
        """Report a monitor event."""
        cls.log(f"KMON: {event}")

# start all enabled monitors (start the dtrace scripts)
def start_service():
    """
    Run the service-start hook of every registered monitor.

    Each monitor decides on its own whether it starts: service_start()
    returns right away for monitors that are not enabled in configuration.
    """
    for monitor in k_monitors:
        monitor.service_start()

# Primary configuration file path
CONFIG_PATH = "/etc/oled/kmon/kmon.conf"

def initialize():
    """
    Prepare the server before any monitor is started.

    -- parse CONFIG_PATH and register the monitors it defines
    -- make sure the event directory EVENTS_BASE_PATH exists
    """
    setup_kernel_monitors(ReadConfig(CONFIG_PATH))

    # missing parent directories are created as well; an already existing
    # directory is not an error
    events_dir = Path(EVENTS_BASE_PATH)
    events_dir.mkdir(parents=True, exist_ok=True)

# process the commands from kmonadm
class ProcessCommand:
    """
    Handles administrative requests received over the Unix domain socket.

    Expected request (lines):
      COMMAND=<list|start|stop|stop_service>
      PARAM=<monitor_name>          # optional, required for start/stop

    Response:
      "<code>!!!<message>"
    """

    VALID_COMMNDS = ["list", "start", "stop", "stop_service"]

    @staticmethod
    def _value_of(line):
        """
        Return the text after the first '=' of a request line, stripped of
        surrounding whitespace, or None when there is no '=' or no value.
        """
        _, separator, value = line.partition("=")
        value = value.strip()
        if separator and value:
            return value
        return None

    @staticmethod
    def _find_monitor(name):
        """Return the monitor registered in k_monitors under name, or None."""
        for kmon in k_monitors:
            if kmon.kmon_config_name == name:
                return kmon
        return None

    @classmethod
    # parse and process the commands from kmonadm
    # send back results to kmonadm
    # parameters:
    #   @adm_socket -- the socket to send back messages via
    #   @lines      -- the command string list sent from kmonadm
    # response message format:
    # <code>!!!<message>
    #   code                -- result of the processing
    #   message             -- additional message, str
    def process_command(cls, adm_socket, lines):
        """
        Parse the given request lines, run the command and send the response.

        Malformed requests (missing '=', missing or empty monitor name) are
        answered with code 1 instead of raising.
        'stop_service' sends its response, closes the socket and terminates
        the process with sys.exit(0).
        """
        command = cls._value_of(lines[0]) if lines else None
        if command is None:
            command = ""

        # check if the command is valid
        if command not in cls.VALID_COMMNDS:
            code, msg = 1, f"Invalid command: {command}"
        elif command == "list":
            code, msg = 0, cls.process_cmd_list()
        elif command in ("start", "stop"):
            name = cls._value_of(lines[1]) if len(lines) >= 2 else None
            if name is None:
                code, msg = 1, f"No kmon name is specified to {command}"
            elif command == "start":
                code, msg = cls.process_cmd_start(name)
            else:
                code, msg = cls.process_cmd_stop(name)
        # 'stop_service' comes from systemctl only to stop kmon service
        elif command == "stop_service":
            code, msg = cls.process_cmd_stop_service()
            try:
                adm_socket.sendall(f"{code}!!!{msg}".encode())
                adm_socket.close()
            except OSError:
                # the requester is gone; the service still has to stop
                pass
            sys.exit(0)
        else:
            code, msg = 1, f"command {command} not supported"

        # sendall: a plain send() may transmit only part of a long response
        adm_socket.sendall(f"{code}!!!{msg}".encode())

    @classmethod
    def process_cmd_list(cls):
        """
        process 'list' command

        Returns a table with one line per monitor: name, interval, status
        (active / error / inactive) and, for active monitors, the dtrace PID.
        """
        rows = [
            "MONITOR_NAME  INTERVAL  STATUS     PID\n",
            "============  ========  =========  ========\n",
        ]
        for kmon in k_monitors:
            if kmon.kmon_active:
                status, pid = "active", f"  {kmon.dtrace_pid}"
            elif kmon.kmon_derror:
                status, pid = "error", ""
            else:
                status, pid = "inactive", ""
            rows.append(
                f"{kmon.kmon_config_name:<12}"
                f"  {kmon.kmon_config_interval:<8d}"
                f"  {status:<9}"
                f"{pid}\n"
            )
        return "".join(rows)

    @classmethod
    def process_cmd_start(cls, name):
        """process 'start' command; returns (code, message)"""
        kmon = cls._find_monitor(name)
        if kmon is None:
            return (1, f"kmon {name} not found")
        return kmon.activate()

    @classmethod
    def process_cmd_stop(cls, name):
        """process 'stop' command; returns (code, message)"""
        kmon = cls._find_monitor(name)
        if kmon is None:
            return (1, f"kmon {name} not found")
        return kmon.deactivate()

    @classmethod
    def process_cmd_stop_service(cls):
        """process 'stop_service' command: deactivate every monitor"""
        for kmon in k_monitors:
            # monitors that are not active just report an error code; ignored
            kmon.deactivate()
        # don't need to send message back as 'stop_service' is used by systemctl only
        return (0, "")

class ProcessEvent:
    """
    Receiver side of event notifications sent over the Unix domain socket
    (from dtrace/kmon_notify).

    The server routes a message here when its first line starts with
    "EVENT=". For now an event is only written to the log.
    """

    @classmethod
    def process_event(cls, event:str):
        """Record the event string received from dtrace through LOGGING."""
        LOGGING.log_event(event)

# create the unix domain socket, waiting for
# 1. commands from kmonadm
# 2. events from dtrace
# and process them.
# This function never ends until service is stopped.
def wait_process_commands():
    """
    Main server loop.

    Creates the Unix domain socket at SOCKET_PATH and then repeatedly accepts connections.
    Each connection is expected to send a small single request:
      - COMMAND=... (admin)
      - EVENT=...   (event)
    The request is processed synchronously and the connection is closed.

    Note:
      - The accept loop uses a timeout to remain responsive (e.g., to shutdown logic).
      - A client that connects but sends nothing is dropped after a few seconds
        instead of blocking the server.
      - A failure while handling one request is logged and does not end the
        loop; SystemExit (raised for 'stop_service') still propagates.
    """
    # seconds a connected client gets to deliver its request / take the response
    conn_timeout = 5

    # remove a stale socket file left by a previous run
    try:
        os.remove(SOCKET_PATH)
    except FileNotFoundError:
        pass

    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as server_socket:
        server_socket.bind(SOCKET_PATH)
        server_socket.listen() # use the default backlog
        server_socket.settimeout(1)
        while True:
            try:
                conn_socket, _ = server_socket.accept()
            except socket.timeout:
                continue

            # 'with' closes the connection on every path, including errors
            with conn_socket:
                try:
                    # accepted sockets are blocking by default; bound the wait
                    conn_socket.settimeout(conn_timeout)
                    data = conn_socket.recv(1024)
                    if not data:
                        continue
                    # undecodable bytes must not raise; they end up in an
                    # "invalid" response or in the log instead
                    lines = data.decode(errors="replace").split("\n")
                    t = lines[0].split("=")[0]
                    if t == "COMMAND":
                        # command from kmonadm
                        ProcessCommand.process_command(conn_socket, lines)
                    elif t == "EVENT":
                        # event from dtrace/kmon_notify
                        ProcessEvent.process_event(lines[0])
                    else:
                        conn_socket.sendall(f"Invalid message type {t}".encode())
                except Exception as err:  # pylint: disable=broad-except
                    # top-level boundary of the daemon: one bad request or a
                    # vanished client must not take the whole service down
                    LOGGING.log_error(f"request handling failed: {err}")

def cleanup():
    """
    remove the unix socket file

    A socket file that does not exist (any more) is not an error.
    """
    # try-and-handle instead of exists()+remove(): the file may disappear
    # between the check and the removal
    try:
        os.remove(SOCKET_PATH)
    except FileNotFoundError:
        pass


# Runtime lock file used to prevent multiple instances of kmon_server from running
LOCK_FILE = "/var/run/oled/kmon/.kmon_lock"

def main():
    """
    Entry point of kmon_server.

    Reads the configuration, takes the single-instance lock, starts the
    enabled monitors and serves requests until the service is stopped.
    Exits with status 1 when another instance already holds the lock.
    """
    initialize()

    # Lock LOCK_FILE to prevent multiple instances from running.
    # The lock is kept until kmon_server terminates.
    # The Linux kernel automatically releases the lock if kmon_server
    # terminates, whether normally or abnormally.
    fp = open(LOCK_FILE, 'w', encoding="utf-8")
    try:
        fcntl.lockf(fp, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError:
        fp.close()
        print("kmon_server already running.")
        sys.exit(1)

    # The server loop is left through an exception (SystemExit when
    # 'stop_service' is processed), so the socket file has to be removed in
    # a 'finally' clause. This is done only after the lock is held: an
    # instance that failed to get the lock must not remove the socket of
    # the running one.
    try:
        start_service()
        wait_process_commands()
    finally:
        cleanup()

if __name__ == '__main__':
    main()
