Source code for bppy.gym.bp_env

import gymnasium as gym
from bppy.gym.bp_action_space import BPActionSpace
from bppy.gym.simple_bp_observation_space import SimpleBPObservationSpace
from bppy import SolverBasedEventSelectionStrategy
import warnings
import random


[docs]
class BPEnv(gym.Env):
    """
    Custom environment that adheres to OpenAI's Gym interface for simulating
    the behavior of a BProgram. This class is useful for integrating BPrograms
    into reinforcement learning algorithms.

    Attributes
    ----------
    action_space : :class:`BPActionSpace <bppy.gym.bp_action_space.BPActionSpace>`
        Defines the space of possible actions, i.e., events.
    observation_space : :class:`BPObservationSpace <bppy.gym.bp_observation_space.BPObservationSpace>`
        Defines the space of possible observations.
    reward_function : function, optional
        A custom function to compute the reward.

    """
    def __init__(self, bprogram_generator, action_list, observation_space=None, action_space=None, reward_function=None):
        """
        Initializes the BPEnv environment.

        Parameters
        ----------
        bprogram_generator : function
            A function that generates a new instance of the BProgram.
        action_list : list
            List of all events defined as actions.
        observation_space : :class:`BPObservationSpace <bppy.gym.bp_observation_space.BPObservationSpace>`, optional
            Space of possible observations. Defaults to
            :class:`SimpleBPObservationSpace <bppy.gym.simple_bp_observation_space.SimpleBPObservationSpace>`.
        reward_function : function, optional
            A custom function to compute the reward. If not provided, the default reward is the sum of all b-thread
            rewards.
        """
        self.metadata = {}
        self.bprogram = None
        self.bprogram_generator = bprogram_generator
        self.action_space = BPActionSpace(action_list)
        self.event_list = action_list
        self.reward_function = reward_function
        if self.reward_function is None:
            self.reward_function = lambda rewards: sum(filter(None, rewards))
        self.observation_space = observation_space
        if self.observation_space is None:
            self.observation_space = SimpleBPObservationSpace(self.bprogram_generator, action_list)


[docs]
    def step(self, action):
        """
        Executes the given action and returns the resulting observation, reward, done flag, and additional information.

        Parameters
        ----------
        action : int
            An index representing the event to be executed.

        Returns
        -------
        observation : object
            The state of the environment after executing the action.
        reward : float
            The reward obtained by executing the action.
        done : bool
            Whether the episode has ended.
        truncated : bool
            Not used for this environment.
        info : dict
            Additional information for debugging.
        """
        if self.bprogram is None:
            raise RuntimeError("You must call reset() before calling step()")
        if not self.action_space.contains(action):
            return self._state(), 0, True, None, {"message": "Last event is disabled"}
        event = self.action_space.event_list[action]
        self.bprogram.advance_bthreads(self.bprogram.tickets, event)
        while not self._step_done():
            events_are_actions = [e in self.event_list for e in
                                  self.bprogram.event_selection_strategy.selectable_events(self.bprogram.tickets)]
            if any(events_are_actions):
                selectable_events = self.bprogram.event_selection_strategy.selectable_events(self.bprogram.tickets)
                if selectable_events:
                    self.bprogram.advance_bthreads(self.bprogram.tickets, random.choice(tuple(selectable_events)))
            else:
                self.bprogram.advance_bthreads(self.bprogram.tickets,
                                               self.bprogram.event_selection_strategy.select(self.bprogram.tickets))
        done = self.bprogram.event_selection_strategy.selectable_events(self.bprogram.tickets).__len__() == 0
        return self._state(), self._reward(), done, None, {}



[docs]
    def reset(self, seed=None, options=None):
        """
        Resets the environment to its initial state.

        Parameters
        ----------
        seed : int, optional
            A seed for the random number generator.
        options : dict, optional
            Additional options for resetting the environment.

        Returns
        -------
        observation : object
            The initial state of the environment.
        info : dict
            Not used for this environment.
        """
        super().reset(seed=seed, options=options)
        self.bprogram = self.bprogram_generator()
        # if isinstance(self.bprogram.event_selection_strategy, SolverBasedEventSelectionStrategy):
        #     raise NotImplementedError("SolverBasedEventSelectionStrategy is currently not supported")
        self.action_space.bprogram = self.bprogram
        self.bprogram.setup()
        while not self._step_done():
            events_are_actions = [e in self.event_list for e in
                                  self.bprogram.event_selection_strategy.selectable_events(self.bprogram.tickets)]
            if any(events_are_actions):
                selectable_events = self.bprogram.event_selection_strategy.selectable_events(self.bprogram.tickets)
                if selectable_events:
                    self.bprogram.advance_bthreads(self.bprogram.tickets, random.choice(tuple(selectable_events)))
            else:
                self.bprogram.advance_bthreads(self.bprogram.tickets,
                                               self.bprogram.event_selection_strategy.select(self.bprogram.tickets))
        return self.observation_space.bp_state_to_gym_space(self._bthreads_states()), {}



[docs]
    def render(self, mode="human"):
        """
        Not implemented for this environment.
        """
        raise NotImplementedError()



[docs]
    def close(self):
        """
        Closes the environment, releasing any resources.
        """
        self.bprogram = None



[docs]
    def get_state(self):
        """
        Returns the current state of the environment.
        """
        return self._state()


    def _bthreads_states(self):
        return [dict([k, v] for k, v in statement.items() if k != "bt") for statement in self.bprogram.tickets]

    def _state(self):
        return self.observation_space.bp_state_to_gym_space(self._bthreads_states())

    def _bthreads_rewards(self):
        return [x.get("localReward") for x in self.bprogram.tickets]

    def _reward(self):
        return self.reward_function(self._bthreads_rewards())

    def _step_done(self):
        events_are_actions = [e in self.event_list for e in self.bprogram.event_selection_strategy.selectable_events(self.bprogram.tickets)]
        if all(events_are_actions):
            return True
        else:
            if any(events_are_actions):
                warnings.warn("Some selectable events are defined in the actions list, while others are not. "
                              "The program will continue running non-action events until all selectable events "
                              "are defined in the actions list.")
            return False