Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions pufferlib/config/ocean/grixel.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Options not set here fall back to the values in default.ini

[base]
package = ocean
env_name = puffer_grixel
#policy_name = Policy
policy_name = Grixel
rnn_name = Recurrent
# Not included in this release:
#rnn_name = RecurrentPlastic
#rnn_name = RecurrentTransformer
test = Test
test2 = Test2

[policy]
hidden_size = 512
#3031
#512

[rnn]
input_size = 512
#3031
#512
hidden_size = 512
#n_layers = 1

[vec]
#num_envs = 8
num_envs = 1

[env]
#max_size = 47
#max_size = 25
#max_size = 23

#max_size = 13
#num_envs = 1024
#num_maps = 8192

max_size = 13
#num_envs = 4096
num_envs = 1024
num_maps = 8192

# texture_mode governs texture assignment
# to goal/reward vs. zombies
# 0: fixed textures with fixed assignments
# 1: two fixed textures, which one is reward/
# zombie changes randomly for each episode
# 2: completely random textures each time
# Note: the neutral object always has random
# texture

texture_mode = 2

[train]
# Best params
#total_timesteps = 435_000_000
#adam_beta1 = 0.9801350114303844
#adam_beta2 = 0.9931056135397744
#adam_eps = 6.024885743259763e-8
#clip_coef = 0.283658795325587
#ent_coef = 0.007885530106105381
#gae_lambda = 0.9574676436577135
#gamma = 0.9961782334639131
#learning_rate = 0.0007890771333884192
#max_grad_norm = 2.5271346931510053
#minibatch_size = 8192
#prio_alpha = 0.8735470630752789
#prio_beta0 = 0.6533958384978629
#vf_clip_coef = 1.9338563232919095
#vf_coef = 3.915248046963283
#vtrace_c_clip = 1.018588814067991
#vtrace_rho_clip = 2.4215244529216466

# # New sweep best params
#total_timesteps = 435_000_000
total_timesteps = 1_000_000_000
# adam_beta1 = 0.9493079570168755
# adam_beta2 = 0.9998213228757207
# adam_eps = 2.16720639574209e-8
#

# Should be equal to env->horizon in grixel.h - set_state()
bptt_horizon = 128
#bptt_horizon = 256

# clip_coef = 0.399530686596841
ent_coef = 0.0017271288609381147
#ent_coef = 0.01
#ent_coef = 0.003
# gae_lambda = 0.9491722822649111
# gamma = 0.9877360824574745
# max_grad_norm = 3.016348031602564
# minibatch_size = 8192
# prio_alpha = 0.8219794821639037
# prio_beta0 = 0.9447478232810274
# vf_clip_coef = 0.6051579400844748
# vf_coef = 2.323141961227481
# vtrace_c_clip = 1.2499497264614237
# vtrace_rho_clip = 4.7398234531013985
#
# #learning_rate = 0.0012892859713461897
# # anneal_fr is defined as True in default.ini
learning_rate = 0.001
anneal_lr = False

[sweep]
downsample = 0

[sweep.train.total_timesteps]
distribution = log_normal
min = 3e8
max = 6e8
mean = 3e8
scale = time
1 change: 1 addition & 0 deletions pufferlib/ocean/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def make_multiagent(buf=None, **kwargs):
'trash_pickup': 'TrashPickupEnv',
'tower_climb': 'TowerClimb',
'grid': 'Grid',
'grixel': 'Grixel',
'shared_pool': 'PyCPR',
'impulse_wars': 'ImpulseWars',
'drive': 'Drive',
Expand Down
36 changes: 36 additions & 0 deletions pufferlib/ocean/grixel/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
## Grixel environment

This is a pixel-based version of the "grid" environment, that is, a gridworld with pixel-based inputs (as in Crafter / Craftax). We use the pixel-based inputs, and the large space of visual stimuli they allow, to implement a very simple meta-learning experiment, based on visual memory.

Each world is a maze (with added gaps at random positions to make movement easier, since maze-solving is not the primary purpose of the environment). In addition to the agent, there are two types of moving objects (or "mobs"), namely "rewards" and "zombies". When hitting a mob, the agent receives a reward (positive or negative) and is randomly teleported. Currently all mobs move randomly.

There is also a "neutral" type of object, which can be picked up and dropped by the agent (picking up is done simply by moving onto it; dropping is a dedicated action). Currently this has no effect at all.

The visual input to the agent is a local portion of the pixel map, of size 11 x 11 x block_size x block_size. 11x11 is inherited from the "grid" environment as the visual input diameter over the gridworld, and block_size (default 5) is the number of pixels in the height/width of each block in the grid.

All objects are represented by binary textures of size block_size x block_size. The exact visual appearance of all objects is governed by the "texture_mode" parameter in the "env" section of the configuration:

- texture_mode=0: the reward and the zombie each have a fixed, unchanging appearance across episodes
- texture_mode=1: the reward and the zombie randomly swap their appearance for each episode (there are still only two possible appearances in total)
- texture_mode=2: the reward and the zombie have completely random appearance, that is, each of them is assigned a random binary texture for each episode.

In modes 1 and 2, the agent must learn anew which of the two mobs is the reward or the zombie, from experience. This is the meta-learning aspect of the experiment.

Crucially, the agent can also perceive previous-step reward as part of its input; this is required for meta-learning.

The encoder is a CNN where the input layer has both kernel size and stride equal to block_size: the first convolution thus separately maps each block of the gridworld into a single vector.

The experiment works with the standard LSTM from PufferLib's Recurrent model. We also implemented a transformer and a plastic LSTM, with the plastic LSTM performing best by far in this simple visual memory task. These are not included here as they require modifying the rest of the PufferLib code (though you can see these *highly experimental* implementations [there](https://github.com/ThomasMiconi/PufferLib_dev/blob/grixel/pufferlib/models.py)).

Notably, all episodes have the same length, equal to the backpropagation-through-time horizon of the PPO training loop. This avoids difficulties with changing environments and ensures each episode starts with a reset hidden state during training.

This code is provided as is. Everything in this code is experimental and none of it has been thoroughly tested.

To run the training:

`puffer train puffer_grixel --rnn-name Recurrent --env.texture-mode 2`

To start a visual eval:

`puffer eval puffer_grixel --rnn-name Recurrent --load-model-path [checkpoint_file] --env.texture-mode 2`

Empty file.
106 changes: 106 additions & 0 deletions pufferlib/ocean/grixel/binding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#include "grixel.h"

#define Env Grixel
#define MY_SHARED
#include "../env_binding.h"

static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) {
    // Called once at the start of each experiment: generates a fixed set of
    // maze maps, from which one is chosen randomly for each episode.
    // Returns the levels array as an opaque pointer-sized Python int; it is
    // handed back to my_init() via the "state" kwarg.

    int num_maps = unpack(kwargs, "num_maps");
    int max_size = unpack(kwargs, "max_size");
    int size = unpack(kwargs, "size");

    // These are needed because we're using init_grid, which relies on
    // them being set
    int pixelize = unpack(kwargs, "pixelize");
    int block_size = unpack(kwargs, "block_size");

    // Validate before allocating, so the error path leaks nothing
    if (max_size <= 5) {
        PyErr_SetString(PyExc_ValueError, "max_size must be >5");
        return NULL;
    }

    State* levels = calloc(num_maps, sizeof(State));
    if (!levels) {
        return PyErr_NoMemory();
    }

    // Temporary env used to generate maps. Zero-initialized so any field
    // init_grid() reads that we don't set below is at least defined
    // (the original left them uninitialized).
    Grixel env = {0};
    env.max_size = max_size;
    env.pixelize = pixelize;
    env.block_size = block_size;
    env.additional_obs_size = unpack(kwargs, "additional_obs_size");
    env.nb_object_types = unpack(kwargs, "nb_object_types");

    init_grid(&env); // This allocates env, with 1 agent and 1 max-size grid

    srand(time(NULL));
    int start_seed = rand();
    for (int i = 0; i < num_maps; i++) {
        int sz = size;
        if (size == -1) {
            // size == -1 requests a random size in [min, max_size)
            int min = 9;
            sz = min + (rand() % (max_size - min));
        }

        // Maze generation needs odd dimensions
        if (sz % 2 == 0) {
            sz -= 1;
        }

        float difficulty = (float)rand() / (float)(RAND_MAX);
        create_maze_level(&env, sz, sz, difficulty, start_seed + i);
        init_state(&levels[i], max_size, 1); // allocates the grid, with num_agents=1
        get_state(&env, &levels[i]);         // this copies from env to levels
        // NOTE(review): if env and levels have different num_agents, strange
        // things might happen — confirm get_state's contract.
    }

    return PyLong_FromVoidPtr(levels);
}

static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
    // Per-environment initialization: unpack config kwargs, allocate the
    // grid, mark object-type properties, and attach the shared levels array
    // produced by my_shared(). Returns 0 on success, 1 on failure (with a
    // Python exception set).
    env->max_size = unpack(kwargs, "max_size");
    env->num_maps = unpack(kwargs, "num_maps");
    env->texture_mode = unpack(kwargs, "texture_mode");
    env->pixelize = unpack(kwargs, "pixelize");
    env->additional_obs_size = unpack(kwargs, "additional_obs_size");
    env->nb_object_types = unpack(kwargs, "nb_object_types");
    env->block_size = unpack(kwargs, "block_size");

    init_grid(env); // requires block_size to be pre-set

    env->is_mobile[REWARD] = 1;
    env->is_mobile[ZOMBIE] = 1;
    env->is_pickable[OBJECT] = 1;

    // "state" carries the pointer returned by my_shared() as a Python int.
    // PyDict_GetItemString returns NULL *without* setting an exception when
    // the key is missing; the original passed that NULL straight into
    // PyObject_TypeCheck, which would crash.
    PyObject* handle_obj = PyDict_GetItemString(kwargs, "state");
    if (!handle_obj) {
        PyErr_SetString(PyExc_KeyError, "missing 'state' handle");
        return 1;
    }
    if (!PyObject_TypeCheck(handle_obj, &PyLong_Type)) {
        PyErr_SetString(PyExc_TypeError, "state handle must be an integer");
        return 1;
    }

    State* levels = (State*)PyLong_AsVoidPtr(handle_obj);
    if (!levels) {
        PyErr_SetString(PyExc_ValueError, "Invalid state handle");
        return 1;
    }

    env->levels = levels;
    return 0;
}

// Export per-episode metrics from the C Log struct into a Python dict for
// the trainer's logger. Always returns 0 (success).
// Note: the order of assign_to_dict calls fixes the dict's insertion order
// on the Python side, so it is kept as-is.
static int my_log(PyObject* dict, Log* log) {
assign_to_dict(dict, "perf", log->perf);
assign_to_dict(dict, "score", log->score);
assign_to_dict(dict, "episode_return", log->episode_return);
assign_to_dict(dict, "episode_length", log->episode_length);
return 0;
}
95 changes: 95 additions & 0 deletions pufferlib/ocean/grixel/grixel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#include "grixel.h"

int main() {
    // Standalone interactive demo: builds one maze level, installs it as the
    // env's (single-entry) level set, and renders with raylib. Agents act
    // randomly unless the user holds LEFT SHIFT and steers the first agent
    // with the arrow keys / WASD.
    int max_size = 16;
    int num_agents = 1;
    int horizon = 128;
    float speed = 1;
    int vision = 5;
    bool discretize = true;
    int seed = 0;

    Grixel* env = allocate_grid(max_size, num_agents, horizon,
            vision, speed, discretize);

    // Generate a single 15x15 maze and point the env at it
    State* levels = calloc(1, sizeof(State));
    if (!levels) {
        fprintf(stderr, "out of memory\n");
        free_allocated_grid(env);
        return 1;
    }
    create_maze_level(env, 15, 15, 0.25, seed);
    init_state(levels, max_size, num_agents);
    get_state(env, levels);
    env->num_maps = 1;
    env->levels = levels;

    c_render(env);
    while (!WindowShouldClose()) {
        // User can take control of the first agent
        env->actions[0] = ATN_FORWARD;
        Agent* agent = &env->agents[0];

        // TODO: Why are up and down flipped?
        if (IsKeyDown(KEY_LEFT_SHIFT)) {
            if (IsKeyDown(KEY_UP) || IsKeyDown(KEY_W)) {
                agent->direction = 3.0*PI/2.0;
            } else if (IsKeyDown(KEY_DOWN) || IsKeyDown(KEY_S)) {
                agent->direction = PI/2.0;
            } else if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) {
                agent->direction = PI;
            } else if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) {
                agent->direction = 0;
            } else {
                env->actions[0] = ATN_PASS;
            }
        } else {
            // No user input: every agent takes a random action
            for (int i = 0; i < num_agents; i++) {
                env->actions[i] = rand() % 5;
            }
        }

        // The original gated this on `tick % 1 == 0`, which is always true:
        // step and render every frame.
        c_step(env);
        c_render(env);
    }

    // NOTE(review): init_state allocates internals of *levels that are not
    // released here — confirm whether a matching free_state() exists.
    free(levels);
    free_allocated_grid(env);
    return 0;
}

Loading