Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions pufferlib/config/ocean/grixel.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Options not set here fall back to the values in default.ini

[base]
package = ocean
env_name = puffer_grixel
#policy_name = Policy
policy_name = Grixel
rnn_name = Recurrent
# Not included in this release:
#rnn_name = RecurrentPlastic
#rnn_name = RecurrentTransformer
test = Test
test2 = Test2

[policy]
hidden_size = 512
#3031
#512

[rnn]
input_size = 512
#3031
#512
hidden_size = 512
#n_layers = 1

[vec]
#num_envs = 8
num_envs = 1

[env]
#max_size = 47
#max_size = 25
#max_size = 23

#max_size = 13
#num_envs = 1024
#num_maps = 8192

max_size = 13
#num_envs = 4096
num_envs = 1024
num_maps = 8192

# texture_mode governs texture assignment
# to goal/reward vs. zombies
# 0: fixed textures with fixed assignments
# 1: two fixed textures, which one is reward/
# zombie changes randomly for each episode
# 2: completely random textures each time
# Note: the neutral object always has random
# texture

texture_mode = 2

[train]
# Best params
#total_timesteps = 435_000_000
#adam_beta1 = 0.9801350114303844
#adam_beta2 = 0.9931056135397744
#adam_eps = 6.024885743259763e-8
#clip_coef = 0.283658795325587
#ent_coef = 0.007885530106105381
#gae_lambda = 0.9574676436577135
#gamma = 0.9961782334639131
#learning_rate = 0.0007890771333884192
#max_grad_norm = 2.5271346931510053
#minibatch_size = 8192
#prio_alpha = 0.8735470630752789
#prio_beta0 = 0.6533958384978629
#vf_clip_coef = 1.9338563232919095
#vf_coef = 3.915248046963283
#vtrace_c_clip = 1.018588814067991
#vtrace_rho_clip = 2.4215244529216466

# # New sweep best params
#total_timesteps = 435_000_000
total_timesteps = 1_000_000_000
# adam_beta1 = 0.9493079570168755
# adam_beta2 = 0.9998213228757207
# adam_eps = 2.16720639574209e-8
#

# Should be equal to env->horizon in grixel.h - set_state()
bptt_horizon = 128
#bptt_horizon = 256

# clip_coef = 0.399530686596841
ent_coef = 0.0017271288609381147
#ent_coef = 0.01
#ent_coef = 0.003
# gae_lambda = 0.9491722822649111
# gamma = 0.9877360824574745
# max_grad_norm = 3.016348031602564
# minibatch_size = 8192
# prio_alpha = 0.8219794821639037
# prio_beta0 = 0.9447478232810274
# vf_clip_coef = 0.6051579400844748
# vf_coef = 2.323141961227481
# vtrace_c_clip = 1.2499497264614237
# vtrace_rho_clip = 4.7398234531013985
#
# #learning_rate = 0.0012892859713461897
# # anneal_fr is defined as True in default.ini
learning_rate = 0.001
anneal_lr = False

[sweep]
downsample = 0

[sweep.train.total_timesteps]
distribution = log_normal
min = 3e8
max = 6e8
mean = 3e8
scale = time
1 change: 1 addition & 0 deletions pufferlib/ocean/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def make_multiagent(buf=None, **kwargs):
'trash_pickup': 'TrashPickupEnv',
'tower_climb': 'TowerClimb',
'grid': 'Grid',
'grixel': 'Grixel',
'shared_pool': 'PyCPR',
'impulse_wars': 'ImpulseWars',
'drive': 'Drive',
Expand Down
36 changes: 36 additions & 0 deletions pufferlib/ocean/grixel/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
## Grixel environment

This is a pixel-based version of the "grid" environment, that is, a gridworld with pixel-based inputs (as in Crafter / Craftax). We use the pixel-based inputs, and the large space of visual stimuli they allow, to implement a very simple meta-learning experiment, based on visual memory.

Each world is a maze (with added gaps at random positions to make movement easier, since maze-solving is not the primary purpose of the environment). In addition to the agent, there are two types of moving objects (or "mobs"), namely "rewards" and "zombies". When hitting a mob, the agent receives a reward (positive or negative) and is randomly teleported. Currently all mobs move randomly.

There is also a "neutral" type of object, which can be picked up and dropped by the agent (picking up is done simply by moving onto it; dropping is a dedicated action). Currently this has no effect at all.

The visual input to the agent is a local portion of the pixel map, of size 11 x 11 x block_size x block_size. 11x11 is inherited from the "grid" environment as the visual input diameter over the gridworld, and block_size (default 5) is the number of pixels in the height/width of each block in the grid.

All objects are represented by binary textures of size block_size x block_size. The exact visual appearance of all objects is governed by the "texture_mode" parameter in the "env" section of the configuration:

- texture_mode=0: the reward and the zombie each have a fixed, unchanging appearance across episodes
- texture_mode=1: the reward and the zombie randomly swap their appearance for each episode (there are still only two possible appearances in total)
- texture_mode=2: the reward and the zombie have completely random appearance, that is, each of them is assigned a random binary texture for each episode.

In modes 1 and 2, the agent must learn anew which of the two mobs is the reward or the zombie, from experience. This is the meta-learning aspect of the experiment.

Crucially, the agent can also perceive previous-step reward as part of its input; this is required for meta-learning.

The encoder is a CNN where the input layer has both kernel size and stride equal to block_size: the first convolution thus separately maps each block of the gridworld into a single vector.

The experiment works with the standard LSTM from PufferLib's Recurrent model. We also implemented a transformer and a plastic LSTM, with the plastic LSTM performing best by far in this simple visual memory task. These are not included here as they require modifying the rest of the PufferLib code (though you can see these *highly experimental* implementations [there](https://github.com/ThomasMiconi/PufferLib_dev/blob/grixel/pufferlib/models.py)).

Notably, all episodes have the same length, equal to the backpropagation-through-time horizon of the PPO training loop. This avoids difficulties with changing environments and ensures each episode starts with a reset hidden state during training.

This code is provided as is. Everything in this code is experimental and none of it has been thoroughly tested.

To run the training:

`puffer train puffer_grixel --rnn-name Recurrent --env.texture-mode 2`

To start a visual eval:

`puffer eval puffer_grixel --rnn-name Recurrent --load-model-path [checkpoint_file] --env.texture-mode 2`

Empty file.
106 changes: 106 additions & 0 deletions pufferlib/ocean/grixel/binding.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#include "grixel.h"

#define Env Grixel
#define MY_SHARED
#include "../env_binding.h"

static PyObject* my_shared(PyObject* self, PyObject* args, PyObject* kwargs) {
    // Called once at the start of each experiment: generates a fixed set of
    // maze maps, from which one is chosen randomly for each episode.
    // Returns the levels array as an opaque pointer-sized Python int; it is
    // handed back to my_init() via the "state" kwarg.

    int num_maps = unpack(kwargs, "num_maps");
    int max_size = unpack(kwargs, "max_size");
    int size = unpack(kwargs, "size");

    // These are needed because we're using init_grid, which relies on
    // them being set
    int pixelize = unpack(kwargs, "pixelize");
    int block_size = unpack(kwargs, "block_size");

    // Validate before allocating, so the error path leaks nothing
    if (max_size <= 5) {
        PyErr_SetString(PyExc_ValueError, "max_size must be >5");
        return NULL;
    }

    State* levels = calloc(num_maps, sizeof(State));
    if (!levels) {
        return PyErr_NoMemory();
    }

    // Temporary env used to generate maps. Zero-initialized so any field
    // init_grid() reads that we don't set below is at least defined
    // (the original left them uninitialized).
    Grixel env = {0};
    env.max_size = max_size;
    env.pixelize = pixelize;
    env.block_size = block_size;
    env.additional_obs_size = unpack(kwargs, "additional_obs_size");
    env.nb_object_types = unpack(kwargs, "nb_object_types");

    init_grid(&env); // This allocates env, with 1 agent and 1 max-size grid

    srand(time(NULL));
    int start_seed = rand();
    for (int i = 0; i < num_maps; i++) {
        int sz = size;
        if (size == -1) {
            // size == -1 requests a random size in [min, max_size)
            int min = 9;
            sz = min + (rand() % (max_size - min));
        }

        // Maze generation needs odd dimensions
        if (sz % 2 == 0) {
            sz -= 1;
        }

        float difficulty = (float)rand() / (float)(RAND_MAX);
        create_maze_level(&env, sz, sz, difficulty, start_seed + i);
        init_state(&levels[i], max_size, 1); // allocates the grid, with num_agents=1
        get_state(&env, &levels[i]);         // this copies from env to levels
        // NOTE(review): if env and levels have different num_agents, strange
        // things might happen — confirm get_state's contract.
    }

    return PyLong_FromVoidPtr(levels);
}

static int my_init(Env* env, PyObject* args, PyObject* kwargs) {
    // Per-environment initialization: unpack config kwargs, allocate the
    // grid, mark object-type properties, and attach the shared levels array
    // produced by my_shared(). Returns 0 on success, 1 on failure (with a
    // Python exception set).
    env->max_size = unpack(kwargs, "max_size");
    env->num_maps = unpack(kwargs, "num_maps");
    env->texture_mode = unpack(kwargs, "texture_mode");
    env->pixelize = unpack(kwargs, "pixelize");
    env->additional_obs_size = unpack(kwargs, "additional_obs_size");
    env->nb_object_types = unpack(kwargs, "nb_object_types");
    env->block_size = unpack(kwargs, "block_size");

    init_grid(env); // requires block_size to be pre-set

    env->is_mobile[REWARD] = 1;
    env->is_mobile[ZOMBIE] = 1;
    env->is_pickable[OBJECT] = 1;

    // "state" carries the pointer returned by my_shared() as a Python int.
    // PyDict_GetItemString returns NULL *without* setting an exception when
    // the key is missing; the original passed that NULL straight into
    // PyObject_TypeCheck, which would crash.
    PyObject* handle_obj = PyDict_GetItemString(kwargs, "state");
    if (!handle_obj) {
        PyErr_SetString(PyExc_KeyError, "missing 'state' handle");
        return 1;
    }
    if (!PyObject_TypeCheck(handle_obj, &PyLong_Type)) {
        PyErr_SetString(PyExc_TypeError, "state handle must be an integer");
        return 1;
    }

    State* levels = (State*)PyLong_AsVoidPtr(handle_obj);
    if (!levels) {
        PyErr_SetString(PyExc_ValueError, "Invalid state handle");
        return 1;
    }

    env->levels = levels;
    return 0;
}

// Export per-episode metrics from the C Log struct into a Python dict for
// the trainer's logger. Always returns 0 (success).
// Note: the order of assign_to_dict calls fixes the dict's insertion order
// on the Python side, so it is kept as-is.
static int my_log(PyObject* dict, Log* log) {
assign_to_dict(dict, "perf", log->perf);
assign_to_dict(dict, "score", log->score);
assign_to_dict(dict, "episode_return", log->episode_return);
assign_to_dict(dict, "episode_length", log->episode_length);
return 0;
}
95 changes: 95 additions & 0 deletions pufferlib/ocean/grixel/grixel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#include "grixel.h"

int main() {
    // Standalone interactive demo: builds one maze level, installs it as the
    // env's (single-entry) level set, and renders with raylib. Agents act
    // randomly unless the user holds LEFT SHIFT and steers the first agent
    // with the arrow keys / WASD.
    int max_size = 16;
    int num_agents = 1;
    int horizon = 128;
    float speed = 1;
    int vision = 5;
    bool discretize = true;
    int seed = 0;

    Grixel* env = allocate_grid(max_size, num_agents, horizon,
            vision, speed, discretize);

    // Generate a single 15x15 maze and point the env at it
    State* levels = calloc(1, sizeof(State));
    if (!levels) {
        fprintf(stderr, "out of memory\n");
        free_allocated_grid(env);
        return 1;
    }
    create_maze_level(env, 15, 15, 0.25, seed);
    init_state(levels, max_size, num_agents);
    get_state(env, levels);
    env->num_maps = 1;
    env->levels = levels;

    c_render(env);
    while (!WindowShouldClose()) {
        // User can take control of the first agent
        env->actions[0] = ATN_FORWARD;
        Agent* agent = &env->agents[0];

        // TODO: Why are up and down flipped?
        if (IsKeyDown(KEY_LEFT_SHIFT)) {
            if (IsKeyDown(KEY_UP) || IsKeyDown(KEY_W)) {
                agent->direction = 3.0*PI/2.0;
            } else if (IsKeyDown(KEY_DOWN) || IsKeyDown(KEY_S)) {
                agent->direction = PI/2.0;
            } else if (IsKeyDown(KEY_LEFT) || IsKeyDown(KEY_A)) {
                agent->direction = PI;
            } else if (IsKeyDown(KEY_RIGHT) || IsKeyDown(KEY_D)) {
                agent->direction = 0;
            } else {
                env->actions[0] = ATN_PASS;
            }
        } else {
            // No user input: every agent takes a random action
            for (int i = 0; i < num_agents; i++) {
                env->actions[i] = rand() % 5;
            }
        }

        // The original gated this on `tick % 1 == 0`, which is always true:
        // step and render every frame.
        c_step(env);
        c_render(env);
    }

    // NOTE(review): init_state allocates internals of *levels that are not
    // released here — confirm whether a matching free_state() exists.
    free(levels);
    free_allocated_grid(env);
    return 0;
}

Loading