huggingface · Cadene · Mar 4, 2024 · Mar 2, 2024 · Mar 3, 2024 · Mar 3, 2024
diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py
@@ -69,7 +69,7 @@ def make_offline_buffer(cfg, sampler=None):
             sampler=sampler,
             batch_size=batch_size,
             pin_memory=pin_memory,
-            prefetch=prefetch,
+            prefetch=prefetch if isinstance(prefetch, int) else None,
         )
     elif cfg.env.name == "pusht":
         offline_buffer = PushtExperienceReplay(
@@ -79,7 +79,7 @@ def make_offline_buffer(cfg, sampler=None):
             sampler=sampler,
             batch_size=batch_size,
             pin_memory=pin_memory,
-            prefetch=prefetch,
+            prefetch=prefetch if isinstance(prefetch, int) else None,
         )
     else:
         raise ValueError(cfg.env.name)

diff --git a/lerobot/common/datasets/pusht.py b/lerobot/common/datasets/pusht.py
@@ -1,4 +1,5 @@
 import logging
+import math
 import os
 from pathlib import Path
 from typing import Callable
@@ -134,20 +135,32 @@ def __init__(
         else:
             storage = TensorStorage(TensorDict.load_memmap(self.root / dataset_id))
 
-        mean_std = self._compute_or_load_mean_std(storage)
-        mean_std["next", "observation", "image"] = mean_std["observation", "image"]
-        mean_std["next", "observation", "state"] = mean_std["observation", "state"]
+        stats = self._compute_or_load_stats(storage)
         transform = NormalizeTransform(
-            mean_std,
+            stats,
             in_keys=[
-                ("observation", "image"),
+                # TODO(rcadene): imagenet normalization is applied inside diffusion policy
+                # We need to automate this for tdmpc and others
+                # ("observation", "image"),
                 ("observation", "state"),
-                ("next", "observation", "image"),
-                ("next", "observation", "state"),
+                # TODO(rcadene): for tdmpc, we might want next image and state
+                # ("next", "observation", "image"),
+                # ("next", "observation", "state"),
                 ("action"),
             ],
+            mode="min_max",
         )
 
+        # TODO(rcadene): make normalization strategy configurable between mean_std, min_max, manual_min_max, min_max_from_spec
+        transform.stats["observation", "state", "min"] = torch.tensor(
+            [13.456424, 32.938293], dtype=torch.float32
+        )
+        transform.stats["observation", "state", "max"] = torch.tensor(
+            [496.14618, 510.9579], dtype=torch.float32
+        )
+        transform.stats["action", "min"] = torch.tensor([12.0, 25.0], dtype=torch.float32)
+        transform.stats["action", "max"] = torch.tensor([511.0, 511.0], dtype=torch.float32)
+
         if writer is None:
             writer = ImmutableDatasetWriter()
         if collate_fn is None:
@@ -282,61 +295,111 @@ def _download_and_preproc(self):
 
         return TensorStorage(td_data.lock_())
 
-    def _compute_mean_std(self, storage, num_batch=10, batch_size=32):
+    def _compute_stats(self, storage, num_batch=100, batch_size=32):
         rb = TensorDictReplayBuffer(
             storage=storage,
             batch_size=batch_size,
             prefetch=True,
         )
         batch = rb.sample()
-        image_mean = torch.zeros(batch["observation", "image"].shape[1])
-        image_std = torch.zeros(batch["observation", "image"].shape[1])
-        state_mean = torch.zeros(batch["observation", "state"].shape[1])
-        state_std = torch.zeros(batch["observation", "state"].shape[1])
-        action_mean = torch.zeros(batch["action"].shape[1])
-        action_std = torch.zeros(batch["action"].shape[1])
+
+        image_channels = batch["observation", "image"].shape[1]
+        image_mean = torch.zeros(image_channels)
+        image_std = torch.zeros(image_channels)
+        image_max = torch.tensor([-math.inf] * image_channels)
+        image_min = torch.tensor([math.inf] * image_channels)
+
+        state_channels = batch["observation", "state"].shape[1]
+        state_mean = torch.zeros(state_channels)
+        state_std = torch.zeros(state_channels)
+        state_max = torch.tensor([-math.inf] * state_channels)
+        state_min = torch.tensor([math.inf] * state_channels)
+
+        action_channels = batch["action"].shape[1]
+        action_mean = torch.zeros(action_channels)
+        action_std = torch.zeros(action_channels)
+        action_max = torch.tensor([-math.inf] * action_channels)
+        action_min = torch.tensor([math.inf] * action_channels)
 
         for _ in tqdm.tqdm(range(num_batch)):
-            image_mean += einops.reduce(batch["observation", "image"], "b c h w -> c", reduction="mean")
-            state_mean += batch["observation", "state"].mean(dim=0)
-            action_mean += batch["action"].mean(dim=0)
+            image_mean += einops.reduce(batch["observation", "image"], "b c h w -> c", "mean")
+            state_mean += einops.reduce(batch["observation", "state"], "b c -> c", "mean")
+            action_mean += einops.reduce(batch["action"], "b c -> c", "mean")
+
+            b_image_max = einops.reduce(batch["observation", "image"], "b c h w -> c", "max")
+            b_image_min = einops.reduce(batch["observation", "image"], "b c h w -> c", "min")
+            b_state_max = einops.reduce(batch["observation", "state"], "b c -> c", "max")
+            b_state_min = einops.reduce(batch["observation", "state"], "b c -> c", "min")
+            b_action_max = einops.reduce(batch["action"], "b c -> c", "max")
+            b_action_min = einops.reduce(batch["action"], "b c -> c", "min")
+            image_max = torch.maximum(image_max, b_image_max)
+            image_min = torch.minimum(image_min, b_image_min)  # running min: maximum() would keep the +inf seed
+            state_max = torch.maximum(state_max, b_state_max)
+            state_min = torch.minimum(state_min, b_state_min)
+            action_max = torch.maximum(action_max, b_action_max)
+            action_min = torch.minimum(action_min, b_action_min)
+
             batch = rb.sample()
 
         image_mean /= num_batch
         state_mean /= num_batch
         action_mean /= num_batch
 
         for i in tqdm.tqdm(range(num_batch)):
-            image_mean_batch = einops.reduce(batch["observation", "image"], "b c h w -> c", reduction="mean")
-            image_std += (image_mean_batch - image_mean) ** 2
-            state_std += (batch["observation", "state"].mean(dim=0) - state_mean) ** 2
-            action_std += (batch["action"].mean(dim=0) - action_mean) ** 2
+            b_image_mean = einops.reduce(batch["observation", "image"], "b c h w -> c", "mean")
+            b_state_mean = einops.reduce(batch["observation", "state"], "b c -> c", "mean")
+            b_action_mean = einops.reduce(batch["action"], "b c -> c", "mean")
+            image_std += (b_image_mean - image_mean) ** 2
+            state_std += (b_state_mean - state_mean) ** 2
+            action_std += (b_action_mean - action_mean) ** 2
+
+            b_image_max = einops.reduce(batch["observation", "image"], "b c h w -> c", "max")
+            b_image_min = einops.reduce(batch["observation", "image"], "b c h w -> c", "min")
+            b_state_max = einops.reduce(batch["observation", "state"], "b c -> c", "max")
+            b_state_min = einops.reduce(batch["observation", "state"], "b c -> c", "min")
+            b_action_max = einops.reduce(batch["action"], "b c -> c", "max")
+            b_action_min = einops.reduce(batch["action"], "b c -> c", "min")
+            image_max = torch.maximum(image_max, b_image_max)
+            image_min = torch.minimum(image_min, b_image_min)  # running min: maximum() would keep the +inf seed
+            state_max = torch.maximum(state_max, b_state_max)
+            state_min = torch.minimum(state_min, b_state_min)
+            action_max = torch.maximum(action_max, b_action_max)
+            action_min = torch.minimum(action_min, b_action_min)
+
             if i < num_batch - 1:
                 batch = rb.sample()
 
         image_std = torch.sqrt(image_std / num_batch)
         state_std = torch.sqrt(state_std / num_batch)
         action_std = torch.sqrt(action_std / num_batch)
 
-        mean_std = TensorDict(
+        stats = TensorDict(
             {
                 ("observation", "image", "mean"): image_mean[None, :, None, None],
                 ("observation", "image", "std"): image_std[None, :, None, None],
+                ("observation", "image", "max"): image_max[None, :, None, None],
+                ("observation", "image", "min"): image_min[None, :, None, None],
                 ("observation", "state", "mean"): state_mean[None, :],
                 ("observation", "state", "std"): state_std[None, :],
+                ("observation", "state", "max"): state_max[None, :],
+                ("observation", "state", "min"): state_min[None, :],
                 ("action", "mean"): action_mean[None, :],
                 ("action", "std"): action_std[None, :],
+                ("action", "max"): action_max[None, :],
+                ("action", "min"): action_min[None, :],
             },
             batch_size=[],
         )
-        return mean_std
-
-    def _compute_or_load_mean_std(self, storage) -> TensorDict:
-        mean_std_path = self.root / self.dataset_id / "mean_std.pth"
-        if mean_std_path.exists():
-            mean_std = torch.load(mean_std_path)
+        stats["next", "observation", "image"] = stats["observation", "image"]
+        stats["next", "observation", "state"] = stats["observation", "state"]
+        return stats
+
+    def _compute_or_load_stats(self, storage) -> TensorDict:
+        stats_path = self.root / self.dataset_id / "stats.pth"
+        if stats_path.exists():
+            stats = torch.load(stats_path)
         else:
-            logging.info(f"compute_mean_std and save to {mean_std_path}")
-            mean_std = self._compute_mean_std(storage)
-            torch.save(mean_std, mean_std_path)
-        return mean_std
+            logging.info(f"compute_stats and save to {stats_path}")
+            stats = self._compute_stats(storage)
+            torch.save(stats, stats_path)
+        return stats
diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py
@@ -1,14 +1,14 @@
 from torchrl.envs.transforms import StepCounter, TransformedEnv
 
-from lerobot.common.envs.transforms import Prod
-
 
 def make_env(cfg, transform=None):
     kwargs = {
         "frame_skip": cfg.env.action_repeat,
         "from_pixels": cfg.env.from_pixels,
         "pixels_only": cfg.env.pixels_only,
         "image_size": cfg.env.image_size,
+        # TODO(rcadene): do we want a specific eval_env_seed?
+        "seed": cfg.seed,
     }
 
     if cfg.env.name == "simxarm":
@@ -19,6 +19,8 @@ def make_env(cfg, transform=None):
     elif cfg.env.name == "pusht":
         from lerobot.common.envs.pusht import PushtEnv
 
+        # assert kwargs["seed"] > 200, "Seed 0-200 are used for the demonstration dataset, so we don't want to seed the eval env with this range."
+
         clsfunc = PushtEnv
     else:
         raise ValueError(cfg.env.name)
@@ -28,12 +30,8 @@ def make_env(cfg, transform=None):
     # limit rollout to max_steps
     env = TransformedEnv(env, StepCounter(max_steps=cfg.env.episode_length))
 
-    if cfg.env.name == "pusht":
-        # to ensure pusht is in [0,255] like simxarm
-        env.append_transform(Prod(in_keys=[("observation", "image")], prod=255.0))
-
     if transform is not None:
-        # useful to add mean and std normalization
+        # useful to add normalization
         env.append_transform(transform)
 
     return env

diff --git a/lerobot/common/envs/pusht.py b/lerobot/common/envs/pusht.py
@@ -1,4 +1,5 @@
 import importlib
+from collections import deque
 from typing import Optional
 
 import torch
@@ -27,12 +28,16 @@ def __init__(
         image_size=None,
         seed=1337,
         device="cpu",
+        num_prev_obs=1,
+        num_prev_action=0,
     ):
         super().__init__(device=device, batch_size=[])
         self.frame_skip = frame_skip
         self.from_pixels = from_pixels
         self.pixels_only = pixels_only
         self.image_size = image_size
+        self.num_prev_obs = num_prev_obs
+        self.num_prev_action = num_prev_action
 
         if pixels_only:
             assert from_pixels
@@ -56,6 +61,12 @@ def __init__(
         self._make_spec()
         self._current_seed = self.set_seed(seed)
 
+        if self.num_prev_obs > 0:
+            self._prev_obs_image_queue = deque(maxlen=self.num_prev_obs)
+            self._prev_obs_state_queue = deque(maxlen=self.num_prev_obs)
+        if self.num_prev_action > 0:
+            self._prev_action_queue = deque(maxlen=self.num_prev_action)
+
     def render(self, mode="rgb_array", width=384, height=384):
         if width != height:
             raise NotImplementedError()
@@ -67,15 +78,15 @@ def render(self, mode="rgb_array", width=384, height=384):
 
     def _format_raw_obs(self, raw_obs):
         if self.from_pixels:
-            obs = {"image": torch.from_numpy(raw_obs["image"])}
+            image = torch.from_numpy(raw_obs["image"])
+            obs = {"image": image}
 
             if not self.pixels_only:
                 obs["state"] = torch.from_numpy(raw_obs["agent_pos"]).type(torch.float32)
         else:
             # TODO:
             obs = {"state": torch.from_numpy(raw_obs["observation"]).type(torch.float32)}
 
-        obs = TensorDict(obs, batch_size=[])
         return obs
 
     def _reset(self, tensordict: Optional[TensorDict] = None):
@@ -87,9 +98,25 @@ def _reset(self, tensordict: Optional[TensorDict] = None):
             raw_obs = self._env.reset()
             assert self._current_seed == self._env._seed
 
+            obs = self._format_raw_obs(raw_obs)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue = deque(
+                        [obs["image"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
+                if "state" in obs:
+                    self._prev_obs_state_queue = deque(
+                        [obs["state"]] * (self.num_prev_obs + 1), maxlen=(self.num_prev_obs + 1)
+                    )
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
             td = TensorDict(
                 {
-                    "observation": self._format_raw_obs(raw_obs),
+                    "observation": TensorDict(obs, batch_size=[]),
                     "done": torch.tensor([False], dtype=torch.bool),
                 },
                 batch_size=[],
@@ -100,18 +127,37 @@ def _reset(self, tensordict: Optional[TensorDict] = None):
 
     def _step(self, tensordict: TensorDict):
         td = tensordict
-        # remove batch dim
-        action = td["action"].squeeze(0).numpy()
+        action = td["action"].numpy()
         # step expects shape=(4,) so we pad if necessary
         # TODO(rcadene): add info["is_success"] and info["success"] ?
         sum_reward = 0
-        for _ in range(self.frame_skip):
-            raw_obs, reward, done, info = self._env.step(action)
+
+        if action.ndim == 1:
+            action = action[None].repeat(self.frame_skip, 0)  # ndarray.repeat(n, axis): (c,) -> (frame_skip, c)
+        else:
+            if self.frame_skip > 1:
+                raise NotImplementedError()
+
+        num_action_steps = action.shape[0]
+        for i in range(num_action_steps):
+            raw_obs, reward, done, info = self._env.step(action[i])
             sum_reward += reward
 
+            obs = self._format_raw_obs(raw_obs)
+
+            if self.num_prev_obs > 0:
+                stacked_obs = {}
+                if "image" in obs:
+                    self._prev_obs_image_queue.append(obs["image"])
+                    stacked_obs["image"] = torch.stack(list(self._prev_obs_image_queue))
+                if "state" in obs:
+                    self._prev_obs_state_queue.append(obs["state"])
+                    stacked_obs["state"] = torch.stack(list(self._prev_obs_state_queue))
+                obs = stacked_obs
+
         td = TensorDict(
             {
-                "observation": self._format_raw_obs(raw_obs),
+                "observation": TensorDict(obs, batch_size=[]),
                 "reward": torch.tensor([sum_reward], dtype=torch.float32),
                 # succes and done are true when coverage > self.success_threshold in env
                 "done": torch.tensor([done], dtype=torch.bool),
@@ -124,14 +170,22 @@ def _step(self, tensordict: TensorDict):
     def _make_spec(self):
         obs = {}
         if self.from_pixels:
+            image_shape = (3, self.image_size, self.image_size)
+            if self.num_prev_obs > 0:
+                image_shape = (self.num_prev_obs, *image_shape)
+
             obs["image"] = BoundedTensorSpec(
                 low=0,
                 high=1,
-                shape=(3, self.image_size, self.image_size),
+                shape=image_shape,
                 dtype=torch.float32,
                 device=self.device,
             )
             if not self.pixels_only:
+                state_shape = self._env.observation_space["agent_pos"].shape
+                if self.num_prev_obs > 0:
+                    state_shape = (self.num_prev_obs, *state_shape)
+
                 obs["state"] = BoundedTensorSpec(
                     low=0,
                     high=512,
@@ -141,6 +195,10 @@ def _make_spec(self):
                 )
         else:
             # TODO(rcadene): add observation_space achieved_goal and desired_goal?
+            state_shape = self._env.observation_space["observation"].shape
+            if self.num_prev_obs > 0:
+                state_shape = (self.num_prev_obs, *state_shape)
+
             obs["state"] = UnboundedContinuousTensorSpec(
                 # TODO:
                 shape=self._env.observation_space["observation"].shape,