Bug fixes and stability improvements
GitLab Mirror Bot committed Feb 28, 2025
1 parent b867572 · commit 4a0bdea
Showing 12 changed files with 94 additions and 159 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -18,7 +18,7 @@ Details of the platform is described in the [Cosmos paper](https://research.nvid
 - [Video tokenizers](cosmos1/models/tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
 - Video curation pipeline for building your own video dataset. [Coming soon]
 - [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setup.
-- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
+- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](cosmos1/models/tokenizer/nemo/README.md)].
 
 ## Model Family
 
cosmos1/models/autoregressive/modules/embedding.py: 2 changes (1 addition, 1 deletion)
@@ -467,7 +467,7 @@ def __init__(
 
     def forward(
         self,
-        training_type: str = None,
+        training_type: str | None = None,
     ) -> torch.Tensor:
         T, H, W = self.latent_shape
         emb = torch.cat(
cosmos1/models/autoregressive/nemo/cosmos.py: 2 changes (1 addition, 1 deletion)
@@ -49,7 +49,7 @@ def __init__(
         self,
         seq_len: int,
         kv_channels: int,
-        training_type: str = None,
+        training_type: str | None = None,
         rotary_base: int = 10000,
         use_cpu_initialization: bool = False,
         latent_shape=[5, 40, 64],
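The two one-line changes above are the same typing fix applied in two files: a parameter annotated `str` but defaulting to `None` is mis-annotated, and strict type checkers reject it. A minimal sketch of the before and after, independent of the Cosmos codebase:

```python
from typing import Optional

# Before: the annotation claims str, but the default is None;
# mypy --strict flags this as an incompatible default.
def forward_old(training_type: str = None):  # type: ignore[assignment]
    return training_type

# After: PEP 604 union syntax, equivalent to Optional[str] on Python >= 3.10.
def forward_new(training_type: str | None = None) -> Optional[str]:
    return training_type

assert forward_new() is None
assert forward_new("text_to_video") == "text_to_video"
```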
cosmos1/models/autoregressive/nemo/cosmos_video2world.py: 141 changes (28 additions, 113 deletions)
@@ -19,7 +19,6 @@
 from typing import TYPE_CHECKING, Annotated, Callable, Dict, Optional, Union
 
 import torch
-import torch.nn.functional as F
 from megatron.core import tensor_parallel
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
@@ -42,20 +41,24 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
 from megatron.core.utils import make_viewless_tensor
-from torch import Tensor, nn
-
-from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig, CosmosConfig4B, CosmosModel, RotaryEmbedding3D
+from torch import nn
+
+from cosmos1.models.autoregressive.nemo.cosmos import (
+    CosmosConfig,
+    CosmosConfig4B,
+    CosmosConfig12B,
+    CosmosModel,
+    RotaryEmbedding3D,
+)
 from cosmos1.models.autoregressive.nemo.inference.inference_controller import CosmosInferenceWrapper
 from cosmos1.utils import log
 
 if TYPE_CHECKING:
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 
-from megatron.core import InferenceParams
-from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.transformer_block import TransformerBlock
 from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank
-from nemo.collections.llm.gpt.model.llama import Llama3Config
 from nemo.collections.llm.utils import Config
 from nemo.lightning import OptimizerModule, io
 from nemo.lightning.base import teardown
@@ -64,30 +67,16 @@
 class CosmosTransformerBlock(TransformerBlock):
     def forward(
         self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        rotary_pos_cos: Tensor = None,
-        rotary_pos_sin: Tensor = None,
-        attention_bias: Tensor = None,
-        inference_params: InferenceParams = None,
-        packed_seq_params: PackedSeqParams = None,
+        *args,
         extra_positional_embeddings=None,
+        **kwargs,
     ):
         packed_seq_params = {"abs_pos_embed": extra_positional_embeddings}
         return super().forward(
-            hidden_states,
-            attention_mask,
-            context,
-            context_mask,
-            rotary_pos_emb,
-            rotary_pos_cos,
-            rotary_pos_sin,
-            attention_bias,
-            inference_params,
-            packed_seq_params,
+            *args,
+            packed_seq_params=packed_seq_params,
+            **kwargs,
         )
 
 
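The rewritten `forward` above is easier to read with the deletions stripped out. Reconstructed from the added lines of this hunk, the block becomes a thin wrapper that only reroutes `extra_positional_embeddings` into the `packed_seq_params` slot. The sketch below uses a stub base class so it runs without Megatron:

```python
class TransformerBlockStub:
    # Stand-in for megatron.core's TransformerBlock, for illustration only.
    def forward(self, *args, packed_seq_params=None, **kwargs):
        return {"args": args, "packed_seq_params": packed_seq_params, **kwargs}


class CosmosTransformerBlock(TransformerBlockStub):
    def forward(self, *args, extra_positional_embeddings=None, **kwargs):
        # Reroute the absolute positional embeddings through the channel
        # the base class already threads to every layer.
        packed_seq_params = {"abs_pos_embed": extra_positional_embeddings}
        return super().forward(*args, packed_seq_params=packed_seq_params, **kwargs)


out = CosmosTransformerBlock().forward("hidden", "mask", extra_positional_embeddings="emb")
assert out["packed_seq_params"] == {"abs_pos_embed": "emb"}
```

Because the signature is now `*args`/`**kwargs`, upstream Megatron can add or reorder keyword arguments without this subclass needing another edit.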
@@ -361,7 +350,7 @@ def cosmos_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
     if "cu_seqlens" in _batch:
         raise ValueError("Packed sequence cu_seqlens not supported")
 
-    required_device_keys.update(("context", "abs_pos_embed"))
+    required_device_keys.update(("context", "abs_pos_embed", "action"))
     if parallel_state.is_pipeline_first_stage():
         required_device_keys.update(("tokens", "position_ids"))
     if parallel_state.is_pipeline_last_stage():
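Adding "action" to `required_device_keys` is how action-conditioned batches reach the GPU; the rest of `cosmos_data_step` is collapsed in this view. Purely as an illustration of the pattern (names here are assumptions, not the hidden Cosmos code), a data step of this shape copies only the required keys to the device:

```python
import torch

def move_required_keys(batch: dict, required_device_keys: set) -> dict:
    # Copy only the keys the current pipeline stage needs onto the device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return {
        key: value.to(device, non_blocking=True) if torch.is_tensor(value) else value
        for key, value in batch.items()
        if key in required_device_keys
    }

batch = {"tokens": torch.zeros(2, 8), "action": torch.zeros(2, 7), "path": "a.mp4"}
moved = move_required_keys(batch, {"context", "abs_pos_embed", "action", "tokens"})
assert sorted(moved) == ["action", "tokens"]
```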
@@ -398,38 +387,27 @@ def cosmos_forward_step(model, batch) -> torch.Tensor:
 
 
 @dataclass
-class CosmosConfigVideo2World5B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
+class CosmosVideo2WorldConfig:
     vocab_size: int = 64064
     output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
     seq_length: int = 12864
-    num_layers: int = 16
-    hidden_size: int = 4096
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 64
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
     latent_shape = [5, 40, 64]
     pad_to_multiple_of = 64
     forward_step_fn: Callable = cosmos_forward_step
     transformer_layer_spec = get_cosmos_video2world_spec()
     data_step_fn: Callable = cosmos_data_step
     attention_backend: AttnBackend = AttnBackend.flash
+    crossattn_emb_size: int = 1024
+    kv_channels: int = 128
+    training_type: str | None = "text_to_video"
 
     def configure_model(self, tokenizer) -> "MCoreGPTModel":
         self.transformer_layer_spec = get_cosmos_video2world_spec()
         model = super().configure_model(tokenizer)
         if self.rope_dim == "3D":
             model.rotary_pos_emb = RotaryEmbedding3D(
                 seq_len=self.seq_length,
-                training_type="text_to_video",
+                training_type=self.training_type,
                 pad_to_multiple_of=self.pad_to_multiple_of,
                 kv_channels=self.kv_channels,
                 max_position_embeddings=self.seq_length,
@@ -467,78 +445,13 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
 
 
 @dataclass
-class CosmosConfigVideo2World13B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
-    vocab_size: int = 64064
-    output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
-    seq_length: int = 12864
-    num_layers: int = 40
-    hidden_size: int = 5120
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 128
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
-    original_latent_shape = [3, 40, 64]
-    apply_yarn: bool = True
-    yarn_beta_fast: int = 4
-    yarn_beta_slow: int = 1
-    yarn_scale: int = 2
-    original_seq_len = 8192
-    latent_shape = [5, 40, 64]
-    pad_to_multiple_of = 64
-    forward_step_fn: Callable = cosmos_forward_step
-    transformer_layer_spec = get_cosmos_video2world_spec()
-    data_step_fn: Callable = cosmos_data_step
-    attention_backend: AttnBackend = AttnBackend.flash
+class CosmosConfigVideo2World5B(CosmosVideo2WorldConfig, CosmosConfig4B):
+    make_vocab_size_divisible_by: int = 64
 
-    def configure_model(self, tokenizer) -> "MCoreGPTModel":
-        self.transformer_layer_spec = get_cosmos_video2world_spec()
-        model = super().configure_model(tokenizer)
-        if self.rope_dim == "3D":
-            model.rotary_pos_emb = RotaryEmbedding3D(
-                seq_len=self.seq_length,
-                training_type="text_to_video",
-                pad_to_multiple_of=self.pad_to_multiple_of,
-                kv_channels=self.kv_channels,
-                max_position_embeddings=self.seq_length,
-                original_max_position_embeddings=self.original_seq_len if hasattr(self, "original_seq_len") else None,
-                rotary_base=self.rotary_base,
-                apply_yarn=True if hasattr(self, "apply_yarn") else False,
-                scale=self.yarn_scale if hasattr(self, "yarn_scale") else None,
-                extrapolation_factor=1,
-                attn_factor=1,
-                beta_fast=self.yarn_beta_fast if hasattr(self, "yarn_beta_fast") else 32,
-                beta_slow=self.yarn_beta_slow if hasattr(self, "yarn_beta_slow") else 1,
-                latent_shape=self.latent_shape,
-                original_latent_shape=self.original_latent_shape if hasattr(self, "original_latent_shape") else None,
-            )
-        model.output_layer = tensor_parallel.ColumnParallelLinear(
-            self.hidden_size,
-            self.output_layer_vocab_size,
-            config=self,
-            init_method=self.init_method,
-            bias=False,
-            skip_bias_add=False,
-            gather_output=False,
-            skip_weight_param_allocation=False,
-            embedding_activation_buffer=None,
-            grad_output_buffer=None,
-        )
-
-        model.decoder = CosmosTransformerBlock(
-            config=self,
-            spec=self.transformer_layer_spec,
-            pre_process=model.pre_process,
-            post_process=model.post_process,
-        )
-        return model
+
+@dataclass
+class CosmosConfigVideo2World13B(CosmosVideo2WorldConfig, CosmosConfig12B):
+    make_vocab_size_divisible_by: int = 128
 
 
 class CosmosVideo2WorldModel(CosmosModel):
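The net effect of the two hunks above: everything the 5B and 13B variants share now lives in the `CosmosVideo2WorldConfig` mixin, while the backbone dimensions come from `CosmosConfig4B` and `CosmosConfig12B`, replacing two near-identical 30-line config bodies. Listing the mixin first means its defaults win under Python's MRO. A toy sketch of the pattern (values illustrative, not the real hyperparameters):

```python
from dataclasses import dataclass

@dataclass
class BaseConfig4B:  # stands in for CosmosConfig4B
    hidden_size: int = 4096
    seq_length: int = 8192

@dataclass
class Video2WorldMixin:  # stands in for CosmosVideo2WorldConfig
    seq_length: int = 12864  # shared override for all Video2World variants
    vocab_size: int = 64064

@dataclass
class Video2World5B(Video2WorldMixin, BaseConfig4B):
    make_vocab_size_divisible_by: int = 64

cfg = Video2World5B()
assert (cfg.hidden_size, cfg.seq_length) == (4096, 12864)  # mixin wins on overlap
```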
Expand All @@ -549,7 +462,9 @@ def __init__(
tokenizer: Optional["TokenizerSpec"] = None,
model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
):
super().__init__(config or CosmosConfig4B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform)
super().__init__(
config or CosmosConfigVideo2World5B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform
)
self.config = config

def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_threshold) -> torch.Tensor:
Expand Down
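The constructor hunk also fixes the fallback config: previously a `CosmosVideo2WorldModel()` built without an explicit config silently got the plain `CosmosConfig4B`, a non-Video2World configuration. A usage sketch (assumes the full NeMo/Megatron stack is installed):

```python
# Usage sketch only; these imports require the NeMo/Megatron environment.
from cosmos1.models.autoregressive.nemo.cosmos_video2world import (
    CosmosConfigVideo2World13B,
    CosmosVideo2WorldModel,
)

model_5b = CosmosVideo2WorldModel()  # now defaults to CosmosConfigVideo2World5B
model_13b = CosmosVideo2WorldModel(config=CosmosConfigVideo2World13B())
```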
cosmos1/models/autoregressive/nemo/inference/README.md: 17 changes (4 additions, 13 deletions)
@@ -106,9 +106,6 @@ Complete the following steps to run inference on the 4B model.
 cd /workspace/Cosmos
 git lfs pull $INPUT_DATA
 
-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
   --input_image_or_video_path $INPUT_DATA \
   --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -138,14 +135,10 @@ Complete the following steps to run inference on the 5B model.
 cd /workspace/Cosmos
 git lfs pull $INPUT_DATA
 
-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
   --input_type video \
   --input_image_or_video_path 'cosmos1/models/autoregressive/assets/v1p0/input.mp4' \
   --prompt "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
-  --disable_diffusion_decoder \
   --ar_model_dir nvidia/Cosmos-1.0-Autoregressive-5B-Video2World
 ```

@@ -160,6 +153,8 @@ Complete the following steps to generate a new output video using a post-trained
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
 
@@ -178,9 +173,6 @@ Complete the following steps to generate a new output video using a post-trained
 git lfs pull $INPUT_DATA
 
 # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
   --input_image_or_video_path $INPUT_DATA \
   --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -194,6 +186,8 @@ Complete the following steps to generate a new output video using a post-trained
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
 
@@ -213,9 +207,6 @@ Complete the following steps to generate a new output video using a post-trained
 git lfs pull $INPUT_DATA
 
 # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-NVTE_FLASH_ATTN=1 \
-NVTE_FUSED_ATTN=0 \
-NVTE_UNFUSED_ATTN=0 \
 python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
   --input_image_or_video_path $INPUT_DATA \
   --video_save_name "Cosmos-1.0-Autoregressive-5B-Video2World.mp4" \
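Two patterns repeat through this README's hunks: the `NVTE_FLASH_ATTN`/`NVTE_FUSED_ATTN`/`NVTE_UNFUSED_ATTN` exports are dropped from every command (the configs already pin `attention_backend: AttnBackend = AttnBackend.flash`, which presumably makes the TransformerEngine override redundant), and the setup blocks gain an explicit `pip install` of the inference-time dependencies. For reference, the deleted exports were equivalent to setting, before launch:

```python
# Reference only: the Python equivalent of the removed shell exports.
# The commands now run without forcing TransformerEngine's backend choice.
import os

os.environ["NVTE_FLASH_ATTN"] = "1"    # force the flash-attention kernel
os.environ["NVTE_FUSED_ATTN"] = "0"    # disable the fused-attention kernel
os.environ["NVTE_UNFUSED_ATTN"] = "0"  # disable the unfused fallback
```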
cosmos1/models/autoregressive/nemo/post_training/README.md: 4 changes (4 additions, 0 deletions)
@@ -101,6 +101,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
 
@@ -144,6 +146,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN="<your/HF/access/token>"
    export HF_HOME="<path/to/store/checkpoints>"
 
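Both setup blocks gain the same three-package `pip install`. A hypothetical pre-flight check (not part of the repo) that the new dependencies resolve before a long post-training run:

```python
import importlib

# pyav installs under the module name "av"; imageio[ffmpeg] under "imageio".
for module in ("imageio", "av", "iopath"):
    try:
        importlib.import_module(module)
        print(f"ok: {module}")
    except ImportError as exc:
        raise SystemExit(f"missing dependency: {module} ({exc})")
```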
@@ -16,41 +16,30 @@
 import json
 
 import torch
-from nemo.collections.llm.gpt.data.mock import MockDataModule
 from torch.utils.data import Dataset
 
 from cosmos1.models.autoregressive.modules.embedding import SinCosPosEmbAxisTE
+from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig
 
 TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
 DATA_RESOLUTION_SUPPORTED = [640, 1024]
 NUM_CONTEXT_FRAMES = 33
 BOV_TOKEN = 64000
 PAD_ID = 64002
+from nemo.collections.llm.gpt.data.mock import MockDataModule
 
 
 class CosmosVideo2WorldDataset(Dataset):
     def __init__(self, data_path, model_config, split="train"):
         self.data_path = data_path
         self.model_config = model_config
         self.split = split
-        self.abs_pos_emb = self._initialize_abs_pos_emb()
+        self.abs_pos_emb = get_abs_pos_embed(model_config, training_type="text_to_video")
         metadata_file = f"{self.data_path}/metadata.json"
         with open(metadata_file, "r") as f:
             metadata = json.load(f)
         self.metadata = metadata
 
-    def _initialize_abs_pos_emb(self):
-        pos_emb = SinCosPosEmbAxisTE(
-            self.model_config.hidden_size,
-            latent_shape=self.model_config.latent_shape,
-            pad_to_multiple_of=self.model_config.pad_to_multiple_of,
-            device="cpu",
-        )
-        training_type = "text_to_video"
-        abs_pos_emb = pos_emb.forward(training_type=training_type)
-        abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
-        return abs_pos_emb
-
     def __len__(self):
         return self.metadata[f"{self.split}_samples"]
@@ -90,6 +79,18 @@ def collate_fn(self, batch):
         return self._collate_fn(batch)
 
 
+def get_abs_pos_embed(model_config: CosmosConfig, training_type: str | None = "text_to_video"):
+    pos_emb = SinCosPosEmbAxisTE(
+        model_config.hidden_size,
+        latent_shape=model_config.latent_shape,
+        pad_to_multiple_of=model_config.pad_to_multiple_of,
+        device="cpu",
+    )
+    abs_pos_emb = pos_emb.forward(training_type=training_type)
+    abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
+    return abs_pos_emb
+
+
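Hoisting `_initialize_abs_pos_emb` out of the dataset into the module-level `get_abs_pos_embed` lets one embedding be computed per config and shared across splits. A usage sketch, run inside this module (the config class is assumed from `cosmos_video2world.py` above):

```python
# Usage sketch; assumes the companion config from cosmos_video2world.py.
from cosmos1.models.autoregressive.nemo.cosmos_video2world import CosmosConfigVideo2World5B

config = CosmosConfigVideo2World5B()
abs_pos_emb = get_abs_pos_embed(config, training_type="text_to_video")
# The tensor can now be attached to each batch under the "abs_pos_embed" key
# that cosmos_data_step moves to the device.
```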
 class CosmosVideo2WorldDataModule(MockDataModule):
     def __init__(self, *args, **kwargs):
         data_path = kwargs["data_path"]