From 4a0bdea0eeed82c5847cee13273eca5c26c285b7 Mon Sep 17 00:00:00 2001
From: GitLab Mirror Bot
Date: Fri, 28 Feb 2025 00:44:51 +0000
Subject: [PATCH] Bug fixes and stability improvements

---
 README.md                                      |   2 +-
 .../autoregressive/modules/embedding.py       |   2 +-
 cosmos1/models/autoregressive/nemo/cosmos.py  |   2 +-
 .../autoregressive/nemo/cosmos_video2world.py | 141 ++++----------
 .../autoregressive/nemo/inference/README.md   |  17 +--
 .../nemo/post_training/README.md              |   4 +
 .../nemo/post_training/video2world_dataset.py |  29 ++--
 .../video2world_prepare_dataset.py            |   6 +-
 .../face_blur_filter/face_blur_filter.py      |   6 +-
 .../face_blur_filter/retinaface_utils.py      |   2 +-
 cosmos1/scripts/ip_header.py                  |  38 +++--
 requirements.txt                              |   4 +-
 12 files changed, 94 insertions(+), 159 deletions(-)

diff --git a/README.md b/README.md
index 8ea2df2..aed7974 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Details of the platform is described in the [Cosmos paper](https://research.nvid
 - [Video tokenizers](cosmos1/models/tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
 - Video curation pipeline for building your own video dataset. [Coming soon]
 - [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setup.
-- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
+- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](cosmos1/models/tokenizer/nemo/README.md)].
 
 ## Model Family
diff --git a/cosmos1/models/autoregressive/modules/embedding.py b/cosmos1/models/autoregressive/modules/embedding.py
index 6db842f..5d32c50 100644
--- a/cosmos1/models/autoregressive/modules/embedding.py
+++ b/cosmos1/models/autoregressive/modules/embedding.py
@@ -467,7 +467,7 @@ def __init__(
     def forward(
         self,
-        training_type: str = None,
+        training_type: str | None = None,
     ) -> torch.Tensor:
         T, H, W = self.latent_shape
         emb = torch.cat(
diff --git a/cosmos1/models/autoregressive/nemo/cosmos.py b/cosmos1/models/autoregressive/nemo/cosmos.py
index 81f2312..8fb828d 100644
--- a/cosmos1/models/autoregressive/nemo/cosmos.py
+++ b/cosmos1/models/autoregressive/nemo/cosmos.py
@@ -49,7 +49,7 @@ def __init__(
         self,
         seq_len: int,
         kv_channels: int,
-        training_type: str = None,
+        training_type: str | None = None,
         rotary_base: int = 10000,
         use_cpu_initialization: bool = False,
         latent_shape=[5, 40, 64],
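Both hunks above make the same typing fix: `training_type: str = None` contradicts its own annotation, whereas `str | None = None` (PEP 604) declares the optional default honestly. A minimal sketch of the pattern, using a hypothetical helper name rather than the repo's code:

```python
# Hypothetical helper (not from the repo) showing the `str | None` pattern:
# None is a legal argument, so the annotation must admit it.
def describe_training_type(training_type: str | None = None) -> str:
    if training_type is None:
        return "unconditioned video sequence"
    return f"conditioned sequence: {training_type}"

print(describe_training_type())                 # unconditioned video sequence
print(describe_training_type("text_to_video"))  # conditioned sequence: text_to_video
```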
diff --git a/cosmos1/models/autoregressive/nemo/cosmos_video2world.py b/cosmos1/models/autoregressive/nemo/cosmos_video2world.py
index b46d0fa..0079f39 100644
--- a/cosmos1/models/autoregressive/nemo/cosmos_video2world.py
+++ b/cosmos1/models/autoregressive/nemo/cosmos_video2world.py
@@ -19,7 +19,6 @@
 from typing import TYPE_CHECKING, Annotated, Callable, Dict, Optional, Union
 
 import torch
-import torch.nn.functional as F
 from megatron.core import tensor_parallel
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
@@ -42,20 +41,24 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
 from megatron.core.utils import make_viewless_tensor
-from torch import Tensor, nn
-
-from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig, CosmosConfig4B, CosmosModel, RotaryEmbedding3D
+from torch import nn
+
+from cosmos1.models.autoregressive.nemo.cosmos import (
+    CosmosConfig,
+    CosmosConfig4B,
+    CosmosConfig12B,
+    CosmosModel,
+    RotaryEmbedding3D,
+)
 from cosmos1.models.autoregressive.nemo.inference.inference_controller import CosmosInferenceWrapper
 from cosmos1.utils import log
 
 if TYPE_CHECKING:
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 
-from megatron.core import InferenceParams
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.transformer_block import TransformerBlock
 from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank
-from nemo.collections.llm.gpt.model.llama import Llama3Config
 from nemo.collections.llm.utils import Config
 from nemo.lightning import OptimizerModule, io
 from nemo.lightning.base import teardown
@@ -64,30 +67,16 @@
 class CosmosTransformerBlock(TransformerBlock):
     def forward(
         self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        rotary_pos_cos: Tensor = None,
-        rotary_pos_sin: Tensor = None,
-        attention_bias: Tensor = None,
-        inference_params: InferenceParams = None,
+        *args,
         packed_seq_params: PackedSeqParams = None,
         extra_positional_embeddings=None,
+        **kwargs,
     ):
         packed_seq_params = {"abs_pos_embed": extra_positional_embeddings}
         return super().forward(
-            hidden_states,
-            attention_mask,
-            context,
-            context_mask,
-            rotary_pos_emb,
-            rotary_pos_cos,
-            rotary_pos_sin,
-            attention_bias,
-            inference_params,
-            packed_seq_params,
+            *args,
+            packed_seq_params=packed_seq_params,
+            **kwargs,
         )
@@ -361,7 +350,7 @@ def cosmos_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
     if "cu_seqlens" in _batch:
         raise ValueError("Packed sequence cu_seqlens not supported")
 
-    required_device_keys.update(("context", "abs_pos_embed"))
+    required_device_keys.update(("context", "abs_pos_embed", "action"))
     if parallel_state.is_pipeline_first_stage():
         required_device_keys.update(("tokens", "position_ids"))
     if parallel_state.is_pipeline_last_stage():
@@ -398,30 +387,19 @@ def cosmos_forward_step(model, batch) -> torch.Tensor:
 
 
 @dataclass
-class CosmosConfigVideo2World5B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
+class CosmosVideo2WorldConfig:
     vocab_size: int = 64064
     output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
     seq_length: int = 12864
-    num_layers: int = 16
-    hidden_size: int = 4096
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 64
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
     latent_shape = [5, 40, 64]
     pad_to_multiple_of = 64
     forward_step_fn: Callable = cosmos_forward_step
     transformer_layer_spec = get_cosmos_video2world_spec()
     data_step_fn: Callable = cosmos_data_step
     attention_backend: AttnBackend = AttnBackend.flash
+    crossattn_emb_size: int = 1024
+    kv_channels: int = 128
+    training_type: str | None = "text_to_video"
 
     def configure_model(self, tokenizer) -> "MCoreGPTModel":
         self.transformer_layer_spec = get_cosmos_video2world_spec()
@@ -429,7 +407,7 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
         if self.rope_dim == "3D":
             model.rotary_pos_emb = RotaryEmbedding3D(
                 seq_len=self.seq_length,
-                training_type="text_to_video",
+                training_type=self.training_type,
                 pad_to_multiple_of=self.pad_to_multiple_of,
                 kv_channels=self.kv_channels,
                 max_position_embeddings=self.seq_length,
@@ -467,78 +445,13 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
 
 
 @dataclass
-class CosmosConfigVideo2World13B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
-    vocab_size: int = 64064
-    output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
-    seq_length: int = 12864
-    num_layers: int = 40
-    hidden_size: int = 5120
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 128
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
-    original_latent_shape = [3, 40, 64]
-    apply_yarn: bool = True
-    yarn_beta_fast: int = 4
-    yarn_beta_slow: int = 1
-    yarn_scale: int = 2
-    original_seq_len = 8192
-    latent_shape = [5, 40, 64]
-    pad_to_multiple_of = 64
-    forward_step_fn: Callable = cosmos_forward_step
-    transformer_layer_spec = get_cosmos_video2world_spec()
-    data_step_fn: Callable = cosmos_data_step
-    attention_backend: AttnBackend = AttnBackend.flash
+class CosmosConfigVideo2World5B(CosmosVideo2WorldConfig, CosmosConfig4B):
+    make_vocab_size_divisible_by: int = 64
 
-    def configure_model(self, tokenizer) -> "MCoreGPTModel":
-        self.transformer_layer_spec = get_cosmos_video2world_spec()
-        model = super().configure_model(tokenizer)
-        if self.rope_dim == "3D":
-            model.rotary_pos_emb = RotaryEmbedding3D(
-                seq_len=self.seq_length,
-                training_type="text_to_video",
-                pad_to_multiple_of=self.pad_to_multiple_of,
-                kv_channels=self.kv_channels,
-                max_position_embeddings=self.seq_length,
-                original_max_position_embeddings=self.original_seq_len if hasattr(self, "original_seq_len") else None,
-                rotary_base=self.rotary_base,
-                apply_yarn=True if hasattr(self, "apply_yarn") else False,
-                scale=self.yarn_scale if hasattr(self, "yarn_scale") else None,
-                extrapolation_factor=1,
-                attn_factor=1,
-                beta_fast=self.yarn_beta_fast if hasattr(self, "yarn_beta_fast") else 32,
-                beta_slow=self.yarn_beta_slow if hasattr(self, "yarn_beta_slow") else 1,
-                latent_shape=self.latent_shape,
-                original_latent_shape=self.original_latent_shape if hasattr(self, "original_latent_shape") else None,
-            )
-        model.output_layer = tensor_parallel.ColumnParallelLinear(
-            self.hidden_size,
-            self.output_layer_vocab_size,
-            config=self,
-            init_method=self.init_method,
-            bias=False,
-            skip_bias_add=False,
-            gather_output=False,
-            skip_weight_param_allocation=False,
-            embedding_activation_buffer=None,
-            grad_output_buffer=None,
-        )
-        model.decoder = CosmosTransformerBlock(
-            config=self,
-            spec=self.transformer_layer_spec,
-            pre_process=model.pre_process,
-            post_process=model.post_process,
-        )
-        return model
+
+@dataclass
+class CosmosConfigVideo2World13B(CosmosVideo2WorldConfig, CosmosConfig12B):
+    make_vocab_size_divisible_by: int = 128
 
 
 class CosmosVideo2WorldModel(CosmosModel):
     def __init__(
         self,
@@ -549,7 +462,9 @@ def __init__(
         tokenizer: Optional["TokenizerSpec"] = None,
         model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
     ):
-        super().__init__(config or CosmosConfig4B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform)
+        super().__init__(
+            config or CosmosConfigVideo2World5B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform
+        )
         self.config = config
 
     def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_threshold) -> torch.Tensor:
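The diff above removes two near-duplicate `Llama3Config` subclasses and rebuilds them as a shared `CosmosVideo2WorldConfig` mixin layered over the existing `CosmosConfig4B`/`CosmosConfig12B` bases, with `CosmosTransformerBlock.forward` reduced to `*args`/`**kwargs` forwarding. A toy sketch of the dataclass-mixin shape (simplified stand-in names and fields, not the real configs):

```python
# Toy sketch of the mixin refactor; class and field names are simplified
# stand-ins, not the actual Cosmos configs.
from dataclasses import dataclass

@dataclass
class Base4B:                      # plays the role of CosmosConfig4B
    num_layers: int = 16
    make_vocab_size_divisible_by: int = 64

@dataclass
class Video2WorldMixin:            # plays the role of CosmosVideo2WorldConfig
    vocab_size: int = 64064
    training_type: str | None = "text_to_video"

@dataclass
class Video2World5B(Video2WorldMixin, Base4B):  # mixin fields win in the MRO
    make_vocab_size_divisible_by: int = 64

cfg = Video2World5B()
print(cfg.num_layers, cfg.vocab_size, cfg.training_type)  # 16 64064 text_to_video
```

The per-size classes shrink to a single override each, and shared behavior such as `configure_model` lives in exactly one place.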
diff --git a/cosmos1/models/autoregressive/nemo/inference/README.md b/cosmos1/models/autoregressive/nemo/inference/README.md
index 513dcfd..a304770 100644
--- a/cosmos1/models/autoregressive/nemo/inference/README.md
+++ b/cosmos1/models/autoregressive/nemo/inference/README.md
@@ -106,9 +106,6 @@ Complete the following steps to run inference on the 4B model.
    cd /workspace/Cosmos
    git lfs pull $INPUT_DATA
 
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
      --input_image_or_video_path $INPUT_DATA \
      --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -138,14 +135,10 @@ Complete the following steps to run inference on the 5B model.
    cd /workspace/Cosmos
    git lfs pull $INPUT_DATA
 
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
      --input_type video \
      --input_image_or_video_path 'cosmos1/models/autoregressive/assets/v1p0/input.mp4' \
      --prompt "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
-     --disable_diffusion_decoder \
      --ar_model_dir nvidia/Cosmos-1.0-Autoregressive-5B-Video2World
    ```
@@ -160,6 +153,8 @@ Complete the following steps to generate a new output video using a post-trained
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN=""
    export HF_HOME=""
@@ -178,9 +173,6 @@ Complete the following steps to generate a new output video using a post-trained
    git lfs pull $INPUT_DATA
 
    # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
      --input_image_or_video_path $INPUT_DATA \
      --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -194,6 +186,8 @@ Complete the following steps to generate a new output video using a post-trained
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN=""
    export HF_HOME=""
@@ -213,9 +207,6 @@ Complete the following steps to generate a new output video using a post-trained
    git lfs pull $INPUT_DATA
 
    # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
      --input_image_or_video_path $INPUT_DATA \
      --video_save_name "Cosmos-1.0-Autoregressive-5B-Video2World.mp4" \
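The README hunks drop the `NVTE_FLASH_ATTN`/`NVTE_FUSED_ATTN`/`NVTE_UNFUSED_ATTN` exports, presumably because the configs already pin `attention_backend: AttnBackend = AttnBackend.flash` (visible as a context line in the cosmos_video2world.py diff), which would make the per-command environment overrides redundant. A hedged sketch of config-driven backend selection, with a stand-in enum rather than Megatron's real `AttnBackend`:

```python
# Hedged sketch (assumption, not the patch's code): the enum below is a
# stand-in for megatron.core.transformer.enums.AttnBackend.
from dataclasses import dataclass
from enum import Enum, auto

class AttnBackend(Enum):
    flash = auto()
    fused = auto()
    unfused = auto()

@dataclass
class ModelConfig:
    attention_backend: AttnBackend = AttnBackend.flash  # mirrors the config field

def select_backend(cfg: ModelConfig) -> str:
    # An explicit config field travels with the model, unlike
    # NVTE_* environment exports repeated before every launch command.
    return cfg.attention_backend.name

print(select_backend(ModelConfig()))  # flash
```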
diff --git a/cosmos1/models/autoregressive/nemo/post_training/README.md b/cosmos1/models/autoregressive/nemo/post_training/README.md
index 4098470..40ae4f3 100644
--- a/cosmos1/models/autoregressive/nemo/post_training/README.md
+++ b/cosmos1/models/autoregressive/nemo/post_training/README.md
@@ -101,6 +101,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN=""
    export HF_HOME=""
@@ -144,6 +146,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN=""
    export HF_HOME=""
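The added `pip install` lines front-load dependencies that would otherwise only surface as import errors mid-run. A hedged pre-flight check (not part of the repo) that mirrors the README's dependency list; note that `pyav` installs under the module name `av`:

```python
# Hypothetical pre-flight check (not in the patch): verify the extra
# post-training dependencies are importable before launching training.
import importlib.util

required = {"imageio": "imageio[ffmpeg]", "av": "pyav", "iopath": "iopath"}
missing = [pkg for mod, pkg in required.items() if importlib.util.find_spec(mod) is None]
if missing:
    raise SystemExit(f"missing packages: {', '.join(missing)}; run the pip install step above")
print("post-training dependencies present")
```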
diff --git a/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py b/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py
index 0c4c86f..c374099 100644
--- a/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py
+++ b/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py
@@ -16,16 +16,17 @@
 import json
 
 import torch
+from nemo.collections.llm.gpt.data.mock import MockDataModule
 from torch.utils.data import Dataset
 
 from cosmos1.models.autoregressive.modules.embedding import SinCosPosEmbAxisTE
+from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig
 
 TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
 DATA_RESOLUTION_SUPPORTED = [640, 1024]
 NUM_CONTEXT_FRAMES = 33
 BOV_TOKEN = 64000
 PAD_ID = 64002
-from nemo.collections.llm.gpt.data.mock import MockDataModule
 
 
 class CosmosVideo2WorldDataset(Dataset):
@@ -33,24 +34,12 @@ def __init__(self, data_path, model_config, split="train"):
         self.data_path = data_path
         self.model_config = model_config
         self.split = split
-        self.abs_pos_emb = self._initialize_abs_pos_emb()
+        self.abs_pos_emb = get_abs_pos_embed(model_config, training_type="text_to_video")
         metadata_file = f"{self.data_path}/metadata.json"
         with open(metadata_file, "r") as f:
             metadata = json.load(f)
         self.metadata = metadata
 
-    def _initialize_abs_pos_emb(self):
-        pos_emb = SinCosPosEmbAxisTE(
-            self.model_config.hidden_size,
-            latent_shape=self.model_config.latent_shape,
-            pad_to_multiple_of=self.model_config.pad_to_multiple_of,
-            device="cpu",
-        )
-        training_type = "text_to_video"
-        abs_pos_emb = pos_emb.forward(training_type=training_type)
-        abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
-        return abs_pos_emb
-
     def __len__(self):
         return self.metadata[f"{self.split}_samples"]
 
@@ -90,6 +79,18 @@ def collate_fn(self, batch):
         return self._collate_fn(batch)
 
 
+def get_abs_pos_embed(model_config: CosmosConfig, training_type: str | None = "text_to_video"):
+    pos_emb = SinCosPosEmbAxisTE(
+        model_config.hidden_size,
+        latent_shape=model_config.latent_shape,
+        pad_to_multiple_of=model_config.pad_to_multiple_of,
+        device="cpu",
+    )
+    abs_pos_emb = pos_emb.forward(training_type=training_type)
+    abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
+    return abs_pos_emb
+
+
 class CosmosVideo2WorldDataModule(MockDataModule):
     def __init__(self, *args, **kwargs):
         data_path = kwargs["data_path"]
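The dataset hunk hoists the positional-embedding construction out of the `Dataset` instance into a module-level `get_abs_pos_embed` helper so other callers can share it. For intuition only, here is a toy, hedged illustration of an axis-factorized sin/cos embedding over a `(T, H, W)` latent grid; this is shape logic, not the repo's `SinCosPosEmbAxisTE`:

```python
# Toy axis-factorized sin/cos positional embedding (illustrative only):
# one sin/cos table per axis, broadcast over the flattened T*H*W grid.
import torch

def toy_axis_pos_emb(dim: int, latent_shape=(5, 40, 64)) -> torch.Tensor:
    T, H, W = latent_shape
    d = dim // 3  # channels per axis (assumes d is even)

    def axis(n: int) -> torch.Tensor:
        pos = torch.arange(n, dtype=torch.float32)[:, None]
        freq = torch.exp(torch.arange(0, d, 2, dtype=torch.float32) * (-4.0 / d))
        ang = pos * freq
        return torch.cat([ang.sin(), ang.cos()], dim=-1)  # (n, d)

    t = axis(T).repeat_interleave(H * W, dim=0)           # varies slowest
    h = axis(H).repeat(T, 1).repeat_interleave(W, dim=0)
    w = axis(W).repeat(T * H, 1)                          # varies fastest
    return torch.cat([t, h, w], dim=-1)                   # (T*H*W, 3*d)

print(toy_axis_pos_emb(384).shape)  # torch.Size([12800, 384])
```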
open(f"{args.output_dir}/metadata.json", "w") as f: json.dump(metadata, f) + return if __name__ == "__main__": diff --git a/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py b/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py index a416394..45aad16 100644 --- a/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py +++ b/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py @@ -18,9 +18,9 @@ import numpy as np import torch -from pytorch_retinaface.data import cfg_re50 -from pytorch_retinaface.layers.functions.prior_box import PriorBox -from pytorch_retinaface.models.retinaface import RetinaFace +from retinaface.data import cfg_re50 +from retinaface.layers.functions.prior_box import PriorBox +from retinaface.models.retinaface import RetinaFace from torch.utils.data import DataLoader, TensorDataset from tqdm import tqdm diff --git a/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py b/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py index 27e69ce..9f5c01e 100644 --- a/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py +++ b/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py @@ -15,7 +15,7 @@ import numpy as np import torch -from pytorch_retinaface.utils.nms.py_cpu_nms import py_cpu_nms +from retinaface.utils.nms.py_cpu_nms import py_cpu_nms from cosmos1.utils import log diff --git a/cosmos1/scripts/ip_header.py b/cosmos1/scripts/ip_header.py index 1f702a1..6b60833 100644 --- a/cosmos1/scripts/ip_header.py +++ b/cosmos1/scripts/ip_header.py @@ -73,6 +73,31 @@ def get_header(ext: str = "py", old: str | bool = False) -> list[str]: return header +def get_header_ea(ext: str = "py", old: str | bool = False) -> list[str]: + # This is the raw header. + # The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025). + # The license reference will be the finalized version of the license linked above. + header = [ + "The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).", + "The license reference will be the finalized version of the license linked above.", + ] + # Reformat according to different file extensions. + if ext == ".py" and old: + if old == "single": + header = ["'''"] + header + ["'''"] + elif old == "double": + header = ['"""'] + header + ['"""'] + else: + raise NotImplementedError + elif ext in (".py", ".yaml"): + header = [("# " + line if line else "#") for line in header] + elif ext in (".c", ".cpp", ".cu", ".h", ".cuh"): + header = ["/*"] + [(" * " + line if line else " *") for line in header] + [" */"] + else: + raise NotImplementedError + return header + + def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None: if file.endswith("__init__.py"): return @@ -81,19 +106,14 @@ def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None: content = open(file).read().splitlines() # Check if cosmos header (with a blank newline) is properly embedded. header = get_header(ext=ext) + header_ea = get_header_ea(ext=ext) if fix: # If header passes format check, then just exit if _check_header(content, header): return + if _check_header(content, header_ea): + return print(f"fixing: {file}") - # Remove old header if exists. - if ext == ".py": - for header_old in [ - get_header(ext=ext, old="single"), - get_header(ext=ext, old="double"), - ]: - if content[: len(header_old)] == header_old: - content = content[len(header_old) :] # Clean up leading blank lines. 
diff --git a/cosmos1/scripts/ip_header.py b/cosmos1/scripts/ip_header.py
index 1f702a1..6b60833 100644
--- a/cosmos1/scripts/ip_header.py
+++ b/cosmos1/scripts/ip_header.py
@@ -73,6 +73,31 @@ def get_header(ext: str = "py", old: str | bool = False) -> list[str]:
     return header
 
 
+def get_header_ea(ext: str = "py", old: str | bool = False) -> list[str]:
+    # This is the raw header.
+    # The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).
+    # The license reference will be the finalized version of the license linked above.
+    header = [
+        "The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).",
+        "The license reference will be the finalized version of the license linked above.",
+    ]
+    # Reformat according to different file extensions.
+    if ext == ".py" and old:
+        if old == "single":
+            header = ["'''"] + header + ["'''"]
+        elif old == "double":
+            header = ['"""'] + header + ['"""']
+        else:
+            raise NotImplementedError
+    elif ext in (".py", ".yaml"):
+        header = [("# " + line if line else "#") for line in header]
+    elif ext in (".c", ".cpp", ".cu", ".h", ".cuh"):
+        header = ["/*"] + [(" * " + line if line else " *") for line in header] + [" */"]
+    else:
+        raise NotImplementedError
+    return header
+
+
 def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None:
     if file.endswith("__init__.py"):
         return
@@ -81,19 +106,14 @@ def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None:
     content = open(file).read().splitlines()
     # Check if cosmos header (with a blank newline) is properly embedded.
     header = get_header(ext=ext)
+    header_ea = get_header_ea(ext=ext)
     if fix:
         # If header passes format check, then just exit
         if _check_header(content, header):
             return
+        if _check_header(content, header_ea):
+            return
         print(f"fixing: {file}")
-        # Remove old header if exists.
-        if ext == ".py":
-            for header_old in [
-                get_header(ext=ext, old="single"),
-                get_header(ext=ext, old="double"),
-            ]:
-                if content[: len(header_old)] == header_old:
-                    content = content[len(header_old) :]
         # Clean up leading blank lines.
         while len(content) > 0 and not content[0]:
             content.pop(0)
@@ -104,7 +124,7 @@ def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None:
         for line in content:
             file_obj.write(line + "\n")
     else:
-        if not _check_header(content, header):
+        if not _check_header(content, header) and not _check_header(content, header_ea):
             bad_header = colorize("BAD HEADER", color="red", bold=True)
             print(f"{bad_header}: {file}")
             results[file] = 1
diff --git a/requirements.txt b/requirements.txt
index c4cdcf1..9f6e307 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,7 +27,7 @@ mediapy
 nltk
 peft
 pillow
-pytorch_retinaface @ git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+retinaface-py
 sentencepiece
 termcolor
-transformers==4.45.0
+transformers==4.48.0
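The ip_header.py change makes the checker accept either the standard header or the new early-access header, in both fix and report modes. A toy model of that dual-header check; it assumes `_check_header` is a prefix comparison, which matches the removed `content[: len(header_old)] == header_old` logic in the same file:

```python
# Toy model of the dual-header check (assumes _check_header is a prefix
# comparison; the standard header below is a placeholder, not the real one).
def _check_header(content: list[str], header: list[str]) -> bool:
    return content[: len(header)] == header

standard = ["# SPDX-License-Identifier: Apache-2.0"]  # placeholder standard header
early_access = [
    "# The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).",
    "# The license reference will be the finalized version of the license linked above.",
]

content = early_access + ["", "import torch"]
# A file passes if it carries either header, mirroring the patched apply_file().
print(_check_header(content, standard) or _check_header(content, early_access))  # True
```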