From 4a0bdea0eeed82c5847cee13273eca5c26c285b7 Mon Sep 17 00:00:00 2001
From: GitLab Mirror Bot
Date: Fri, 28 Feb 2025 00:44:51 +0000
Subject: [PATCH] Bug fixes and stability improvements

---
 README.md                                      |   2 +-
 .../autoregressive/modules/embedding.py       |   2 +-
 cosmos1/models/autoregressive/nemo/cosmos.py  |   2 +-
 .../autoregressive/nemo/cosmos_video2world.py | 141 ++++----------
 .../autoregressive/nemo/inference/README.md   |  17 +--
 .../nemo/post_training/README.md              |   4 +
 .../nemo/post_training/video2world_dataset.py |  29 ++--
 .../video2world_prepare_dataset.py            |   6 +-
 .../face_blur_filter/face_blur_filter.py      |   6 +-
 .../face_blur_filter/retinaface_utils.py      |   2 +-
 cosmos1/scripts/ip_header.py                  |  38 +++--
 requirements.txt                              |   4 +-
 12 files changed, 94 insertions(+), 159 deletions(-)

diff --git a/README.md b/README.md
index 8ea2df2..aed7974 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ Details of the platform is described in the [Cosmos paper](https://research.nvid
 - [Video tokenizers](cosmos1/models/tokenizer) for tokenizing videos into continuous tokens (latent vectors) and discrete tokens (integers) efficiently and effectively.
 - Video curation pipeline for building your own video dataset. [Coming soon]
 - [Post-training scripts](cosmos1/models/POST_TRAINING.md) via NeMo Framework to post-train the pre-trained world foundation models for various Physical AI setup.
-- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion/vae)].
+- Pre-training scripts via NeMo Framework for building your own world foundation model. [[Diffusion](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/diffusion)] [[Autoregressive](https://github.com/NVIDIA/NeMo/tree/main/nemo/collections/multimodal_autoregressive)] [[Tokenizer](cosmos1/models/tokenizer/nemo/README.md)].
 
 ## Model Family
diff --git a/cosmos1/models/autoregressive/modules/embedding.py b/cosmos1/models/autoregressive/modules/embedding.py
index 6db842f..5d32c50 100644
--- a/cosmos1/models/autoregressive/modules/embedding.py
+++ b/cosmos1/models/autoregressive/modules/embedding.py
@@ -467,7 +467,7 @@ def __init__(
     def forward(
         self,
-        training_type: str = None,
+        training_type: str | None = None,
     ) -> torch.Tensor:
         T, H, W = self.latent_shape
         emb = torch.cat(
diff --git a/cosmos1/models/autoregressive/nemo/cosmos.py b/cosmos1/models/autoregressive/nemo/cosmos.py
index 81f2312..8fb828d 100644
--- a/cosmos1/models/autoregressive/nemo/cosmos.py
+++ b/cosmos1/models/autoregressive/nemo/cosmos.py
@@ -49,7 +49,7 @@ def __init__(
         self,
         seq_len: int,
         kv_channels: int,
-        training_type: str = None,
+        training_type: str | None = None,
         rotary_base: int = 10000,
         use_cpu_initialization: bool = False,
         latent_shape=[5, 40, 64],
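Both hunks above make the same typing fix: `training_type: str = None` contradicts its own annotation, whereas `str | None = None` (PEP 604) declares the optional default honestly. A minimal sketch of the pattern, using a hypothetical helper name rather than the repo's code:

```python
# Hypothetical helper (not from the repo) showing the `str | None` pattern:
# None is a legal argument, so the annotation must admit it.
def describe_training_type(training_type: str | None = None) -> str:
    if training_type is None:
        return "unconditioned video sequence"
    return f"conditioned sequence: {training_type}"

print(describe_training_type())                 # unconditioned video sequence
print(describe_training_type("text_to_video"))  # conditioned sequence: text_to_video
```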
diff --git a/cosmos1/models/autoregressive/nemo/cosmos_video2world.py b/cosmos1/models/autoregressive/nemo/cosmos_video2world.py
index b46d0fa..0079f39 100644
--- a/cosmos1/models/autoregressive/nemo/cosmos_video2world.py
+++ b/cosmos1/models/autoregressive/nemo/cosmos_video2world.py
@@ -19,7 +19,6 @@
 from typing import TYPE_CHECKING, Annotated, Callable, Dict, Optional, Union
 
 import torch
-import torch.nn.functional as F
 from megatron.core import tensor_parallel
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
@@ -42,20 +41,24 @@
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
 from megatron.core.utils import make_viewless_tensor
-from torch import Tensor, nn
-
-from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig, CosmosConfig4B, CosmosModel, RotaryEmbedding3D
+from torch import nn
+
+from cosmos1.models.autoregressive.nemo.cosmos import (
+    CosmosConfig,
+    CosmosConfig4B,
+    CosmosConfig12B,
+    CosmosModel,
+    RotaryEmbedding3D,
+)
 from cosmos1.models.autoregressive.nemo.inference.inference_controller import CosmosInferenceWrapper
 from cosmos1.utils import log
 
 if TYPE_CHECKING:
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 
-from megatron.core import InferenceParams
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.transformer_block import TransformerBlock
 from nemo.collections.llm.gpt.model.base import get_batch_on_this_context_parallel_rank
-from nemo.collections.llm.gpt.model.llama import Llama3Config
 from nemo.collections.llm.utils import Config
 from nemo.lightning import OptimizerModule, io
 from nemo.lightning.base import teardown
@@ -64,30 +67,16 @@
 class CosmosTransformerBlock(TransformerBlock):
     def forward(
         self,
-        hidden_states: Tensor,
-        attention_mask: Tensor,
-        context: Tensor = None,
-        context_mask: Tensor = None,
-        rotary_pos_emb: Tensor = None,
-        rotary_pos_cos: Tensor = None,
-        rotary_pos_sin: Tensor = None,
-        attention_bias: Tensor = None,
-        inference_params: InferenceParams = None,
+        *args,
         packed_seq_params: PackedSeqParams = None,
         extra_positional_embeddings=None,
+        **kwargs,
     ):
         packed_seq_params = {"abs_pos_embed": extra_positional_embeddings}
         return super().forward(
-            hidden_states,
-            attention_mask,
-            context,
-            context_mask,
-            rotary_pos_emb,
-            rotary_pos_cos,
-            rotary_pos_sin,
-            attention_bias,
-            inference_params,
-            packed_seq_params,
+            *args,
+            packed_seq_params=packed_seq_params,
+            **kwargs,
         )
@@ -361,7 +350,7 @@ def cosmos_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
     if "cu_seqlens" in _batch:
         raise ValueError("Packed sequence cu_seqlens not supported")
 
-    required_device_keys.update(("context", "abs_pos_embed"))
+    required_device_keys.update(("context", "abs_pos_embed", "action"))
     if parallel_state.is_pipeline_first_stage():
         required_device_keys.update(("tokens", "position_ids"))
     if parallel_state.is_pipeline_last_stage():
@@ -398,30 +387,19 @@ def cosmos_forward_step(model, batch) -> torch.Tensor:
 
 
 @dataclass
-class CosmosConfigVideo2World5B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
+class CosmosVideo2WorldConfig:
     vocab_size: int = 64064
     output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
     seq_length: int = 12864
-    num_layers: int = 16
-    hidden_size: int = 4096
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 64
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
     latent_shape = [5, 40, 64]
     pad_to_multiple_of = 64
     forward_step_fn: Callable = cosmos_forward_step
     transformer_layer_spec = get_cosmos_video2world_spec()
     data_step_fn: Callable = cosmos_data_step
     attention_backend: AttnBackend = AttnBackend.flash
+    crossattn_emb_size: int = 1024
+    kv_channels: int = 128
+    training_type: str | None = "text_to_video"
 
     def configure_model(self, tokenizer) -> "MCoreGPTModel":
         self.transformer_layer_spec = get_cosmos_video2world_spec()
@@ -429,7 +407,7 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
         if self.rope_dim == "3D":
             model.rotary_pos_emb = RotaryEmbedding3D(
                 seq_len=self.seq_length,
-                training_type="text_to_video",
+                training_type=self.training_type,
                 pad_to_multiple_of=self.pad_to_multiple_of,
                 kv_channels=self.kv_channels,
                 max_position_embeddings=self.seq_length,
@@ -467,78 +445,13 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
 
 
 @dataclass
-class CosmosConfigVideo2World13B(Llama3Config):
-    qk_layernorm: bool = True
-    rope_dim: str = "3D"
-    vocab_size: int = 64064
-    output_layer_vocab_size: int = 64000
-    activation_func = F.silu
-    rotary_base: int = 500_000
-    seq_length: int = 12864
-    num_layers: int = 40
-    hidden_size: int = 5120
-    ffn_hidden_size: int = 14336
-    num_attention_heads: int = 32
-    num_query_groups: int = 8
-    layernorm_epsilon: float = 1e-5
-    use_cpu_initialization: bool = True
-    make_vocab_size_divisible_by: int = 128
-    kv_channels: int = 128
-    crossattn_emb_size: int = 1024
-    original_latent_shape = [3, 40, 64]
-    apply_yarn: bool = True
-    yarn_beta_fast: int = 4
-    yarn_beta_slow: int = 1
-    yarn_scale: int = 2
-    original_seq_len = 8192
-    latent_shape = [5, 40, 64]
-    pad_to_multiple_of = 64
-    forward_step_fn: Callable = cosmos_forward_step
-    transformer_layer_spec = get_cosmos_video2world_spec()
-    data_step_fn: Callable = cosmos_data_step
-    attention_backend: AttnBackend = AttnBackend.flash
+class CosmosConfigVideo2World5B(CosmosVideo2WorldConfig, CosmosConfig4B):
+    make_vocab_size_divisible_by: int = 64
 
-    def configure_model(self, tokenizer) -> "MCoreGPTModel":
-        self.transformer_layer_spec = get_cosmos_video2world_spec()
-        model = super().configure_model(tokenizer)
-        if self.rope_dim == "3D":
-            model.rotary_pos_emb = RotaryEmbedding3D(
-                seq_len=self.seq_length,
-                training_type="text_to_video",
-                pad_to_multiple_of=self.pad_to_multiple_of,
-                kv_channels=self.kv_channels,
-                max_position_embeddings=self.seq_length,
-                original_max_position_embeddings=self.original_seq_len if hasattr(self, "original_seq_len") else None,
-                rotary_base=self.rotary_base,
-                apply_yarn=True if hasattr(self, "apply_yarn") else False,
-                scale=self.yarn_scale if hasattr(self, "yarn_scale") else None,
-                extrapolation_factor=1,
-                attn_factor=1,
-                beta_fast=self.yarn_beta_fast if hasattr(self, "yarn_beta_fast") else 32,
-                beta_slow=self.yarn_beta_slow if hasattr(self, "yarn_beta_slow") else 1,
-                latent_shape=self.latent_shape,
-                original_latent_shape=self.original_latent_shape if hasattr(self, "original_latent_shape") else None,
-            )
-        model.output_layer = tensor_parallel.ColumnParallelLinear(
-            self.hidden_size,
-            self.output_layer_vocab_size,
-            config=self,
-            init_method=self.init_method,
-            bias=False,
-            skip_bias_add=False,
-            gather_output=False,
-            skip_weight_param_allocation=False,
-            embedding_activation_buffer=None,
-            grad_output_buffer=None,
-        )
-        model.decoder = CosmosTransformerBlock(
-            config=self,
-            spec=self.transformer_layer_spec,
-            pre_process=model.pre_process,
-            post_process=model.post_process,
-        )
-        return model
+
+@dataclass
+class CosmosConfigVideo2World13B(CosmosVideo2WorldConfig, CosmosConfig12B):
+    make_vocab_size_divisible_by: int = 128
 
 
 class CosmosVideo2WorldModel(CosmosModel):
     def __init__(
         self,
@@ -549,7 +462,9 @@ def __init__(
         tokenizer: Optional["TokenizerSpec"] = None,
         model_transform: Optional[Callable[[nn.Module], nn.Module]] = None,
     ):
-        super().__init__(config or CosmosConfig4B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform)
+        super().__init__(
+            config or CosmosConfigVideo2World5B(), optim=optim, tokenizer=tokenizer, model_transform=model_transform
+        )
         self.config = config
 
     def get_inference_wrapper(self, params_dtype, inference_batch_times_seqlen_threshold) -> torch.Tensor:
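The diff above removes two near-duplicate `Llama3Config` subclasses and rebuilds them as a shared `CosmosVideo2WorldConfig` mixin layered over the existing `CosmosConfig4B`/`CosmosConfig12B` bases, with `CosmosTransformerBlock.forward` reduced to `*args`/`**kwargs` forwarding. A toy sketch of the dataclass-mixin shape (simplified stand-in names and fields, not the real configs):

```python
# Toy sketch of the mixin refactor; class and field names are simplified
# stand-ins, not the actual Cosmos configs.
from dataclasses import dataclass

@dataclass
class Base4B:                      # plays the role of CosmosConfig4B
    num_layers: int = 16
    make_vocab_size_divisible_by: int = 64

@dataclass
class Video2WorldMixin:            # plays the role of CosmosVideo2WorldConfig
    vocab_size: int = 64064
    training_type: str | None = "text_to_video"

@dataclass
class Video2World5B(Video2WorldMixin, Base4B):  # mixin fields win in the MRO
    make_vocab_size_divisible_by: int = 64

cfg = Video2World5B()
print(cfg.num_layers, cfg.vocab_size, cfg.training_type)  # 16 64064 text_to_video
```

The per-size classes shrink to a single override each, and shared behavior such as `configure_model` lives in exactly one place.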
diff --git a/cosmos1/models/autoregressive/nemo/inference/README.md b/cosmos1/models/autoregressive/nemo/inference/README.md
index 513dcfd..a304770 100644
--- a/cosmos1/models/autoregressive/nemo/inference/README.md
+++ b/cosmos1/models/autoregressive/nemo/inference/README.md
@@ -106,9 +106,6 @@ Complete the following steps to run inference on the 4B model.
    cd /workspace/Cosmos
    git lfs pull $INPUT_DATA
 
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
      --input_image_or_video_path $INPUT_DATA \
      --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -138,14 +135,10 @@ Complete the following steps to run inference on the 5B model.
    cd /workspace/Cosmos
    git lfs pull $INPUT_DATA
 
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
      --input_type video \
      --input_image_or_video_path 'cosmos1/models/autoregressive/assets/v1p0/input.mp4' \
      --prompt "A video recorded from a moving vehicle's perspective, capturing roads, buildings, landscapes, and changing weather and lighting conditions." \
-     --disable_diffusion_decoder \
      --ar_model_dir nvidia/Cosmos-1.0-Autoregressive-5B-Video2World
    ```
@@ -160,6 +153,8 @@ Complete the following steps to generate a new output video using a post-trained
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN=""
    export HF_HOME=""
@@ -178,9 +173,6 @@ Complete the following steps to generate a new output video using a post-trained
    git lfs pull $INPUT_DATA
 
    # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    torchrun --nproc-per-node 1 cosmos1/models/autoregressive/nemo/inference/general.py \
      --input_image_or_video_path $INPUT_DATA \
      --video_save_name "Cosmos-1.0-Autoregressive-4B.mp4" \
@@ -194,6 +186,8 @@ Complete the following steps to generate a new output video using a post-trained
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath better_profanity peft git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+
    export HF_TOKEN=""
    export HF_HOME=""
@@ -213,9 +207,6 @@ Complete the following steps to generate a new output video using a post-trained
    git lfs pull $INPUT_DATA
 
    # change --ar_model_dir to a post-trained checkpoint under ./logs/default/checkpoints/
-   NVTE_FLASH_ATTN=1 \
-   NVTE_FUSED_ATTN=0 \
-   NVTE_UNFUSED_ATTN=0 \
    python3 cosmos1/models/autoregressive/nemo/inference/video2world.py \
      --input_image_or_video_path $INPUT_DATA \
      --video_save_name "Cosmos-1.0-Autoregressive-5B-Video2World.mp4" \
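The README hunks drop the `NVTE_FLASH_ATTN`/`NVTE_FUSED_ATTN`/`NVTE_UNFUSED_ATTN` exports, presumably because the configs already pin `attention_backend: AttnBackend = AttnBackend.flash` (visible as a context line in the cosmos_video2world.py diff), which would make the per-command environment overrides redundant. A hedged sketch of config-driven backend selection, with a stand-in enum rather than Megatron's real `AttnBackend`:

```python
# Hedged sketch (assumption, not the patch's code): the enum below is a
# stand-in for megatron.core.transformer.enums.AttnBackend.
from dataclasses import dataclass
from enum import Enum, auto

class AttnBackend(Enum):
    flash = auto()
    fused = auto()
    unfused = auto()

@dataclass
class ModelConfig:
    attention_backend: AttnBackend = AttnBackend.flash  # mirrors the config field

def select_backend(cfg: ModelConfig) -> str:
    # An explicit config field travels with the model, unlike
    # NVTE_* environment exports repeated before every launch command.
    return cfg.attention_backend.name

print(select_backend(ModelConfig()))  # flash
```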
diff --git a/cosmos1/models/autoregressive/nemo/post_training/README.md b/cosmos1/models/autoregressive/nemo/post_training/README.md
index 4098470..40ae4f3 100644
--- a/cosmos1/models/autoregressive/nemo/post_training/README.md
+++ b/cosmos1/models/autoregressive/nemo/post_training/README.md
@@ -101,6 +101,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN=""
    export HF_HOME=""
@@ -144,6 +146,8 @@ Before proceeding, ensure all videos are in **RGB format**. Complete the followi
 
 1. Set the following environment variables:
 
    ```bash
+   pip install --no-cache-dir imageio[ffmpeg] pyav iopath
+
    export HF_TOKEN=""
    export HF_HOME=""
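The added `pip install` lines front-load dependencies that would otherwise only surface as import errors mid-run. A hedged pre-flight check (not part of the repo) that mirrors the README's dependency list; note that `pyav` installs under the module name `av`:

```python
# Hypothetical pre-flight check (not in the patch): verify the extra
# post-training dependencies are importable before launching training.
import importlib.util

required = {"imageio": "imageio[ffmpeg]", "av": "pyav", "iopath": "iopath"}
missing = [pkg for mod, pkg in required.items() if importlib.util.find_spec(mod) is None]
if missing:
    raise SystemExit(f"missing packages: {', '.join(missing)}; run the pip install step above")
print("post-training dependencies present")
```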
diff --git a/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py b/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py
index 0c4c86f..c374099 100644
--- a/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py
+++ b/cosmos1/models/autoregressive/nemo/post_training/video2world_dataset.py
@@ -16,16 +16,17 @@
 import json
 
 import torch
+from nemo.collections.llm.gpt.data.mock import MockDataModule
 from torch.utils.data import Dataset
 
 from cosmos1.models.autoregressive.modules.embedding import SinCosPosEmbAxisTE
+from cosmos1.models.autoregressive.nemo.cosmos import CosmosConfig
 
 TOKENIZER_COMPRESSION_FACTOR = [8, 16, 16]
 DATA_RESOLUTION_SUPPORTED = [640, 1024]
 NUM_CONTEXT_FRAMES = 33
 BOV_TOKEN = 64000
 PAD_ID = 64002
-from nemo.collections.llm.gpt.data.mock import MockDataModule
 
 
 class CosmosVideo2WorldDataset(Dataset):
@@ -33,24 +34,12 @@ def __init__(self, data_path, model_config, split="train"):
         self.data_path = data_path
         self.model_config = model_config
         self.split = split
-        self.abs_pos_emb = self._initialize_abs_pos_emb()
+        self.abs_pos_emb = get_abs_pos_embed(model_config, training_type="text_to_video")
         metadata_file = f"{self.data_path}/metadata.json"
         with open(metadata_file, "r") as f:
             metadata = json.load(f)
         self.metadata = metadata
 
-    def _initialize_abs_pos_emb(self):
-        pos_emb = SinCosPosEmbAxisTE(
-            self.model_config.hidden_size,
-            latent_shape=self.model_config.latent_shape,
-            pad_to_multiple_of=self.model_config.pad_to_multiple_of,
-            device="cpu",
-        )
-        training_type = "text_to_video"
-        abs_pos_emb = pos_emb.forward(training_type=training_type)
-        abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
-        return abs_pos_emb
-
     def __len__(self):
         return self.metadata[f"{self.split}_samples"]
 
@@ -90,6 +79,18 @@ def collate_fn(self, batch):
         return self._collate_fn(batch)
 
 
+def get_abs_pos_embed(model_config: CosmosConfig, training_type: str | None = "text_to_video"):
+    pos_emb = SinCosPosEmbAxisTE(
+        model_config.hidden_size,
+        latent_shape=model_config.latent_shape,
+        pad_to_multiple_of=model_config.pad_to_multiple_of,
+        device="cpu",
+    )
+    abs_pos_emb = pos_emb.forward(training_type=training_type)
+    abs_pos_emb = abs_pos_emb.transpose(0, 1).contiguous()
+    return abs_pos_emb
+
+
 class CosmosVideo2WorldDataModule(MockDataModule):
     def __init__(self, *args, **kwargs):
         data_path = kwargs["data_path"]
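The dataset hunk hoists the positional-embedding construction out of the `Dataset` instance into a module-level `get_abs_pos_embed` helper so other callers can share it. For intuition only, here is a toy, hedged illustration of an axis-factorized sin/cos embedding over a `(T, H, W)` latent grid; this is shape logic, not the repo's `SinCosPosEmbAxisTE`:

```python
# Toy axis-factorized sin/cos positional embedding (illustrative only):
# one sin/cos table per axis, broadcast over the flattened T*H*W grid.
import torch

def toy_axis_pos_emb(dim: int, latent_shape=(5, 40, 64)) -> torch.Tensor:
    T, H, W = latent_shape
    d = dim // 3  # channels per axis (assumes d is even)

    def axis(n: int) -> torch.Tensor:
        pos = torch.arange(n, dtype=torch.float32)[:, None]
        freq = torch.exp(torch.arange(0, d, 2, dtype=torch.float32) * (-4.0 / d))
        ang = pos * freq
        return torch.cat([ang.sin(), ang.cos()], dim=-1)  # (n, d)

    t = axis(T).repeat_interleave(H * W, dim=0)           # varies slowest
    h = axis(H).repeat(T, 1).repeat_interleave(W, dim=0)
    w = axis(W).repeat(T * H, 1)                          # varies fastest
    return torch.cat([t, h, w], dim=-1)                   # (T*H*W, 3*d)

print(toy_axis_pos_emb(384).shape)  # torch.Size([12800, 384])
```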
open(f"{args.output_dir}/metadata.json", "w") as f: json.dump(metadata, f) + return if __name__ == "__main__": diff --git a/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py b/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py index a416394..45aad16 100644 --- a/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py +++ b/cosmos1/models/guardrail/face_blur_filter/face_blur_filter.py @@ -18,9 +18,9 @@ import numpy as np import torch -from pytorch_retinaface.data import cfg_re50 -from pytorch_retinaface.layers.functions.prior_box import PriorBox -from pytorch_retinaface.models.retinaface import RetinaFace +from retinaface.data import cfg_re50 +from retinaface.layers.functions.prior_box import PriorBox +from retinaface.models.retinaface import RetinaFace from torch.utils.data import DataLoader, TensorDataset from tqdm import tqdm diff --git a/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py b/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py index 27e69ce..9f5c01e 100644 --- a/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py +++ b/cosmos1/models/guardrail/face_blur_filter/retinaface_utils.py @@ -15,7 +15,7 @@ import numpy as np import torch -from pytorch_retinaface.utils.nms.py_cpu_nms import py_cpu_nms +from retinaface.utils.nms.py_cpu_nms import py_cpu_nms from cosmos1.utils import log diff --git a/cosmos1/scripts/ip_header.py b/cosmos1/scripts/ip_header.py index 1f702a1..6b60833 100644 --- a/cosmos1/scripts/ip_header.py +++ b/cosmos1/scripts/ip_header.py @@ -73,6 +73,31 @@ def get_header(ext: str = "py", old: str | bool = False) -> list[str]: return header +def get_header_ea(ext: str = "py", old: str | bool = False) -> list[str]: + # This is the raw header. + # The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025). + # The license reference will be the finalized version of the license linked above. + header = [ + "The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).", + "The license reference will be the finalized version of the license linked above.", + ] + # Reformat according to different file extensions. + if ext == ".py" and old: + if old == "single": + header = ["'''"] + header + ["'''"] + elif old == "double": + header = ['"""'] + header + ['"""'] + else: + raise NotImplementedError + elif ext in (".py", ".yaml"): + header = [("# " + line if line else "#") for line in header] + elif ext in (".c", ".cpp", ".cu", ".h", ".cuh"): + header = ["/*"] + [(" * " + line if line else " *") for line in header] + [" */"] + else: + raise NotImplementedError + return header + + def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None: if file.endswith("__init__.py"): return @@ -81,19 +106,14 @@ def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None: content = open(file).read().splitlines() # Check if cosmos header (with a blank newline) is properly embedded. header = get_header(ext=ext) + header_ea = get_header_ea(ext=ext) if fix: # If header passes format check, then just exit if _check_header(content, header): return + if _check_header(content, header_ea): + return print(f"fixing: {file}") - # Remove old header if exists. - if ext == ".py": - for header_old in [ - get_header(ext=ext, old="single"), - get_header(ext=ext, old="double"), - ]: - if content[: len(header_old)] == header_old: - content = content[len(header_old) :] # Clean up leading blank lines. 
diff --git a/cosmos1/scripts/ip_header.py b/cosmos1/scripts/ip_header.py
index 1f702a1..6b60833 100644
--- a/cosmos1/scripts/ip_header.py
+++ b/cosmos1/scripts/ip_header.py
@@ -73,6 +73,31 @@ def get_header(ext: str = "py", old: str | bool = False) -> list[str]:
     return header
 
 
+def get_header_ea(ext: str = "py", old: str | bool = False) -> list[str]:
+    # This is the raw header.
+    # The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).
+    # The license reference will be the finalized version of the license linked above.
+    header = [
+        "The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).",
+        "The license reference will be the finalized version of the license linked above.",
+    ]
+    # Reformat according to different file extensions.
+    if ext == ".py" and old:
+        if old == "single":
+            header = ["'''"] + header + ["'''"]
+        elif old == "double":
+            header = ['"""'] + header + ['"""']
+        else:
+            raise NotImplementedError
+    elif ext in (".py", ".yaml"):
+        header = [("# " + line if line else "#") for line in header]
+    elif ext in (".c", ".cpp", ".cu", ".h", ".cuh"):
+        header = ["/*"] + [(" * " + line if line else " *") for line in header] + [" */"]
+    else:
+        raise NotImplementedError
+    return header
+
+
 def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None:
     if file.endswith("__init__.py"):
         return
@@ -81,19 +106,14 @@ def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None:
     content = open(file).read().splitlines()
     # Check if cosmos header (with a blank newline) is properly embedded.
     header = get_header(ext=ext)
+    header_ea = get_header_ea(ext=ext)
     if fix:
         # If header passes format check, then just exit
         if _check_header(content, header):
             return
+        if _check_header(content, header_ea):
+            return
         print(f"fixing: {file}")
-        # Remove old header if exists.
-        if ext == ".py":
-            for header_old in [
-                get_header(ext=ext, old="single"),
-                get_header(ext=ext, old="double"),
-            ]:
-                if content[: len(header_old)] == header_old:
-                    content = content[len(header_old) :]
         # Clean up leading blank lines.
         while len(content) > 0 and not content[0]:
             content.pop(0)
@@ -104,7 +124,7 @@ def apply_file(file: str, results: dict[str, int], fix: bool = False) -> None:
         for line in content:
             file_obj.write(line + "\n")
     else:
-        if not _check_header(content, header):
+        if not _check_header(content, header) and not _check_header(content, header_ea):
             bad_header = colorize("BAD HEADER", color="red", bold=True)
             print(f"{bad_header}: {file}")
             results[file] = 1
diff --git a/requirements.txt b/requirements.txt
index c4cdcf1..9f6e307 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,7 +27,7 @@ mediapy
 nltk
 peft
 pillow
-pytorch_retinaface @ git+https://github.com/NVlabs/Pytorch_Retinaface.git@b843f45
+retinaface-py
 sentencepiece
 termcolor
-transformers==4.45.0
+transformers==4.48.0
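The ip_header.py change makes the checker accept either the standard header or the new early-access header, in both fix and report modes. A toy model of that dual-header check; it assumes `_check_header` is a prefix comparison, which matches the removed `content[: len(header_old)] == header_old` logic in the same file:

```python
# Toy model of the dual-header check (assumes _check_header is a prefix
# comparison; the standard header below is a placeholder, not the real one).
def _check_header(content: list[str], header: list[str]) -> bool:
    return content[: len(header)] == header

standard = ["# SPDX-License-Identifier: Apache-2.0"]  # placeholder standard header
early_access = [
    "# The early-access software is governed by the NVIDIA Evaluation License Agreement – EA Cosmos Code (v. Feb 2025).",
    "# The license reference will be the finalized version of the license linked above.",
]

content = early_access + ["", "import torch"]
# A file passes if it carries either header, mirroring the patched apply_file().
print(_check_header(content, standard) or _check_header(content, early_access))  # True
```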