From a787e6028e6f2b3abdb2502ac8f2a6df3c5dc121 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 25 Feb 2025 16:04:46 +0100 Subject: [PATCH 1/2] gguf: run generate-llm --- packages/gguf/scripts/generate-llm.ts | 35 +++++++-- packages/gguf/src/transformer-llm.ts | 109 +++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 9 deletions(-) diff --git a/packages/gguf/scripts/generate-llm.ts b/packages/gguf/scripts/generate-llm.ts index a833959ee..5ea2c677f 100644 --- a/packages/gguf/scripts/generate-llm.ts +++ b/packages/gguf/scripts/generate-llm.ts @@ -5,7 +5,10 @@ import { writeFileSync } from "node:fs"; -const SOURCE_CPP_URL = "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/src/llama.cpp"; +const SOURCE_CPP_URLS = [ + "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/src/llama-arch.cpp", + "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/src/llama-model.cpp", +]; const DEST_FILE_PATH = "./src/transformer-llm.ts"; const DEST_COMMON_SOURCE = ` /** This file is auto-generated by generate-llm.ts */ @@ -90,10 +93,26 @@ const KV_TYPE = { LLM_KV_EXPERT_SHARED_COUNT: "number", LLM_KV_EXPERT_WEIGHTS_SCALE: "number", LLM_KV_ROPE_SCALING_YARN_LOG_MUL: "number", + LLM_KV_ROPE_DIMENSION_COUNT: "number", + LLM_KV_ROPE_DIMENSION_SECTIONS: "number[]", LLM_KV_ATTENTION_Q_LORA_RANK: "number", LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT: "number", LLM_KV_DECODER_START_TOKEN_ID: "number", LLM_KV_USE_PARALLEL_RESIDUAL: "boolean", + LLM_KV_WKV_HEAD_SIZE: "number", + LLM_KV_TIME_MIX_EXTRA_DIM: "number", + LLM_KV_TIME_DECAY_EXTRA_DIM: "number", + LLM_KV_RESCALE_EVERY_N_LAYERS: "boolean", + LLM_KV_TOKEN_SHIFT_COUNT: "boolean", + LLM_KV_SWIN_NORM: "boolean", + LLM_KV_ATTENTION_GROUPNORM_EPS: "number", + LLM_KV_ATTENTION_GROUPNORM_GROUPS: "number", + LLM_KV_ATTENTION_SCALE: "number", + LLM_KV_EMBEDDING_SCALE: "number", + LLM_KV_RESIDUAL_SCALE: "number", + LLM_KV_SSM_DT_B_C_RMS: "boolean", + LLM_KV_EXPERT_WEIGHTS_NORM: "boolean", + LLM_KV_EXPERT_GATING_FUNC: "boolean", }; interface Arch { @@ -105,8 +124,13 @@ interface Arch { } async function main() { - const res = await fetch(SOURCE_CPP_URL); - const cppSource = await res.text(); + const cppSources = await Promise.all( + SOURCE_CPP_URLS.map(async (url) => { + const res = await fetch(url); + return await res.text(); + }) + ); + const cppSource = cppSources.join("\n"); ///////////////////////////////////// // extract list of all architectures @@ -143,6 +167,7 @@ async function main() { constToKVName[matched.groups.cppConst] = matched.groups.name; } } + console.log("constToKVName", constToKVName); ///////////////////////////////////// // extract list of tensor names based on architecture @@ -172,8 +197,8 @@ async function main() { let insideLoadHParamsFn = false; currCppConst = ""; for (const line of cppSource.split("\n")) { - // check if current line is function llm_load_hparams() - if (line.startsWith("static void llm_load_hparams")) { + // check if current line is function llama_model::load_hparams() + if (line.startsWith("void llama_model::load_hparams")) { insideLoadHParamsFn = true; } if (!insideLoadHParamsFn) { diff --git a/packages/gguf/src/transformer-llm.ts b/packages/gguf/src/transformer-llm.ts index 56e2b1bc8..5246243df 100644 --- a/packages/gguf/src/transformer-llm.ts +++ b/packages/gguf/src/transformer-llm.ts @@ -54,6 +54,7 @@ export enum TransformerLLMPoolingType { export const LLM_ARCHITECTURES = [ "llama", + "deci", "falcon", "grok", "gpt2", @@ -71,34 +72,52 @@ export const LLM_ARCHITECTURES = [ "qwen", "qwen2", "qwen2moe", + "qwen2vl", "phi2", "phi3", + "phimoe", "plamo", "codeshell", "orion", "internlm2", "minicpm", + "minicpm3", "gemma", "gemma2", "starcoder2", "mamba", "xverse", "command-r", + "cohere2", "dbrx", "olmo", + "olmo2", + "olmoe", "openelm", "arctic", + "deepseek", "deepseek2", "chatglm", "bitnet", "t5", "t5encoder", "jais", + "nemotron", + "exaone", + "rwkv6", + "rwkv6qwen2", + "granite", + "granitemoe", + "chameleon", + "wavtokenizer-dec", ] as const; type LLMArchitecture = (typeof LLM_ARCHITECTURES)[number]; export type ArchLlama = TransformerLLMBase<"llama"> & { "llama.attention.layer_norm_rms_epsilon": number; }; +export type ArchDeci = TransformerLLMBase<"deci"> & { + "deci.attention.layer_norm_rms_epsilon": number; +}; export type ArchFalcon = TransformerLLMBase<"falcon"> & { "falcon.attention.layer_norm_epsilon": number; }; @@ -130,19 +149,16 @@ export type ArchRefact = TransformerLLMBase<"refact"> & { export type ArchBert = TransformerLLMBase<"bert"> & { "bert.attention.layer_norm_epsilon": number; "bert.attention.causal": boolean; - "tokenizer.ggml.token_type_count": number; "bert.pooling_type": TransformerLLMPoolingType; }; export type ArchNomicBert = TransformerLLMBase<"nomic-bert"> & { "nomic-bert.attention.layer_norm_epsilon": number; "nomic-bert.attention.causal": boolean; - "tokenizer.ggml.token_type_count": number; "nomic-bert.pooling_type": TransformerLLMPoolingType; }; export type ArchJinaBertV2 = TransformerLLMBase<"jina-bert-v2"> & { "jina-bert-v2.attention.layer_norm_epsilon": number; "jina-bert-v2.attention.causal": boolean; - "tokenizer.ggml.token_type_count": number; "jina-bert-v2.pooling_type": TransformerLLMPoolingType; }; export type ArchBloom = TransformerLLMBase<"bloom"> & { @@ -162,6 +178,9 @@ export type ArchQwen2moe = TransformerLLMBase<"qwen2moe"> & { "qwen2moe.expert_shared_feed_forward_length": number; "qwen2moe.attention.layer_norm_rms_epsilon": number; }; +export type ArchQwen2vl = TransformerLLMBase<"qwen2vl"> & { + "qwen2vl.rope.dimension_sections": number[]; +}; export type ArchPhi2 = TransformerLLMBase<"phi2"> & { "phi2.attention.layer_norm_epsilon": number; }; @@ -169,6 +188,9 @@ export type ArchPhi3 = TransformerLLMBase<"phi3"> & { "phi3.attention.layer_norm_rms_epsilon": number; "phi3.attention.sliding_window": number; }; +export type ArchPhimoe = TransformerLLMBase<"phimoe"> & { + "phimoe.attention.layer_norm_rms_epsilon": number; +}; export type ArchPlamo = TransformerLLMBase<"plamo"> & { "plamo.attention.layer_norm_rms_epsilon": number; }; @@ -183,6 +205,14 @@ export type ArchInternlm2 = TransformerLLMBase<"internlm2"> & { }; export type ArchMinicpm = TransformerLLMBase<"minicpm"> & { "minicpm.attention.layer_norm_rms_epsilon": number; + "minicpm.embedding_scale": number; + "minicpm.residual_scale": number; + "minicpm.logit_scale": number; +}; +export type ArchMinicpm3 = TransformerLLMBase<"minicpm3"> & { + "minicpm3.attention.layer_norm_rms_epsilon": number; + "minicpm3.attention.q_lora_rank": number; + "minicpm3.attention.kv_lora_rank": number; }; export type ArchGemma = TransformerLLMBase<"gemma"> & { "gemma.attention.layer_norm_rms_epsilon": number; @@ -201,6 +231,7 @@ export type ArchMamba = TransformerLLMBase<"mamba"> & { "mamba.ssm.inner_size": number; "mamba.ssm.state_size": number; "mamba.ssm.time_step_rank": number; + "mamba.ssm.dt_b_c_rms": boolean; "mamba.attention.layer_norm_rms_epsilon": number; }; export type ArchXverse = TransformerLLMBase<"xverse"> & { @@ -210,6 +241,11 @@ export type ArchCommandR = TransformerLLMBase<"command-r"> & { "command-r.logit_scale": number; "command-r.attention.layer_norm_epsilon": number; }; +export type ArchCohere2 = TransformerLLMBase<"cohere2"> & { + "cohere2.attention.sliding_window": number; + "cohere2.logit_scale": number; + "cohere2.attention.layer_norm_epsilon": number; +}; export type ArchDbrx = TransformerLLMBase<"dbrx"> & { "dbrx.attention.layer_norm_epsilon": number; "dbrx.attention.clamp_kqv": number; @@ -218,12 +254,25 @@ export type ArchOlmo = TransformerLLMBase<"olmo"> & { "olmo.attention.layer_norm_epsilon": number; "olmo.attention.clamp_kqv": number; }; +export type ArchOlmo2 = TransformerLLMBase<"olmo2"> & { + "olmo2.attention.layer_norm_rms_epsilon": number; +}; +export type ArchOlmoe = TransformerLLMBase<"olmoe"> & { + "olmoe.attention.layer_norm_rms_epsilon": number; +}; export type ArchOpenelm = TransformerLLMBase<"openelm"> & { "openelm.attention.layer_norm_rms_epsilon": number; }; export type ArchArctic = TransformerLLMBase<"arctic"> & { "arctic.attention.layer_norm_rms_epsilon": number; }; +export type ArchDeepseek = TransformerLLMBase<"deepseek"> & { + "deepseek.attention.layer_norm_rms_epsilon": number; + "deepseek.leading_dense_block_count": number; + "deepseek.expert_feed_forward_length": number; + "deepseek.expert_shared_count": number; + "deepseek.expert_weights_scale": number; +}; export type ArchDeepseek2 = TransformerLLMBase<"deepseek2"> & { "deepseek2.attention.layer_norm_rms_epsilon": number; "deepseek2.leading_dense_block_count": number; @@ -232,6 +281,8 @@ export type ArchDeepseek2 = TransformerLLMBase<"deepseek2"> & { "deepseek2.expert_feed_forward_length": number; "deepseek2.expert_shared_count": number; "deepseek2.expert_weights_scale": number; + "deepseek2.expert_weights_norm": boolean; + "deepseek2.expert_gating_func": boolean; "deepseek2.rope.scaling.yarn_log_multiplier": number; }; export type ArchChatglm = TransformerLLMBase<"chatglm"> & { @@ -253,9 +304,44 @@ export type ArchJais = TransformerLLMBase<"jais"> & { "jais.attention.layer_norm_epsilon": number; "jais.attention.max_alibi_bias": number; }; +export type ArchNemotron = TransformerLLMBase<"nemotron"> & { + "nemotron.attention.layer_norm_epsilon": number; +}; +export type ArchExaone = TransformerLLMBase<"exaone"> & { + "exaone.attention.layer_norm_rms_epsilon": number; +}; +export type ArchRwkv6 = TransformerLLMBase<"rwkv6">; +export type ArchRwkv6qwen2 = TransformerLLMBase<"rwkv6qwen2"> & { + "rwkv6qwen2.attention.layer_norm_epsilon": number; + "rwkv6qwen2.attention.layer_norm_rms_epsilon": number; + "rwkv6qwen2.wkv.head_size": number; + "rwkv6qwen2.time_mix_extra_dim": number; + "rwkv6qwen2.time_decay_extra_dim": number; + "rwkv6qwen2.rescale_every_n_layers": boolean; + "rwkv6qwen2.token_shift_count": boolean; +}; +export type ArchGranite = TransformerLLMBase<"granite">; +export type ArchGraniteMoe = TransformerLLMBase<"granitemoe"> & { + "granitemoe.attention.layer_norm_rms_epsilon": number; + "granitemoe.logit_scale": number; + "granitemoe.residual_scale": number; + "granitemoe.embedding_scale": number; + "granitemoe.attention.scale": number; +}; +export type ArchChameleon = TransformerLLMBase<"chameleon"> & { + "chameleon.attention.layer_norm_rms_epsilon": number; + "chameleon.swin_norm": boolean; +}; +export type ArchWavtokenizerDec = TransformerLLMBase<"wavtokenizer-dec"> & { + "wavtokenizer-dec.attention.layer_norm_epsilon": number; + "wavtokenizer-dec.attention.group_norm_epsilon": number; + "wavtokenizer-dec.attention.group_norm_groups": number; + "wavtokenizer-dec.attention.causal": boolean; +}; export type TransformerLLM = | ArchLlama + | ArchDeci | ArchFalcon | ArchGrok | ArchGpt2 @@ -273,26 +359,41 @@ export type TransformerLLM = | ArchQwen | ArchQwen2 | ArchQwen2moe + | ArchQwen2vl | ArchPhi2 | ArchPhi3 + | ArchPhimoe | ArchPlamo | ArchCodeshell | ArchOrion | ArchInternlm2 | ArchMinicpm + | ArchMinicpm3 | ArchGemma | ArchGemma2 | ArchStarcoder2 | ArchMamba | ArchXverse | ArchCommandR + | ArchCohere2 | ArchDbrx | ArchOlmo + | ArchOlmo2 + | ArchOlmoe | ArchOpenelm | ArchArctic + | ArchDeepseek | ArchDeepseek2 | ArchChatglm | ArchBitnet | ArchT5 | ArchT5encoder - | ArchJais; + | ArchJais + | ArchNemotron + | ArchExaone + | ArchRwkv6 + | ArchRwkv6qwen2 + | ArchGranite + | ArchGraniteMoe + | ArchChameleon + | ArchWavtokenizerDec; From 60251c5fe4a7cbfeab6e4b39d66ac781248f377f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 25 Feb 2025 16:48:13 +0100 Subject: [PATCH 2/2] Update packages/gguf/scripts/generate-llm.ts Co-authored-by: Julien Chaumond --- packages/gguf/scripts/generate-llm.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/gguf/scripts/generate-llm.ts b/packages/gguf/scripts/generate-llm.ts index 5ea2c677f..985d0c5a6 100644 --- a/packages/gguf/scripts/generate-llm.ts +++ b/packages/gguf/scripts/generate-llm.ts @@ -6,8 +6,8 @@ import { writeFileSync } from "node:fs"; const SOURCE_CPP_URLS = [ - "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/src/llama-arch.cpp", - "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/src/llama-model.cpp", + "https://raw.githubusercontent.com/ggml-org/llama.cpp/master/src/llama-arch.cpp", + "https://raw.githubusercontent.com/ggml-org/llama.cpp/master/src/llama-model.cpp", ]; const DEST_FILE_PATH = "./src/transformer-llm.ts"; const DEST_COMMON_SOURCE = `