Appearance
Transformer 推理图 — 代码走读
src/llama-model.cpp — Forward Pass
llama-model.cpp 是 llama.cpp 最大的文件,包含所有模型架构的 forward pass 实现。
计算图构建入口
cpp
// Invoked on every inference call: creates a fresh compute graph, builds the
// forward pass for the model's architecture, and expands the graph from the
// resulting output tensor. `ctx` and `model` come from the surrounding scope
// (simplified walkthrough code).
struct ggml_cgraph * llm_build_graph(llama_context & lctx, const llama_batch & batch) {
    // Create a new compute graph
    struct ggml_cgraph * graph = ggml_new_graph(ctx);

    // Output tensor produced by the architecture-specific builder
    struct ggml_tensor * result = nullptr;

    // Dispatch on the model architecture
    switch (model.arch) {
        case LLM_ARCH_LLAMA:
            result = llm_build_llama(lctx, batch);
            break;
        case LLM_ARCH_GPT2:
            result = llm_build_gpt2(lctx, batch);
            break;
        // ... other architectures
        default:
            // An unknown architecture must not silently yield an empty graph
            GGML_ABORT("unsupported model architecture");
    }

    // Register the output tensor (and everything it depends on) in the graph;
    // without this step the returned graph contains no nodes.
    ggml_build_forward_expand(graph, result);

    return graph;
}
LLaMA Forward Pass
cpp
// Builds the LLaMA forward pass and returns the output tensor after the final
// output projection. Each layer applies two residual connections: one around
// the attention sub-block and one around the FFN sub-block. `ctx`, `model`,
// `layers`, `tokens`, `positions`, `mask`, `K_cache` and `V_cache` come from
// the surrounding scope (simplified walkthrough code).
struct ggml_tensor * llm_build_llama(llama_context & lctx, const llama_batch & batch) {
    // 1. Token embedding
    struct ggml_tensor * cur = ggml_get_rows(ctx, model.tok_embd, tokens);

    // 2. Process the layers one by one
    for (int il = 0; il < n_layer; il++) {
        // Keep the un-normalized layer input: it is the residual for the
        // attention sub-block and must be captured before `cur` is normed.
        struct ggml_tensor * inpSA = cur;

        // 2a. Attention
        cur = llm_build_norm(ctx, cur, layers[il].attn_norm);
        struct ggml_tensor * Q = ggml_mul_mat(ctx, layers[il].wq, cur);
        struct ggml_tensor * K = ggml_mul_mat(ctx, layers[il].wk, cur);
        struct ggml_tensor * V = ggml_mul_mat(ctx, layers[il].wv, cur);

        // 2b. RoPE
        Q = ggml_rope(ctx, Q, positions, n_rot, rope_type);
        K = ggml_rope(ctx, K, positions, n_rot, rope_type);

        // 2c. Attention + KV cache
        // (K and V are stored into the cache)
        cur = llm_build_attn(ctx, Q, K_cache, V_cache, mask);

        // 2d. Output projection + residual (adds the layer input saved above)
        cur = ggml_mul_mat(ctx, layers[il].wo, cur);
        cur = ggml_add(ctx, cur, inpSA);

        // 2e. FFN (SwiGLU); remember its input for the second residual
        struct ggml_tensor * ffn_inp = cur;
        cur = llm_build_norm(ctx, cur, layers[il].ffn_norm);
        cur = llm_build_ffn(ctx, cur, layers[il]);

        // 2f. Residual
        cur = ggml_add(ctx, cur, ffn_inp);
    }

    // 3. Final norm
    cur = llm_build_norm(ctx, cur, model.output_norm);

    // 4. Output projection
    cur = ggml_mul_mat(ctx, model.output, cur);

    return cur;
}
src/llama-graph.cpp — 图构建辅助
提供计算图构建的辅助函数:
cpp
// Builds a normalization layer (RMSNorm): applies ggml_rms_norm to `cur`
// using `epsilon` from the surrounding scope, then multiplies the result by
// `weight` and returns it.
struct ggml_tensor * llm_build_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,
        struct ggml_tensor  * weight) {
    struct ggml_tensor * normed = ggml_rms_norm(ctx, cur, epsilon);
    return ggml_mul(ctx, normed, weight);
}
关键函数索引
| 函数 | 文件 | 说明 |
|---|---|---|
| llm_build_graph | llama-model.cpp | 构建完整推理图 |
| llm_build_llama | llama-model.cpp | LLaMA 架构 forward pass |
| llm_build_attn | llama-graph.cpp | 注意力计算 |
| llm_build_ffn | llama-graph.cpp | FFN (SwiGLU) |
| llm_build_norm | llama-graph.cpp | RMSNorm |
| ggml_rope | ggml.c | RoPE 位置编码 |