Llama 3.1 Local Deployment and API Service Setup

This post covers the local deployment workflow for the Llama 3.1-8B model: setting up a Conda environment, installing PyTorch and the Transformers library, loading and testing the model, and building a long-running API service with FastAPI. It also explains how to call the service across machines through an SSH tunnel, and walks through troubleshooting common problems such as occupied ports and unstable connections.

Deployment server: H100 80 GB    Model: Llama-3.1-8B-Instruct

I. Environment Preparation
1. Create a Conda virtual environment
Python 3.10 or later is recommended.
conda create -n llama3 python=3.11
2. Activate the environment
conda activate llama3
Check the CUDA version:
nvidia-smi
Pick a PyTorch build that matches the CUDA version reported above (its CUDA version should not exceed the highest version the host driver supports) and install it from a mirror:
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
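After installation, a quick sanity check (a small sketch, not part of the original steps) confirms that this PyTorch build can actually see the GPU:

# Quick GPU sanity check for the freshly installed PyTorch build.
import torch

print(torch.__version__)              # installed PyTorch version
print(torch.version.cuda)             # CUDA version the wheel was built against
print(torch.cuda.is_available())      # should print True on the H100 host
print(torch.cuda.get_device_name(0))  # should report the H100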
Upgrade pip:
python -m pip install --upgrade pip
Check that wget and md5sum are available:
wget --version
md5sum --version
If either is missing, install it:
apt-get install wget
apt-get install coreutils  # md5sum is provided by the coreutils package
Install the latest Transformers and Accelerate:
pip install --upgrade transformers
pip install accelerate -i https://pypi.tuna.tsinghua.edu.cn/simple
Download the model from Hugging Face: https://huggingface.co/meta-llama/Llama-3.1-8B
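The post does not show a download command. One option is huggingface_hub's snapshot_download (huggingface_hub is installed alongside transformers); note that the meta-llama repositories are gated, so access must be requested first and a Hugging Face token supplied. The local directory and token below are placeholders:

# Sketch: download the instruct weights to a local directory with huggingface_hub.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Meta-Llama-3.1-8B-Instruct",  # instruct variant used in the rest of this post
    local_dir="<MODEL_PATH>",                         # reuse this path as MODEL_PATH in the server scripts below
    token="<HF_TOKEN>",                               # token for an account with approved access
)

The following quick test then verifies that the model loads and generates text: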
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto"
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"}
]

outputs = pipeline(messages, max_new_tokens=256)
print(outputs[0]["generated_text"][-1])
To expose the model as an HTTP service, install FastAPI, Uvicorn, and Pydantic:
pip install fastapi uvicorn pydantic -i https://pypi.tuna.tsinghua.edu.cn/simple
Save the following service script as api_server.py:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_PATH = "<MODEL_PATH>"

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

class ChatResponse(BaseModel):
    response: str
    usage: dict

app = FastAPI(title="Llama 3.1 API", version="1.0")

model = None
tokenizer = None
pipe = None

@app.on_event("startup")
async def startup_event():
    global model, tokenizer, pipe
    logger.info("Loading model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        # The model is already placed on the GPU by device_map="auto",
        # so do not pass a device argument to the pipeline as well.
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        logger.info(f"Model loaded. Device: {model.device}")
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise

@app.get("/")
async def root():
    return {"service": "Llama 3.1 API", "status": "running", "model": "Llama-3.1-8B-Instruct", "device": str(model.device) if model else "not loaded"}

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

@app.post("/chat/completions", response_model=ChatResponse)
async def chat_completions(request: ChatRequest):
    try:
        text = tokenizer.apply_chat_template([msg.model_dump() for msg in request.messages], tokenize=False, add_generation_prompt=True)
        outputs = pipe(text, max_new_tokens=request.max_tokens, temperature=request.temperature, top_p=request.top_p, do_sample=True, pad_token_id=tokenizer.eos_token_id)
        response_text = outputs[0]['generated_text']
        # Crude heuristic: keep only the text after the last "assistant" header emitted by the chat template.
        if "assistant" in response_text:
            response_text = response_text.split("assistant")[-1].strip()
        else:
            response_text = response_text.replace(text, "").strip()
        input_tokens = len(tokenizer.encode(text))
        output_tokens = len(tokenizer.encode(response_text))
        return ChatResponse(
            response=response_text,
            usage={"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}
        )
    except Exception as e:
        logger.error(f"Generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
Run the server in the foreground:
python api_server.py
Verify that it responds:
curl http://localhost:8000/
curl http://localhost:8000/health
curl -X POST "http://localhost:8000/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{ "messages": [ {"role": "system", "content": "You are a helpful AI assistant"}, {"role": "user", "content": "What is the capital of China?"} ], "max_tokens": 100 }'
For long-running deployments, an improved version of the server, saved as h100_server.py, replaces the deprecated startup event with a lifespan handler and exposes an OpenAI-compatible /v1/chat/completions endpoint:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict
from typing import List, Optional
import uvicorn
import logging
import time
import os
from contextlib import asynccontextmanager

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_PATH = "<MODEL_PATH>"

model = None
tokenizer = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    global model, tokenizer
    logger.info("Loading Llama-3.1-8B-Instruct on the H100...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        logger.info(f"✅ Model loaded. Device: {model.device}")
        yield
    except Exception as e:
        logger.error(f"❌ Failed to load model: {e}")
        raise RuntimeError("Model initialization failed") from e
    finally:
        if model is not None:
            del model
            torch.cuda.empty_cache()

app = FastAPI(
    title="Llama 3.1 Instruct API",
    description="OpenAI-compatible API for Llama-3.1-8B-Instruct on H100",
    version="1.0",
    lifespan=lifespan
)

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    # protected_namespaces=() allows a field literally named "model" without a Pydantic warning.
    model_config = ConfigDict(extra="ignore", protected_namespaces=())
    messages: Optional[List[ChatMessage]] = None
    prompt: Optional[str] = None
    model: str = "llama-3.1-8b"
    max_tokens: int = 200
    temperature: float = 0.7
    top_p: float = 0.9
    stream: bool = False

@app.get("/")
async def root():
    return {"service": "Llama 3.1 Instruct API", "status": "running", "device": str(model.device) if model else "uninitialized"}

@app.get("/health")
async def health_check():
    return {"status": "healthy", "model_loaded": model is not None}

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet")
    if request.stream:
        raise HTTPException(status_code=400, detail="Streaming is not supported yet")
    try:
        if request.messages is None:
            raise ValueError("messages must not be empty")
        messages_dict = [msg.model_dump() for msg in request.messages]
        # apply_chat_template with return_tensors="pt" returns the input_ids tensor directly.
        input_ids = tokenizer.apply_chat_template(
            messages_dict, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        input_length = input_ids.shape[1]
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, max_new_tokens=request.max_tokens, temperature=request.temperature, top_p=request.top_p, do_sample=True, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, use_cache=True)
        gen_time = time.time() - start_time
        new_tokens = outputs[0][input_length:]
        response_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        logger.info(f"Generated {len(new_tokens)} tokens in {gen_time:.2f}s")
        return {
            "id": f"cmpl-{int(time.time())}",
            "created": int(time.time()),
            "model": request.model,
            "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
            "usage": {"prompt_tokens": input_length, "completion_tokens": len(new_tokens), "total_tokens": input_length + len(new_tokens)}
        }
    except Exception as e:
        logger.error(f"❌ Generation error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="info", workers=1)
Start the server on the remote machine:
python h100_server.py
To call it from another machine, open an SSH tunnel from your local terminal (replace <REMOTE_IP> with the server's IP and <SSH_PORT> with its SSH port):
ssh -L 8000:localhost:8000 root@<REMOTE_IP> -p <SSH_PORT>
Check that the local port is listening:
netstat -tuln | grep 8000
With the tunnel in place, the service can be called from the local machine through the OpenAI Python SDK by pointing the client at the forwarded port:

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

response = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Hello, please introduce yourself."}
    ],
    max_tokens=150
)
print(response.choices[0].message.content)
Troubleshooting common issues:

If port 8000 is already in use, change the port argument in uvicorn.run, or find and kill the process occupying it:
netstat -tulpn | grep :8000
kill -9 <PID>

If the SSH tunnel is unstable or fails to connect, establish it with the -4 (force IPv4), -N (no remote command), and -L options:
ssh -4 -N -L 8000:localhost:8000 root@<REMOTE_IP> -p <SSH_PORT>
For long-lived use, you can write a script that reconnects automatically when the tunnel drops; set the password environment variable and run it:
export SSH_TUNNEL_PASSWORD="your_password"
./ssh_tunnel.sh
The script should detect whether the local port is still listening, restart the SSH tunnel when it is not, and configure keep-alive options such as ServerAliveInterval so that the connection stays up.
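The post does not include ssh_tunnel.sh itself. As an illustration of the logic it describes (check the forwarded port, restart the tunnel when it is down, keep the connection alive), here is a minimal sketch, written in Python for brevity and assuming SSH key authentication; a password-based variant would wrap the same ssh command with a tool such as sshpass:

# Sketch of the reconnect loop described above (not the original ssh_tunnel.sh).
# <REMOTE_IP> and <SSH_PORT> are placeholders to fill in.
import socket
import subprocess
import time

LOCAL_PORT = 8000
CHECK_INTERVAL = 30  # seconds between port checks

def tunnel_is_up(port: int) -> bool:
    # The tunnel counts as up if something accepts connections on the local port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        return s.connect_ex(("127.0.0.1", port)) == 0

while True:
    if not tunnel_is_up(LOCAL_PORT):
        # -4 forces IPv4, -N runs no remote command; the ServerAlive* options keep the session alive.
        subprocess.Popen([
            "ssh", "-4", "-N",
            "-o", "ServerAliveInterval=30",
            "-o", "ServerAliveCountMax=3",
            "-L", f"{LOCAL_PORT}:localhost:{LOCAL_PORT}",
            "root@<REMOTE_IP>", "-p", "<SSH_PORT>",
        ])
    time.sleep(CHECK_INTERVAL)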
