Llama 3.1 Local Deployment and API Service Setup

This post covers the local deployment workflow for the Llama 3.1-8B model: setting up a Conda environment, installing PyTorch and the Transformers library, loading and testing the model, and building a long-running API service with FastAPI. It also explains how to call the service across machines through an SSH tunnel, and walks through troubleshooting common problems such as occupied ports and unstable connections.

Deployment server: H100 80 GB    Model: Llama-3.1-8B-Instruct

I. Environment Preparation
1. Create a Conda virtual environment
Python 3.10 or later is recommended.
conda create -n llama3 python=3.11
2. Activate the environment
conda activate llama3
Check the CUDA version:
nvidia-smi
Pick a PyTorch build that matches the CUDA version reported above (its CUDA version should not exceed the highest version the host driver supports) and install it from a mirror:
pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 -i https://pypi.tuna.tsinghua.edu.cn/simple
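After installation, a quick sanity check (a small sketch, not part of the original steps) confirms that this PyTorch build can actually see the GPU:

# Quick GPU sanity check for the freshly installed PyTorch build.
import torch

print(torch.__version__)              # installed PyTorch version
print(torch.version.cuda)             # CUDA version the wheel was built against
print(torch.cuda.is_available())      # should print True on the H100 host
print(torch.cuda.get_device_name(0))  # should report the H100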
Upgrade pip:
python -m pip install --upgrade pip
Check that wget and md5sum are available:
wget --version
md5sum --version
If either is missing, install it:
apt-get install wget
apt-get install coreutils  # md5sum is provided by the coreutils package
Install the latest Transformers and Accelerate:
pip install --upgrade transformers
pip install accelerate -i https://pypi.tuna.tsinghua.edu.cn/simple
Download the model from Hugging Face: https://huggingface.co/meta-llama/Llama-3.1-8B
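The post does not show a download command. One option is huggingface_hub's snapshot_download (huggingface_hub is installed alongside transformers); note that the meta-llama repositories are gated, so access must be requested first and a Hugging Face token supplied. The local directory and token below are placeholders:

# Sketch: download the instruct weights to a local directory with huggingface_hub.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="meta-llama/Meta-Llama-3.1-8B-Instruct",  # instruct variant used in the rest of this post
    local_dir="<MODEL_PATH>",                         # reuse this path as MODEL_PATH in the server scripts below
    token="<HF_TOKEN>",                               # token for an account with approved access
)

The following quick test then verifies that the model loads and generates text: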
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto"
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"}
]

outputs = pipeline(messages, max_new_tokens=256)
print(outputs[0]["generated_text"][-1])
To expose the model as an HTTP service, install FastAPI, Uvicorn, and Pydantic:
pip install fastapi uvicorn pydantic -i https://pypi.tuna.tsinghua.edu.cn/simple
Save the following service script as api_server.py:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import uvicorn
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_PATH = "<MODEL_PATH>"

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

class ChatResponse(BaseModel):
    response: str
    usage: dict

app = FastAPI(title="Llama 3.1 API", version="1.0")

model = None
tokenizer = None
pipe = None

@app.on_event("startup")
async def startup_event():
    global model, tokenizer, pipe
    logger.info("Loading model...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        # The model is already placed on the GPU by device_map="auto",
        # so do not pass a device argument to the pipeline as well.
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
        logger.info(f"Model loaded. Device: {model.device}")
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise

@app.get("/")
async def root():
    return {"service": "Llama 3.1 API", "status": "running", "model": "Llama-3.1-8B-Instruct", "device": str(model.device) if model else "not loaded"}

@app.get("/health")
async def health_check():
    return {"status": "healthy"}

@app.post("/chat/completions", response_model=ChatResponse)
async def chat_completions(request: ChatRequest):
    try:
        text = tokenizer.apply_chat_template([msg.model_dump() for msg in request.messages], tokenize=False, add_generation_prompt=True)
        outputs = pipe(text, max_new_tokens=request.max_tokens, temperature=request.temperature, top_p=request.top_p, do_sample=True, pad_token_id=tokenizer.eos_token_id)
        response_text = outputs[0]['generated_text']
        # Crude heuristic: keep only the text after the last "assistant" header emitted by the chat template.
        if "assistant" in response_text:
            response_text = response_text.split("assistant")[-1].strip()
        else:
            response_text = response_text.replace(text, "").strip()
        input_tokens = len(tokenizer.encode(text))
        output_tokens = len(tokenizer.encode(response_text))
        return ChatResponse(
            response=response_text,
            usage={"prompt_tokens": input_tokens, "completion_tokens": output_tokens, "total_tokens": input_tokens + output_tokens}
        )
    except Exception as e:
        logger.error(f"Generation failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
Run the server in the foreground:
python api_server.py
Verify that it responds:
curl http://localhost:8000/
curl http://localhost:8000/health
curl -X POST "http://localhost:8000/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{ "messages": [ {"role": "system", "content": "You are a helpful AI assistant"}, {"role": "user", "content": "What is the capital of China?"} ], "max_tokens": 100 }'
For long-running deployments, an improved version of the server, saved as h100_server.py, replaces the deprecated startup event with a lifespan handler and exposes an OpenAI-compatible /v1/chat/completions endpoint:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, ConfigDict
from typing import List, Optional
import uvicorn
import logging
import time
import os
from contextlib import asynccontextmanager

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_PATH = "<MODEL_PATH>"

model = None
tokenizer = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    global model, tokenizer
    logger.info("Loading Llama-3.1-8B-Instruct on the H100...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True
        )
        logger.info(f"✅ Model loaded. Device: {model.device}")
        yield
    except Exception as e:
        logger.error(f"❌ Failed to load model: {e}")
        raise RuntimeError("Model initialization failed") from e
    finally:
        if model is not None:
            del model
            torch.cuda.empty_cache()

app = FastAPI(
    title="Llama 3.1 Instruct API",
    description="OpenAI-compatible API for Llama-3.1-8B-Instruct on H100",
    version="1.0",
    lifespan=lifespan
)

class ChatMessage(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    # protected_namespaces=() allows a field literally named "model" without a Pydantic warning.
    model_config = ConfigDict(extra="ignore", protected_namespaces=())
    messages: Optional[List[ChatMessage]] = None
    prompt: Optional[str] = None
    model: str = "llama-3.1-8b"
    max_tokens: int = 200
    temperature: float = 0.7
    top_p: float = 0.9
    stream: bool = False

@app.get("/")
async def root():
    return {"service": "Llama 3.1 Instruct API", "status": "running", "device": str(model.device) if model else "uninitialized"}

@app.get("/health")
async def health_check():
    return {"status": "healthy", "model_loaded": model is not None}

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet")
    if request.stream:
        raise HTTPException(status_code=400, detail="Streaming is not supported yet")
    try:
        if request.messages is None:
            raise ValueError("messages must not be empty")
        messages_dict = [msg.model_dump() for msg in request.messages]
        # apply_chat_template with return_tensors="pt" returns the input_ids tensor directly.
        input_ids = tokenizer.apply_chat_template(
            messages_dict, add_generation_prompt=True, return_tensors="pt"
        ).to(model.device)
        input_length = input_ids.shape[1]
        start_time = time.time()
        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, max_new_tokens=request.max_tokens, temperature=request.temperature, top_p=request.top_p, do_sample=True, pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id, use_cache=True)
        gen_time = time.time() - start_time
        new_tokens = outputs[0][input_length:]
        response_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        logger.info(f"Generated {len(new_tokens)} tokens in {gen_time:.2f}s")
        return {
            "id": f"cmpl-{int(time.time())}",
            "created": int(time.time()),
            "model": request.model,
            "choices": [{"index": 0, "message": {"role": "assistant", "content": response_text}, "finish_reason": "stop"}],
            "usage": {"prompt_tokens": input_length, "completion_tokens": len(new_tokens), "total_tokens": input_length + len(new_tokens)}
        }
    except Exception as e:
        logger.error(f"❌ Generation error: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    port = int(os.getenv("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=port, log_level="info", workers=1)
Start the server on the remote machine:
python h100_server.py
To call it from another machine, open an SSH tunnel from your local terminal (replace <REMOTE_IP> with the server's IP and <SSH_PORT> with its SSH port):
ssh -L 8000:localhost:8000 root@<REMOTE_IP> -p <SSH_PORT>
Check that the local port is listening:
netstat -tuln | grep 8000
With the tunnel in place, the service can be called from the local machine through the OpenAI Python SDK by pointing the client at the forwarded port:

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

response = client.chat.completions.create(
    model="llama-3.1-8b",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Hello, please introduce yourself."}
    ],
    max_tokens=150
)
print(response.choices[0].message.content)
Troubleshooting common issues:

If port 8000 is already in use, change the port argument in uvicorn.run, or find and kill the process occupying it:
netstat -tulpn | grep :8000
kill -9 <PID>

If the SSH tunnel is unstable or fails to connect, establish it with the -4 (force IPv4), -N (no remote command), and -L options:
ssh -4 -N -L 8000:localhost:8000 root@<REMOTE_IP> -p <SSH_PORT>
For long-lived use, you can write a script that reconnects automatically when the tunnel drops; set the password environment variable and run it:
export SSH_TUNNEL_PASSWORD="your_password"
./ssh_tunnel.sh
The script should detect whether the local port is still listening, restart the SSH tunnel when it is not, and configure keep-alive options such as ServerAliveInterval so that the connection stays up.
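The post does not include ssh_tunnel.sh itself. As an illustration of the logic it describes (check the forwarded port, restart the tunnel when it is down, keep the connection alive), here is a minimal sketch, written in Python for brevity and assuming SSH key authentication; a password-based variant would wrap the same ssh command with a tool such as sshpass:

# Sketch of the reconnect loop described above (not the original ssh_tunnel.sh).
# <REMOTE_IP> and <SSH_PORT> are placeholders to fill in.
import socket
import subprocess
import time

LOCAL_PORT = 8000
CHECK_INTERVAL = 30  # seconds between port checks

def tunnel_is_up(port: int) -> bool:
    # The tunnel counts as up if something accepts connections on the local port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        return s.connect_ex(("127.0.0.1", port)) == 0

while True:
    if not tunnel_is_up(LOCAL_PORT):
        # -4 forces IPv4, -N runs no remote command; the ServerAlive* options keep the session alive.
        subprocess.Popen([
            "ssh", "-4", "-N",
            "-o", "ServerAliveInterval=30",
            "-o", "ServerAliveCountMax=3",
            "-L", f"{LOCAL_PORT}:localhost:{LOCAL_PORT}",
            "root@<REMOTE_IP>", "-p", "<SSH_PORT>",
        ])
    time.sleep(CHECK_INTERVAL)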
