OpenSpiel 进阶教程：用 C++ 与 Python 实现自定义博弈算法

OpenSpiel 进阶教程：用 C++ 与 Python 实现自定义博弈算法 | 极客日志

def get_policy_network(num_actions):
    """Build a Haiku-transformed two-layer MLP policy.

    Args:
        num_actions: Width of the output layer, i.e. the number of logits.

    Returns:
        A Haiku `(init, apply)` pair. `apply` takes `(params, inputs)` with
        no RNG argument and returns a `distrax.Categorical` over the logits.
    """
    def network(inputs):
        # One hidden layer of 64 units with ReLU, then a linear logit head.
        h = hk.Linear(64)(inputs)
        h = jax.nn.relu(h)
        logits = hk.Linear(num_actions)(h)
        return distrax.Categorical(logits=logits)

    # `hk.Transformed` is only the (init, apply) container type; the function
    # has to go through `hk.transform` to be turned into that pair.
    # `without_apply_rng` drops the rng argument so that the
    # `policy_network.apply(params, state)` call style works.
    return hk.without_apply_rng(hk.transform(network))

def get_lola_update_fn(agent_id, policy_network, optimizer, pi_lr=0.001, lola_weight=1.0):
    """Create the policy update function for one agent with a LOLA correction.

    Args:
        agent_id: Key used to index `train_state.policy_params` and
            `train_state.policy_opt_states`.
        policy_network: Object whose `apply(params, info_state)` returns a
            distribution exposing `.logits`.
        optimizer: Callable invoked as `optimizer(grads, opt_state)` and
            returning `(updates, new_opt_state)`.
        pi_lr: Accepted for interface compatibility; not read by this function.
        lola_weight: Scale applied to the correction before it is subtracted
            from the policy gradient.

    Returns:
        `update(train_state, batch)`, which returns a new train state and a
        metrics dict `{'loss': loss}`.
    """
    def loss_fn(params, batch):
        # Policy-gradient loss over the batch.
        logits = vmap(lambda s: policy_network.apply(params, s).logits)(batch.info_state)
        adv = batch.returns - batch.values
        # rlax.policy_gradient_loss requires a per-step weight argument (w_t);
        # unit weights keep every sample's contribution unscaled.
        weights = jax.numpy.ones_like(adv)
        return vmap(rlax.policy_gradient_loss)(logits, batch.action, adv, weights).mean()

    def update(train_state, batch):
        # Plain policy-gradient loss and gradients for this agent's parameters.
        loss, grads = jax.value_and_grad(loss_fn)(train_state.policy_params[agent_id], batch)
        # LOLA correction term, subtracted leaf-by-leaf from the gradients.
        correction = lola_correction(train_state, batch)
        # jax.tree_map was removed from JAX; jax.tree_util.tree_map is the
        # long-standing equivalent.
        grads = jax.tree_util.tree_map(lambda g, c: g - lola_weight * c, grads, correction)
        # Turn the corrected gradients into parameter updates and apply them.
        updates, opt_state = optimizer(grads, train_state.policy_opt_states[agent_id])
        policy_params = optax.apply_updates(train_state.policy_params[agent_id], updates)
        # NOTE(review): `TrainState(...)` is an elided placeholder here; the
        # real constructor arguments are not shown. It presumably has to carry
        # `policy_params` and `opt_state` computed above - confirm and fill in.
        return TrainState(...), {'loss': loss}
    return update

# Create the Kuhn poker environment and the opponent-shaping agent.
env = rl_environment.Environment("kuhn_poker")
# Constructor arguments are collected first; dict entries are evaluated in
# the order written, so the spec lookups happen in the same sequence.
agent_config = {
    "player_id": 0,
    "opponent_ids": [1],
    "info_state_size": env.observation_spec()["info_state"][0],
    "num_actions": env.action_spec()["num_actions"],
    "policy": get_policy_network(env.action_spec()["num_actions"]),
    "correction_type": "lola",
}
agent = OpponentShapingAgent(**agent_config)

# Training loop: 1000 episodes, each played until the final time step.
# NOTE(review): only this single agent's action is passed to env.step on
# every step - confirm that is what the environment expects for each turn.
for _ in range(1000):
    time_step = env.reset()
    while True:
        if time_step.last():
            break
        agent_output = agent.step(time_step)
        time_step = env.step([agent_output.action])

struct CFRInfoStateValues {
    std::vector<Action> legal_actions;
    std::vector<double> cumulative_regrets; // 累积后悔值
    std::vector<double> cumulative_policy; // 累积策略
    std::vector<double> current_policy; // 当前策略
};

// 应用后悔值匹配更新策略
void CFRInfoStateValues::ApplyRegretMatching() {
    double sum_positive_regrets = 0.0;
    for (int aidx = 0; aidx < num_actions(); ++aidx) {
        if (cumulative_regrets[aidx] > 0) {
            sum_positive_regrets += cumulative_regrets[aidx];
        }
    }
    for (int aidx = 0; aidx < num_actions(); ++aidx) {
        current_policy[aidx] = (sum_positive_regrets > 0) ? std::max(cumulative_regrets[aidx], 0.0) / sum_positive_regrets : 1.0 / legal_actions.size();
    }
}

// Recursively evaluates the subtree rooted at `state` and returns one value
// per player. At terminal states this is state.Returns(); otherwise it is the
// policy-weighted sum of the children's values.
//
// When `alternating_player` is empty, or equals the player to move, the
// cumulative regrets and cumulative policy stored for the node's information
// state are also updated.
//
// `reach_probabilities` is indexed by player id; the acting player's entry is
// multiplied by the action probability on the way down.
//
// NOTE(review): there is no separate branch for chance nodes here, and
// state.CurrentPlayer() is used directly as an index - confirm this is only
// reached for states where that value is a valid player index.
std::vector<double> CFRSolverBase::ComputeCounterFactualRegret(
    const State& state, const absl::optional<int>& alternating_player,
    const std::vector<double>& reach_probabilities) {
    if (state.IsTerminal()) {
        return state.Returns();
    }

    int current_player = state.CurrentPlayer();
    std::string info_state = state.InformationStateString(current_player);
    std::vector<Action> legal_actions = state.LegalActions();

    // Current policy at this information state, one entry per legal action.
    const std::vector<double> policy = GetPolicy(info_state, legal_actions);

    const int num_players = game_->NumPlayers();
    std::vector<double> state_value(num_players, 0.0);

    // Value of each child from the acting player's point of view.
    std::vector<double> child_values;
    child_values.reserve(legal_actions.size());

    for (std::size_t aidx = 0; aidx < legal_actions.size(); ++aidx) {
        auto child = state.Child(legal_actions[aidx]);

        // Only the acting player's reach probability changes for the child.
        std::vector<double> child_reach = reach_probabilities;
        child_reach[current_player] *= policy[aidx];

        const std::vector<double> child_val =
            ComputeCounterFactualRegret(*child, alternating_player, child_reach);

        for (int player = 0; player < num_players; ++player) {
            state_value[player] += policy[aidx] * child_val[player];
        }
        child_values.push_back(child_val[current_player]);
    }

    // Skip the regret update when a different player is being updated.
    if (alternating_player && *alternating_player != current_player) {
        return state_value;
    }

    double cfr_reach = CounterFactualReachProb(reach_probabilities, current_player);
    // NOTE(review): assumes the entry for info_state already has vectors
    // sized to legal_actions (presumably set up by GetPolicy) - TODO confirm.
    auto& is_vals = info_states_[info_state];
    for (std::size_t aidx = 0; aidx < legal_actions.size(); ++aidx) {
        is_vals.cumulative_regrets[aidx] +=
            cfr_reach * (child_values[aidx] - state_value[current_player]);
        is_vals.cumulative_policy[aidx] +=
            reach_probabilities[current_player] * policy[aidx];
    }
    return state_value;
}

# Clone the official OpenSpiel repository and run its install script.
# The steps are chained with && so that a failed clone does not go on to
# run ./install.sh from whatever directory happens to be current.
git clone https://github.com/google-deepmind/open_spiel.git &&
cd open_spiel && ./install.sh

OpenSpiel 进阶教程：用 C++ 与 Python 实现自定义博弈算法

OpenSpiel 进阶教程：用 C++ 与 Python 实现自定义博弈算法

🎮 自定义博弈算法的核心架构

核心组件解析

🐍 Python 实现：基于 JAX 的 LOLA 算法

1. 定义策略网络

2. 实现 LOLA 更新逻辑

3. 运行训练循环

🚀 C++ 实现：经典 CFR 算法

1. 信息状态价值存储

2. 后悔值匹配更新

3. 反事实后悔值计算

🔍 算法调试与可视化

博弈树可视化

多群体博弈分析

📝 实现步骤总结

📚 进阶资源

更多推荐文章

相关免费在线工具

OpenSpiel 进阶教程：用 C++ 与 Python 实现自定义博弈算法

OpenSpiel 进阶教程：用 C++ 与 Python 实现自定义博弈算法

🎮 自定义博弈算法的核心架构

核心组件解析

🐍 Python 实现：基于 JAX 的 LOLA 算法

1. 定义策略网络

2. 实现 LOLA 更新逻辑

3. 运行训练循环

🚀 C++ 实现：经典 CFR 算法

1. 信息状态价值存储

2. 后悔值匹配更新

3. 反事实后悔值计算

🔍 算法调试与可视化

博弈树可视化

多群体博弈分析

📝 实现步骤总结

📚 进阶资源

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具