import numpy as np
import matplotlib.pyplot as plt
def load_dataset():
    """Load the hard-coded toy dataset.

    :return: (feature matrix (m x 3, first column is the bias term 1.0),
              label column vector (m x 1),
              test feature matrix (4 x 3))
    """
    raw_lines = [
        "3.542485\t1.977398\t0",
        "3.018896\t2.556416\t0",
        "7.551510\t-1.580030\t1",
        "2.114999\t-0.004466\t0",
        "8.127113\t1.274372\t1",
        "7.108772\t-0.986906\t1",
        "2.326297\t0.265213\t0",
        "0.207971\t-0.438046\t0",
        "6.332009\t0.469543\t1",
        "6.172788\t-2.044329\t1",
        "3.645780\t3.410627\t0",
        "3.125951\t-0.160513\t0",
        "2.912122\t-0.206010\t0",
        "8.307974\t-0.422311\t1",
        "5.286862\t0.660109\t1"
    ]
    # Each line is "<x1>\t<x2>\t<label>"; prepend a constant 1.0 bias feature.
    fields = [line.strip().split('\t') for line in raw_lines]
    features = [[1.0, float(x1), float(x2)] for x1, x2, _ in fields]
    labels = [int(lab) for _, _, lab in fields]
    test_set = [
        [1.0, 7.635630, 0.215151],
        [1.0, 6.383078, -1.012999],
        [1.0, 7.192221, -0.130088],
        [1.0, 8.348103, 1.071160]
    ]
    return np.asmatrix(features), np.asmatrix(labels).transpose(), np.asmatrix(test_set)
def sigmoid(in_x):
    """Sigmoid activation function.

    :param in_x: scalar or NumPy array/matrix of linear-model outputs
    :return: probability value(s) in (0, 1)
    """
    # Clip the exponent argument: np.exp(x) overflows (RuntimeWarning -> inf)
    # for x > ~709. Sigmoid saturates to 0/1 far before |in_x| = 500, so
    # clipping does not change any representable result.
    return 1.0 / (1.0 + np.exp(-np.clip(in_x, -500.0, 500.0)))
def grad_ascent(data_mat_in, class_labels):
    """Solve for the optimal weights with batch gradient ascent.

    :param data_mat_in: feature matrix (m x n)
    :param class_labels: label matrix (m x 1)
    :return: optimal weight matrix (n x 1)
    """
    features = np.asmatrix(data_mat_in)
    labels = np.asmatrix(class_labels)
    n_features = features.shape[1]
    learning_rate = 0.001  # step size per iteration
    iterations = 500       # fixed iteration budget
    weights = np.ones((n_features, 1))
    for _ in range(iterations):
        # Full-batch prediction, then update along the log-likelihood gradient.
        predictions = sigmoid(features * weights)
        residual = labels - predictions
        weights = weights + learning_rate * features.T * residual
    return weights
def stoc_grad_ascent0(data_mat_in, class_labels):
    """Stochastic gradient ascent: one weight update per sample, single pass.

    :param data_mat_in: indexable feature rows (m x n), e.g. ndarray
    :param class_labels: sequence of m labels (0/1)
    :return: weight matrix (n x 1)
    """
    m, n = np.shape(data_mat_in)
    alpha = 0.01  # larger step than the batch version; each update sees one sample
    weights = np.ones(n)
    for i in range(m):
        # Scalar prediction for sample i (elementwise product, then sum).
        h = sigmoid(sum(data_mat_in[i] * weights))
        error = class_labels[i] - h
        weights = weights + alpha * error * data_mat_in[i]
    # Use np.asmatrix for consistency with the rest of the file; the np.mat
    # alias used previously was removed in NumPy 2.0.
    return np.asmatrix(weights).transpose()
def classify_vector(in_x, weights):
    """Predict the class of a single sample along with its probability.

    :param in_x: single sample feature row (1 x n)
    :param weights: trained weight matrix (n x 1)
    :return: (predicted probability, predicted class 0.0 or 1.0)
    """
    probability = sigmoid(in_x * weights)  # 1x1 matrix
    predicted_label = 1.0 if probability > 0.5 else 0.0
    return probability[0, 0], predicted_label
def plot_best_fit(weights, data_mat, label_mat):
    """Scatter the training samples and draw the learned decision boundary."""
    samples = np.array(data_mat)
    pos_x, pos_y = [], []
    neg_x, neg_y = [], []
    # Columns 1 and 2 are the two real features (column 0 is the bias term).
    for idx in range(samples.shape[0]):
        if int(label_mat[idx]) == 1:
            pos_x.append(samples[idx, 1])
            pos_y.append(samples[idx, 2])
        else:
            neg_x.append(samples[idx, 1])
            neg_y.append(samples[idx, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(pos_x, pos_y, s=30, c='blue', marker='o', label='Class 1')
    ax.scatter(neg_x, neg_y, s=30, c='red', marker='x', label='Class 0')
    # Boundary where w0 + w1*x + w2*y = 0  =>  y = -(w0 + w1*x) / w2.
    xs = np.arange(-1.0, 10.0, 0.1)
    ys = (-weights[0, 0] - weights[1, 0] * xs) / weights[2, 0]
    ax.plot(xs, ys, c='green', label='Decision Boundary')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.legend(loc='upper left')
    plt.title('Logistic Regression Decision Boundary')
    plt.show()
if __name__ == "__main__":
    data_mat, label_mat, test_set = load_dataset()
    n_train = np.shape(data_mat)[0]
    n_test = np.shape(test_set)[0]
    print("数据集加载完成,训练样本数:", n_train)
    print("测试样本数:", n_test)

    # Sanity-check the activation function at a few known points.
    print("\nSigmoid 函数测试:sigmoid(0) =", sigmoid(0))
    print("sigmoid(2) =", sigmoid(2))
    print("sigmoid(-2) =", sigmoid(-2))

    # Train with batch gradient ascent.
    weights = grad_ascent(data_mat, label_mat)
    print("\n批量梯度上升得到的最优权重:")
    print(weights)

    # Classify each held-out sample with the trained weights.
    print("\n测试集预测结果:")
    for i in range(n_test):
        prob, label = classify_vector(test_set[i], weights)
        print(f"测试样本{i+1}:预测概率={prob:.4f},预测类别={int(label)}")

    plot_best_fit(weights, data_mat, label_mat)