Python 3.8+ 环境配置与数据科学工具指南

Python 3.8+ 环境配置与数据科学工具指南 | 极客日志

pip install jupyter notebook

jupyter notebook

pip install numpy pandas matplotlib seaborn scikit-learn

# Sanity check: confirm NumPy imports cleanly and report its version.
import numpy as np
print(np.__version__)  # should print a version string without raising

# 一级标题
## 二级标题
*斜体*或**加粗**文本

conda create -n my_env python=3.8
conda activate my_env

# --- Ways to create an ndarray ---
import numpy as np

# From (nested) Python lists.
arr = np.array([1, 2, 3])  # 1-D array
arr_2d = np.array([[1, 2], [3, 4]])  # 2-D array

# Zero-filled arrays: the shape is an int (1-D) or a tuple (N-D).
zeros_1d = np.zeros(3)  # 1-D array of three zeros
zeros_2d = np.zeros((2, 3))  # 2x3 zero matrix

ones_arr = np.ones((2, 2))  # 2x2 matrix of ones

# arange(start, stop, step): the stop value is excluded.
seq = np.arange(0, 10, 2)  # 0, 2, 4, 6, 8

# linspace(start, stop, num): num evenly spaced values, both ends included.
lin_arr = np.linspace(0, 1, 5)  # [0.0, 0.25, 0.5, 0.75, 1.0]

# Random arrays; the arguments are the dimensions of the result.
rand_arr = np.random.rand(3, 3)  # 3x3 matrix of uniform random values
randn_arr = np.random.randn(2, 2)  # 2x2 matrix of standard-normal values

# --- Basic array operations: shape, reshape, indexing, concatenation ---
arr = np.array([[1, 2], [3, 4]])
print(arr.shape)  # (2, 2)

# Flatten the 2x2 matrix into a 1-D array of four elements.
reshaped = arr.reshape(4)  # [1, 2, 3, 4]

print(arr[0, 1])  # element at row 0, column 1 -> 2
print(arr[:, 1])  # every row of column 1 (the second column) -> [2, 4]

# Join two 1-D arrays end to end.
a = np.array([1, 2])
b = np.array([3, 4])
concat = np.concatenate([a, b])  # [1, 2, 3, 4]

# --- Element-wise arithmetic ---
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
print(a + b)  # element-wise sum -> [5, 7, 9]
print(a * 2)  # the scalar is applied to every element -> [2, 4, 6]

# --- Broadcasting: a (3, 1) column plus a (3,) row yields a (3, 3) grid ---
a = np.array([[1], [2], [3]])
b = np.array([1, 2, 3])
print(a + b)  # [[2, 3, 4], [3, 4, 5], [4, 5, 6]]

# --- Matrix product (not element-wise) ---
mat_a = np.array([[1, 2], [3, 4]])
mat_b = np.array([[5, 6], [7, 8]])
dot_product = np.dot(mat_a, mat_b)  # same as mat_a @ mat_b -> [[19, 22], [43, 50]]

# --- Reductions over a whole array ---
arr = np.array([1, 2, 3, 4])
print(np.sum(arr))  # 10
print(np.mean(arr))  # 2.5
print(np.max(arr))  # 4

# --- Element-wise trigonometry (angles in radians) ---
angles = np.array([0, np.pi/2])
print(np.sin(angles))  # [0.0, 1.0]

# --- Exponential and natural logarithm; plain lists are accepted too ---
print(np.exp([1, 2]))  # approximately [2.718, 7.389]
print(np.log([1, np.e]))  # [0.0, 1.0]

# --- Boolean masking: keep only the elements where the mask is True ---
arr = np.array([1, 2, 3, 4])
mask = arr > 2
print(arr[mask])  # [3, 4]

# --- np.where(cond, a, b): pick a where cond holds, otherwise b ---
arr = np.array([1, -1, 0])
print(np.where(arr > 0, 1, -1))  # [1, -1, -1]  (0 is not > 0, so it maps to -1)

# --- Sorting and de-duplication ---
unsorted = np.array([3, 1, 2])
sorted_arr = np.sort(unsorted)  # returns a sorted copy -> [1, 2, 3]
unique = np.unique([1, 2, 2, 3])  # sorted distinct values -> [1, 2, 3]

# --- Creating Series and DataFrame objects ---
# pandas was never imported by the preceding snippets, so bring it into scope
# here; every `pd.` call below depends on it.
import pandas as pd

# Series from a list with explicit index labels (np.nan marks a missing value).
pd.Series([1, 3, 5, np.nan], index=['a', 'b', 'c', 'd'])

# Series from a dict: the keys become the index.
pd.Series({'a': 1, 'b': 3, 'c': 5})

# Series with an explicit dtype.
pd.Series([1, 2], dtype='float64')

# Series from a scalar: the value is repeated for every index label.
pd.Series(5, index=['a', 'b', 'c'])

# DataFrame from a dict of columns. Bound to a name so the copy example at
# the end of this section has a concrete frame to work on.
existing_df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})

# DataFrame from a 2-D NumPy array with explicit column names.
pd.DataFrame(np.random.randn(3, 4), columns=['A', 'B', 'C', 'D'])

# DataFrame from a structured NumPy array: the field names ('x', 'y')
# become the column names.
pd.DataFrame(np.array([(1, 'a'), (2, 'b')], dtype=[('x', 'i4'), ('y', 'U1')]))

# Independent copy of an existing DataFrame.
pd.DataFrame(existing_df, copy=True)

# --- Advanced indexing (df is an existing DataFrame defined elsewhere) ---

# Boolean-mask selection: keep the rows where column 'A' is positive.
df[df['A'] > 0]

# Position-based slice: row positions 3-4 and column positions 0-1
# (the end of each range is excluded).
df.iloc[3:5, 0:2]

# Label-based slice restricted to columns 'A' and 'B'.
# NOTE(review): presumably df has a date-like index here — confirm.
df.loc['20200101':'20200103', ['A', 'B']]

# Build a two-level index from 'col1' and 'col2', then sort by that index.
df.set_index(['col1', 'col2']).sort_index()

# --- Data cleaning: missing values and duplicates ---
# (df is an existing DataFrame defined elsewhere.)

# Count the missing values in each column.
df.isna().sum()

# Drop every row that has a missing value in 'col1' or 'col2'.
df.dropna(axis=0, how='any', subset=['col1', 'col2'])

# Replace missing values with a constant.
df.fillna(value=0)

# Forward fill: propagate the last valid value downwards.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# supported spelling.
df.ffill()

# Backward fill: propagate the next valid value upwards.
df.bfill()

# Replace missing values with the column mean. `numeric_only=True` keeps this
# working when df also holds non-numeric columns, which would otherwise make
# `mean()` raise on pandas >= 2.0.
df.fillna(df.mean(numeric_only=True))

# Fill missing values by interpolation (linear by default).
df.interpolate()

# Drop fully duplicated rows, keeping the first occurrence.
df.drop_duplicates()

# Treat rows as duplicates when they agree on 'col1' only.
df.drop_duplicates(subset=['col1'])

# Keep the last occurrence of each duplicate instead of the first.
df.drop_duplicates(keep='last')

# Boolean Series flagging the rows that repeat an earlier row.
df.duplicated()

# Drop every row whose 'col1' value occurs more than once (keep none of them).
df.drop_duplicates(subset=['col1'], keep=False)

# --- Aggregation with groupby ---
# (df is an existing DataFrame defined elsewhere.)

# Sum of every column within each group of 'A'.
df.groupby('A').sum()

# Mean within each ('A', 'B') combination.
df.groupby(['A', 'B']).mean()

# Several aggregations at once: one result column per (column, function).
df.groupby('A').agg(['sum', 'mean', 'std'])

# Per-column aggregations: total of 'B' and the value range of 'C'.
# The Series methods are used instead of the builtin max()/min() so that
# missing values are skipped rather than poisoning the comparison.
df.groupby('A').agg({'B': 'sum', 'C': lambda x: x.max() - x.min()})

# Keep only the groups whose mean of 'B' is positive.
df.groupby('A').filter(lambda x: x['B'].mean() > 0)

# --- pivot_table ---

# Rows indexed by ('A', 'B'), one column per value of 'C', aggregating 'D'
# (mean is the default aggregation).
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

# Aggregate two value columns at once.
pd.pivot_table(df, values=['D', 'E'], index='A', columns='C')

# Several aggregation functions. The string 'sum' is passed instead of
# np.sum: recent pandas warns when a NumPy reduction is given as aggfunc and
# recommends the string alias.
pd.pivot_table(df, values='D', index='A', aggfunc=['sum', len])

# Replace missing cells of the result with 0.
pd.pivot_table(df, values='D', index='A', fill_value=0)

# Add an 'All' row/column holding the overall aggregate.
pd.pivot_table(df, values='D', index='A', margins=True)

# --- Time-series handling (df is an existing DataFrame defined elsewhere) ---

# Parse a column into datetime values.
pd.to_datetime(df['date_col'])

# Extract the year component through the .dt accessor.
df['datetime_col'].dt.year

# Shift every timestamp forward by one day.
df['datetime_col'] + pd.Timedelta(days=1)

# --- Resampling ---
# NOTE(review): resample() presumably relies on df having a datetime-like
# index — confirm. Also check the target pandas version: 2.2+ deprecates the
# 'M' and 'Q' aliases in favour of 'ME' and 'QE'.

# Downsample to monthly frequency, averaging within each month.
df.resample('M').mean()

# Resample to daily frequency, forward-filling the gaps.
df.resample('D').ffill()

# Quarterly frequency with a different aggregation per column.
df.resample('Q').agg({'A':'sum', 'B':'mean'})

# Mean over a sliding window of 3 rows.
df.rolling(window=3).mean()

# Cumulative sum over an expanding window.
df.expanding().sum()

# --- Combining DataFrames (df1, df2 are defined elsewhere) ---

# Stack the rows of df2 underneath df1.
pd.concat([df1, df2], axis=0)

# Join on a shared column named 'key' (inner join by default).
pd.merge(df1, df2, on='key')

# Join on several key columns, naming them separately for each side.
pd.merge(df1, df2, left_on=['key1', 'key2'], right_on=['key1', 'key2'])

# Outer join: keep the keys from both sides.
pd.merge(df1, df2, how='outer')

# Index-based left join: keep every row of df1.
df1.join(df2, how='left')

# --- Reshaping (df is defined elsewhere) ---

# Wide to long: 'A' stays as an identifier, 'B' and 'C' are unpivoted.
pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

# Long to wide: one row per 'date', one column per 'variable'.
df.pivot(index='date', columns='variable', values='value')

# Frequency table of the 'A' values against the 'B' values.
pd.crosstab(df['A'], df['B'])

# One-hot encode a categorical column.
pd.get_dummies(df['category_col'])

# --- Performance tips (df is an existing DataFrame defined elsewhere) ---
# gc is needed by the memory clean-up at the end of this section; it was not
# imported by any earlier snippet.
import gc

# Vectorised column arithmetic: the preferred way to derive a column.
df['new_col'] = df['A'] * 2

# Row-wise apply: the lambda is called once per row.
df.apply(lambda x: x['A'] * 2, axis=1)

# Evaluate an expression over columns given as a string.
df.eval('A + B')

# Convert column 'A' to a narrower integer type.
df.astype({'A': 'int32'})

# Read a large CSV lazily: returns an iterator of 10000-row chunks instead
# of loading the whole file.
pd.read_csv('large.csv', chunksize=10000)

# --- Memory management ---

# Per-column memory footprint; deep=True also measures object contents.
df.memory_usage(deep=True)

# Convert to the categorical dtype.
df.astype('category')

# Store a column as a sparse array.
pd.arrays.SparseArray(df['col'])

# Drop the reference to the frame, then ask the garbage collector to run.
del df
gc.collect()

# --- Line chart with plt.plot() ---
import matplotlib.pyplot as plt
import numpy as np
# Sample data
x = np.linspace(0, 10, 100)  # 100 evenly spaced values from 0 to 10
y = np.sin(x)  # sine of each x value
# Draw the line
plt.plot(x, y, color='red',  # line colour
         linestyle='--',  # dashed line
         linewidth=2,  # line width
         marker='o',  # marker drawn at each data point
         markersize=5,  # marker size
         label='sin(x)'  # legend label
)
# Title and axis labels
plt.title('Sine Wave Example', fontsize=14)
plt.xlabel('X-axis', fontsize=12)
plt.ylabel('Y-axis', fontsize=12)
# Show the legend and a light dotted grid
plt.legend(loc='upper right')
plt.grid(True, linestyle=':', alpha=0.5)
# Render the figure
plt.show()

# --- Scatter plot with a fitted trend line ---
# Random data: y is 2*x plus noise; the seed is fixed so runs are repeatable.
np.random.seed(42)
x = np.random.randn(50)
y = x * 2 + np.random.randn(50) * 0.5
# Draw the scatter plot
plt.scatter(x, y, c='blue',  # marker colour
            s=80,  # marker size
            alpha=0.6,  # transparency
            edgecolors='black',  # marker edge colour
            label='Data Points'  # legend label
)
# Add the regression line: fit a degree-1 polynomial to (x, y) and evaluate
# it on the sorted unique x values so the line is drawn left to right.
plt.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), color='red', label='Trend Line')
# Titles, axis labels and legend
plt.title('Scatter Plot with Trend Line', fontsize=14)
plt.xlabel('Independent Variable', fontsize=12)
plt.ylabel('Dependent Variable', fontsize=12)
plt.legend()
plt.show()

# --- Several plots in one figure with plt.subplots() ---
# Create a 2x2 grid of axes.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
# Subplot 1: line chart.
# Use an evenly spaced, increasing grid for the sine curve. The `x` left over
# from the scatter example is an unsorted random sample, and drawing a line
# through it would zigzag across the panel instead of tracing a wave.
x_wave = np.linspace(0, 10, 100)
axes[0, 0].plot(x_wave, np.sin(x_wave), color='green')
axes[0, 0].set_title('Subplot 1: Sine Wave')
# Subplot 2: scatter plot of the x / y sample from the scatter example.
axes[0, 1].scatter(x, y, color='orange')
axes[0, 1].set_title('Subplot 2: Scatter Plot')
# Subplot 3: bar chart.
axes[1, 0].bar(['A', 'B', 'C'], [3, 7, 4], color='purple')
axes[1, 0].set_title('Subplot 3: Bar Chart')
# Subplot 4: histogram of 1000 standard-normal draws in 20 bins.
axes[1, 1].hist(np.random.randn(1000), bins=20, color='pink')
axes[1, 1].set_title('Subplot 4: Histogram')
# Adjust the spacing so titles and tick labels do not overlap.
plt.tight_layout()
plt.show()

# --- Bar chart with plt.bar() ---
categories = ['Category A', 'Category B', 'Category C']
values = [12, 24, 8]
# One fill colour per bar, with a black outline.
plt.bar(categories, values, color=['#FF5733', '#33FF57', '#3357FF'], edgecolor='black', width=0.6)
plt.title('Bar Chart Example')
plt.xticks(rotation=45)  # rotate the x-axis tick labels
plt.show()

# --- Pie chart with plt.pie() ---
labels = ['Apples', 'Oranges', 'Bananas']
sizes = [30, 45, 25]
explode = (0.1, 0, 0)  # pull the first wedge out to highlight it
plt.pie(sizes, labels=labels, explode=explode, autopct='%1.1f%%',  # show percentages with one decimal
        shadow=True, startangle=90)
plt.title('Pie Chart Example')
plt.show()

# --- Box plot with plt.boxplot() ---
# Three samples of 100 normal draws with standard deviation 1, 2 and 3.
data = [np.random.normal(0, std, 100) for std in range(1, 4)]
plt.boxplot(data, notch=True,  # notch shows the confidence interval of the median
            patch_artist=True, boxprops=dict(facecolor='lightblue'))
plt.title('Boxplot Example')
# Replace the default tick labels 1..3 with group names.
plt.xticks([1, 2, 3], ['Group 1', 'Group 2', 'Group 3'])
plt.show()

# Apply a built-in style sheet such as 'ggplot'.
# NOTE(review): the seaborn-derived style sheets are named 'seaborn-v0_8*'
# since Matplotlib 3.6; the bare 'seaborn' name is deprecated — confirm the
# target Matplotlib version before recommending it.
plt.style.use('ggplot')

# Save at high resolution; bbox_inches='tight' trims the surrounding whitespace.
plt.savefig('output.png', dpi=300, bbox_inches='tight')

# --- Histograms with sns.histplot() ---
import seaborn as sns
import matplotlib.pyplot as plt
# Load the example dataset
tips = sns.load_dataset("tips")
# Basic histogram of the bill amounts
sns.histplot(data=tips, x="total_bill")
plt.title("Total Bill Distribution")
plt.show()
# Overlay a kernel density estimate
sns.histplot(data=tips, x="total_bill", kde=True)
plt.title("With KDE Estimation")
plt.show()
# One histogram per value of "sex", drawn as step outlines
sns.histplot(data=tips, x="total_bill", hue="sex", element="step")
plt.title("Grouped by Gender")
plt.show()

# --- Box plots with sns.boxplot() (tips is the dataset loaded earlier) ---
# Basic box plot: bill distribution for each day
sns.boxplot(data=tips, x="day", y="total_bill")
plt.title("Daily Bill Distribution")
plt.show()
# Add a second grouping dimension through hue
sns.boxplot(data=tips, x="day", y="total_bill", hue="smoker")
plt.title("With Smoking Status")
plt.show()
# Horizontal box plot: swap the axes and set orient="h"
sns.boxplot(data=tips, y="day", x="total_bill", orient="h")
plt.title("Horizontal Orientation")
plt.show()

# --- Scatter matrix with sns.pairplot() (tips is the dataset loaded earlier) ---
# Basic scatter matrix
sns.pairplot(data=tips)
plt.suptitle("Pairwise Relationships")
plt.show()
# Colour the points by "time"; corner=True draws only the lower triangle
sns.pairplot(data=tips, hue="time", corner=True)
plt.suptitle("Colored by Meal Time")
plt.show()
# Customise the diagonal: KDE curves, semi-transparent points, 2.5-inch facets
sns.pairplot(
    data=tips, diag_kind="kde", plot_kws={"alpha": 0.6}, height=2.5
)
plt.suptitle("Custom Diagonal Plots")
plt.show()

# --- Correlation heatmap with sns.heatmap() (tips is the dataset loaded earlier) ---
# Correlation matrix of the numeric columns only. The tips dataset also
# carries non-numeric columns (sex, smoker, day, time), and pandas >= 2.0
# raises when corr() meets them, so select the numeric columns first. This
# spelling also works on older pandas releases.
corr = tips.select_dtypes(include="number").corr()
# Basic heatmap with each coefficient printed to two decimals
sns.heatmap(corr, annot=True, fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()
# Styled heatmap: diverging palette centred on 0 with thin cell borders.
# annot=True is required for annot_kws to have any effect.
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    center=0,
    linewidths=.5,
    annot_kws={"size": 10},
)
plt.title("Styled Heatmap")
plt.show()

# --- Further plot types (tips is the dataset loaded earlier) ---

# Violin plot: bill distribution per day, with quartile lines drawn inside.
sns.violinplot(data=tips, x="day", y="total_bill", inner="quartile")
plt.title("Violin Plot Example")
plt.show()

# Facet grid: one panel per (smoker, time) combination, each showing
# tip against total_bill.
g = sns.FacetGrid(tips, col="time", row="smoker")
g.map(sns.scatterplot, "total_bill", "tip")
g.add_legend()
plt.show()

# Regression plot: one fitted line per smoker group, with a distinct marker
# for each group.
sns.lmplot(
    data=tips, x="total_bill", y="tip", hue="smoker", markers=["o", "x"]
)
plt.title("Regression Plot")
plt.show()

# Empirical cumulative distribution of the bill, split by meal time.
sns.ecdfplot(data=tips, x="total_bill", hue="time")
plt.title("ECDF Plot")
plt.show()

# --- Style and theme settings ---
# Overall style and default colour palette
sns.set_style("whitegrid")
sns.set_palette("husl")
# Context, which affects how plot elements are scaled;
# the alternatives are "paper", "talk" and "poster"
sns.set_context("notebook")
# Custom palette built from explicit hex colours
custom_palette = sns.color_palette(["#9b59b6", "#3498db", "#95a5a6"])
sns.set_palette(custom_palette)

# --- Saving a figure (tips is the dataset loaded earlier) ---
# Create a 10x6 inch figure, draw the histogram on it, then write it to disk
# at 300 dpi with the surrounding whitespace trimmed.
plt.figure(figsize=(10, 6))
sns.histplot(data=tips, x="total_bill")
plt.savefig("histogram.png", dpi=300, bbox_inches="tight")

Python 3.8+ 环境配置与数据科学工具指南

Python 环境配置与外部库

环境配置与工具准备

安装 Python 3.8+ 版本

配置 Jupyter Notebook

安装常用数据科学库

Jupyter Notebook 基础操作

环境管理（可选）

调试与帮助

数据处理与科学计算

NumPy 基础

多维数组对象 ndarray 的创建方法

数组的基本操作

数组运算与广播机制

常用数学函数

数组的高级操作

Pandas 核心功能

Series 创建

DataFrame 创建

高级索引技术

数据清洗：处理缺失值与去重

高级去重操作

数据聚合与透视

pivot_table 深度使用

时间序列处理

时间序列重采样

数据合并与连接

数据变形

性能优化技巧

内存管理

数据可视化

折线图绘制（plt.plot()）

散点图绘制（plt.scatter()）

子图绘制（plt.subplots()）

扩展绘图类型

柱状图（plt.bar()）

饼图（plt.pie()）

箱线图（plt.boxplot()）

样式高级调整

Seaborn 高级可视化统计图形详解

直方图（sns.histplot()）

箱线图（sns.boxplot()）

散点矩阵（sns.pairplot()）

热力图（sns.heatmap()）

进阶图形扩展

小提琴图（sns.violinplot()）

分面网格（sns.FacetGrid）

回归图（sns.lmplot()）

分布对比图（sns.ecdfplot()）

样式与主题设置

图形保存

学习资源推荐

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具