PythonAI算法
Python 数据统计指南:从基础配置到高级分析
Python 数据统计涵盖环境配置、数据获取、清洗预处理及描述性与高级统计分析。介绍 pandas、numpy、scipy 等核心库的使用,展示从 CSV/Excel/SQL/API 加载数据的方法,包含缺失值处理、异常值检测、分类编码及标准化技术。通过 Iris 数据集示例,演示基本统计量计算、分布检验、相关性分析及置信区间构建,提供完整的数据分析代码框架。

本文将按上述流程依次展开:先介绍环境配置与多种数据源的加载方式,再给出数据清洗与转换的通用工具类,最后结合 Iris 数据集演示描述性统计与高级统计分析的完整代码。


Python 在数据统计领域的强大主要得益于其丰富的库生态系统:
# 核心数据分析库
import pandas as pd
import numpy as np
# 数据可视化库
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# 统计分析库
import scipy.stats as stats
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
# 机器学习库
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# 其他实用库
import warnings
warnings.filterwarnings('ignore')
# 推荐使用 conda 或 pip 安装必要包
"""
pip install pandas numpy matplotlib seaborn plotly
pip install scipy statsmodels scikit-learn
pip install jupyter notebook
# 交互式环境
"""
# Configure Matplotlib so CJK text renders (SimHei font).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Without this, Chinese axis labels show as empty boxes.
plt.rcParams['axes.unicode_minus'] = False
# Keep the ASCII minus sign rendering correctly with a CJK font active.
# Default plot styling for the rest of the article.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
import pandas as pd
import numpy as np
import sqlite3
import requests
import json
class DataLoader:
    """Load tabular data from CSV / Excel / SQLite / HTTP JSON APIs.

    Every successfully loaded frame is cached in ``data_sources`` keyed by
    its source kind ('csv', 'excel', 'sql', 'api').

    NOTE(review): ``load_sql`` and ``load_api`` were reconstructed from a
    garbled source — confirm the method names and cache keys against the
    original article.
    """

    def __init__(self):
        # Cache of the most recent frame per source kind.
        self.data_sources = {}

    def load_csv(self, file_path, **kwargs):
        """Load a CSV file; returns the DataFrame, or None on failure."""
        try:
            df = pd.read_csv(file_path, **kwargs)
            self.data_sources['csv'] = df
            print(f"成功加载 CSV 文件,数据形状:{df.shape}")
            return df
        except Exception as e:
            print(f"加载 CSV 文件失败:{e}")
            return None

    def load_excel(self, file_path, sheet_name=0):
        """Load one sheet of an Excel workbook; None on failure."""
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            self.data_sources['excel'] = df
            print(f"成功加载 Excel 文件,数据形状:{df.shape}")
            return df
        except Exception as e:
            print(f"加载 Excel 文件失败:{e}")
            return None

    def load_sql(self, db_path, query):
        """Run *query* against a SQLite database file and return the result frame."""
        try:
            conn = sqlite3.connect(db_path)
            try:
                df = pd.read_sql_query(query, conn)
            finally:
                # Always release the file handle, even if the query fails.
                conn.close()
            self.data_sources['sql'] = df
            print(f"成功加载 SQL 数据,数据形状:{df.shape}")
            return df
        except Exception as e:
            print(f"加载 SQL 数据失败:{e}")
            return None

    def load_api(self, url, params=None):
        """GET a JSON API and convert the payload to a DataFrame; None on failure."""
        try:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                data = response.json()
                df = pd.DataFrame(data)
                self.data_sources['api'] = df
                print(f"成功加载 API 数据,数据形状:{df.shape}")
                return df
            else:
                print(f"API 请求失败,状态码:{response.status_code}")
                return None
        except Exception as e:
            print(f"加载 API 数据失败:{e}")
            return None
# Demo: load the classic Iris dataset into a DataFrame.
loader = DataLoader()
from sklearn.datasets import load_iris

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# NOTE(review): the target column name was lost in the garbled source
# (`iris_df[...] = iris.target`); 'target' is the conventional choice — confirm.
iris_df['target'] = iris.target
def explore_data(df, sample_size=5):
    """Print a broad overview of *df* and return its key summary objects.

    Prints shape, dtypes, head/tail previews, the numeric summary, a
    missing-value table and the unique-value count of every object column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore.
    sample_size : int
        Number of rows shown in the head/tail previews.

    Returns
    -------
    dict with keys 'shape', 'dtypes' and 'missing_info'.
    """
    print("=" * 50)
    print("数据集基本信息探索")
    print("=" * 50)
    # Basic shape information.
    print(f"数据形状:{df.shape}")
    print(f"行数:{df.shape[0]}")
    print(f"列数:{df.shape[1]}")
    # Column dtypes.
    print("\n数据类型信息:")
    print(df.dtypes)
    # Quick previews.
    print(f"\n前{sample_size}行数据:")
    print(df.head(sample_size))
    print(f"\n后{sample_size}行数据:")
    print(df.tail(sample_size))
    # Numeric summary.
    print("\n数值列统计摘要:")
    print(df.describe())
    # Missing-value table (count and percentage per column).
    print("\n缺失值统计:")
    missing_info = pd.DataFrame({
        '缺失数量': df.isnull().sum(),
        '缺失比例': df.isnull().sum() / len(df) * 100
    })
    print(missing_info)
    # Unique values of the categorical (object) columns.
    # NOTE(review): this tail was reconstructed from a garbled source.
    print("\n分类列唯一值统计:")
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        print(f"{col}: {df[col].nunique()} 个唯一值")
    return {
        'shape': df.shape,
        'dtypes': df.dtypes,
        'missing_info': missing_info
    }
# Run the full exploration report on the Iris frame; keep the summary dict.
info = explore_data(iris_df)
class DataCleaner:
    """Clean a DataFrame: missing values, duplicate rows and outliers.

    Works on an internal copy (the caller's frame is never mutated) and
    records every cleaning step in ``cleaning_log``.

    NOTE(review): several method bodies were reconstructed from a garbled
    source — verify fill strategies and log wording against the original.
    """

    def __init__(self, df):
        # Copy so the caller's frame is never mutated.
        self.df = df.copy()
        self.cleaning_log = []

    def detect_missing_values(self):
        """Return (per-column missing stats frame, columns with >50% missing)."""
        missing_stats = pd.DataFrame({
            'missing_count': self.df.isnull().sum(),
            'missing_percentage': (self.df.isnull().sum() / len(self.df)) * 100,
            'data_type': self.df.dtypes
        })
        # Columns that are mostly empty are candidates for dropping.
        high_missing_cols = missing_stats[missing_stats['missing_percentage'] > 50].index.tolist()
        self.cleaning_log.append({'step': '缺失值检测', 'details': f"发现 {len(high_missing_cols)} 个高缺失率列 (>50%)"})
        return missing_stats, high_missing_cols

    def handle_missing_values(self, strategy='auto', custom_strategy=None):
        """Fill missing values and drop mostly-empty columns.

        strategy='auto': numeric columns get the median, others the mode.
        strategy='custom': per-column fill values taken from *custom_strategy*.
        Returns the cleaned frame (also stored back into ``self.df``).
        """
        df_clean = self.df.copy()
        missing_stats, high_missing_cols = self.detect_missing_values()
        # Drop columns that are mostly missing — imputing them would be noise.
        if high_missing_cols:
            df_clean = df_clean.drop(columns=high_missing_cols)
            self.cleaning_log.append({'step': '删除高缺失率列', 'columns': high_missing_cols})
        for col in df_clean.columns:
            if df_clean[col].isnull().sum() > 0:
                method = None
                if strategy == 'auto':
                    if df_clean[col].dtype in ['int64', 'float64']:
                        # Median is robust to the outliers handled later.
                        fill_value = df_clean[col].median()
                        # Plain assignment instead of chained-assignment
                        # fillna(inplace=True), which breaks under pandas CoW.
                        df_clean[col] = df_clean[col].fillna(fill_value)
                        method = '中位数填充'
                    else:
                        # Mode can be empty when the column is all-NaN.
                        fill_value = df_clean[col].mode()[0] if not df_clean[col].mode().empty else 'Unknown'
                        df_clean[col] = df_clean[col].fillna(fill_value)
                        method = '众数填充'
                elif strategy == 'custom' and custom_strategy:
                    if col in custom_strategy:
                        fill_value = custom_strategy[col]
                        df_clean[col] = df_clean[col].fillna(fill_value)
                        method = '自定义填充'
                self.cleaning_log.append({
                    'step': '缺失值填充',
                    'column': col,
                    'method': method,
                    'filled_count': self.df[col].isnull().sum()
                })
        self.df = df_clean
        return df_clean

    def remove_duplicates(self):
        """Drop exact duplicate rows; log how many were removed."""
        initial_count = len(self.df)
        self.df = self.df.drop_duplicates()
        removed_count = initial_count - len(self.df)
        self.cleaning_log.append({
            'step': '删除重复行',
            'removed': removed_count,
            'remaining': len(self.df)
        })
        return self.df

    def handle_outliers(self, method='iqr', threshold=3):
        """Tame outliers in every numeric column.

        method='iqr': clip values to the 1.5*IQR fences (winsorize).
        method='zscore': replace points with |z| > *threshold* by the median.
        """
        df_clean = self.df.copy()
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        outliers_info = {}
        for col in numeric_cols:
            outlier_count = 0
            if method == 'iqr':
                Q1 = df_clean[col].quantile(0.25)
                Q3 = df_clean[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)]
                outlier_count = len(outliers)
                # Clip to the fences instead of dropping rows.
                df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
                df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
            elif method == 'zscore':
                z_scores = np.abs(stats.zscore(df_clean[col]))
                outlier_count = len(df_clean[z_scores > threshold])
                median = df_clean[col].median()
                # Replace extreme points with the robust center.
                df_clean[col] = np.where(z_scores > threshold, median, df_clean[col])
            outliers_info[col] = outlier_count
        self.cleaning_log.append({'step': '异常值处理', 'method': method, 'outliers': outliers_info})
        self.df = df_clean
        return df_clean

    def get_cleaning_report(self):
        """Pretty-print the accumulated cleaning log."""
        print("数据清洗报告")
        print("=" * 50)
        for log in self.cleaning_log:
            print(f"步骤: {log['step']}")
            for key, value in log.items():
                if key != 'step':
                    print(f"  {key}: {value}")
            print()
# Build a synthetic frame to exercise DataCleaner.
# NOTE(review): the literal seed, sizes and injected values below were
# reconstructed from a garbled source — conventional choices, not confirmed.
np.random.seed(42)
test_data = pd.DataFrame({
    'A': np.random.normal(0, 1, 100),
    'B': np.random.normal(50, 10, 100),
    'C': np.random.choice(['X', 'Y', 'Z'], 100),
    'D': np.random.exponential(1, 100)
})
# Inject missing values and a few extreme points to clean up.
test_data.loc[10:20, 'A'] = np.nan
test_data.loc[30:35, 'B'] = np.nan
test_data.loc[5, 'D'] = 100
test_data.loc[15, 'B'] = 200
cleaner = DataCleaner(test_data)
cleaned_data = cleaner.handle_missing_values()
cleaned_data = cleaner.remove_duplicates()
cleaned_data = cleaner.handle_outliers()
cleaner.get_cleaning_report()
class DataTransformer:
    """Encode, scale and derive features on a DataFrame copy.

    All transformations are logged in ``transformation_log``.

    NOTE(review): ``scale_numerical`` and ``create_features`` were
    reconstructed from a garbled source — verify parameter defaults and
    feature names against the original article.
    """

    def __init__(self, df):
        # Copy so the caller's frame is never mutated.
        self.df = df.copy()
        self.transformation_log = []

    def encode_categorical(self, columns=None, method='onehot'):
        """Encode categorical columns.

        method='onehot': expand each column into dummy columns.
        method='label': integer-encode categories (sklearn LabelEncoder).
        method='target': replace categories by the mean of a 'target' column,
        only when that column exists.
        """
        df_encoded = self.df.copy()
        if columns is None:
            categorical_cols = df_encoded.select_dtypes(include=['object']).columns
        else:
            categorical_cols = columns
        for col in categorical_cols:
            encoding_type = None
            if method == 'onehot':
                # One-Hot 编码
                dummies = pd.get_dummies(df_encoded[col], prefix=col)
                df_encoded = pd.concat([df_encoded, dummies], axis=1)
                df_encoded.drop(col, axis=1, inplace=True)
                encoding_type = "One-Hot 编码"
            elif method == 'label':
                # 标签编码 — import kept local so onehot use needs no sklearn.
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                df_encoded[col] = le.fit_transform(df_encoded[col])
                encoding_type = "标签编码"
            elif method == 'target':
                # 目标编码(需要目标变量)
                if 'target' in df_encoded.columns:
                    target_mean = df_encoded.groupby(col)['target'].mean()
                    # Replace each category with its mean target value.
                    df_encoded[col] = df_encoded[col].map(target_mean)
                    encoding_type = "目标编码"
            self.transformation_log.append({'step': '分类编码', 'column': col, 'method': encoding_type})
        self.df = df_encoded
        return df_encoded

    def scale_numerical(self, columns=None, method='standard'):
        """Scale numeric columns; returns (scaled frame, fitted scaler or None)."""
        from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
        df_scaled = self.df.copy()
        if columns is None:
            numerical_cols = df_scaled.select_dtypes(include=[np.number]).columns
        else:
            numerical_cols = columns
        scaler = None
        scaling_type = None
        if method == 'standard':
            scaler = StandardScaler()
            scaling_type = '标准化 (z-score)'
        elif method == 'minmax':
            scaler = MinMaxScaler()
            scaling_type = '归一化 (min-max)'
        elif method == 'robust':
            scaler = RobustScaler()
            scaling_type = '鲁棒缩放'
        if scaler:
            df_scaled[numerical_cols] = scaler.fit_transform(df_scaled[numerical_cols])
            self.transformation_log.append({
                'step': '数值缩放',
                'columns': list(numerical_cols),
                'method': scaling_type
            })
        self.df = df_scaled
        return df_scaled, scaler

    def create_features(self):
        """Add pairwise-interaction, z-score and rank features for numeric columns."""
        df_featured = self.df.copy()
        numerical_cols = df_featured.select_dtypes(include=[np.number]).columns
        from sklearn.preprocessing import PolynomialFeatures
        if len(numerical_cols) >= 2:
            # Interaction terms of the first two numeric columns only.
            poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
            poly_features = poly.fit_transform(df_featured[numerical_cols[:2]])
            poly_feature_names = poly.get_feature_names_out(numerical_cols[:2])
            # Keep the original index so concat aligns row-wise.
            poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_featured.index)
            df_featured = pd.concat([df_featured, poly_df], axis=1)
            self.transformation_log.append({
                'step': '特征构造',
                'method': '二阶交互特征',
                'features': list(poly_feature_names)
            })
        for col in numerical_cols:
            df_featured[f'{col}_zscore'] = stats.zscore(df_featured[col])
            df_featured[f'{col}_rank'] = df_featured[col].rank()
        self.transformation_log.append({
            'step': '特征构造',
            'method': 'z-score 与排名特征',
            'features': [f'{col}_zscore' for col in numerical_cols] + [f'{col}_rank' for col in numerical_cols]
        })
        self.df = df_featured
        return df_featured
# Demo: standard-scale the Iris features, then derive extra statistical features.
transformer = DataTransformer(iris_df)
transformed_data, scaler = transformer.scale_numerical(method='standard')
transformer.create_features()
class DescriptiveStatistics:
    """Descriptive statistics for a DataFrame: summaries, distribution
    checks and correlation matrices.

    NOTE(review): the bodies of ``categorical_stats``, ``distribution_test``
    and ``generate_report`` were reconstructed from a garbled source —
    verify key names and print wording against the original.
    """

    def __init__(self, df):
        self.df = df
        self.numerical_cols = df.select_dtypes(include=[np.number]).columns
        self.categorical_cols = df.select_dtypes(include=['object']).columns

    def basic_stats(self):
        """Per-column summary statistics for numeric columns (rows = columns)."""
        stats_summary = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            stats_summary[col] = {
                'count': len(data),
                'mean': np.mean(data),
                'median': np.median(data),
                'std': np.std(data),
                'variance': np.var(data),
                'min': np.min(data),
                'max': np.max(data),
                'range': np.max(data) - np.min(data),
                'q1': np.percentile(data, 25),
                'q3': np.percentile(data, 75),
                'iqr': np.percentile(data, 75) - np.percentile(data, 25),
                'skewness': stats.skew(data),
                'kurtosis': stats.kurtosis(data),
                # Coefficient of variation; undefined for zero mean.
                'cv': (np.std(data) / np.mean(data)) * 100 if np.mean(data) != 0 else np.inf
            }
        return pd.DataFrame(stats_summary).T

    def categorical_stats(self):
        """Per-column summary for object columns: counts, mode, entropy."""
        cat_stats = {}
        for col in self.categorical_cols:
            data = self.df[col].dropna()
            value_counts = data.value_counts()
            cat_stats[col] = {
                'count': len(data),
                'unique': len(value_counts),
                'top': value_counts.index[0] if len(value_counts) > 0 else None,
                'freq': value_counts.iloc[0] if len(value_counts) > 0 else 0,
                'top_percentage': (value_counts.iloc[0] / len(data)) * 100 if len(value_counts) > 0 else 0,
                # Shannon entropy of the category distribution.
                'entropy': stats.entropy(value_counts)
            }
        return pd.DataFrame(cat_stats).T

    def distribution_test(self):
        """Shapiro-Wilk and D'Agostino normality tests per numeric column."""
        distribution_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            # Shapiro-Wilk is only reliable for moderate sample sizes.
            shapiro_stat, shapiro_p = stats.shapiro(data) if len(data) < 5000 else (np.nan, np.nan)
            normaltest_stat, normaltest_p = stats.normaltest(data)
            distribution_results[col] = {
                'shapiro_stat': shapiro_stat,
                'shapiro_p': shapiro_p,
                'normaltest_stat': normaltest_stat,
                'normaltest_p': normaltest_p,
                'is_normal_shapiro': shapiro_p > 0.05 if not np.isnan(shapiro_p) else None,
                'is_normal_normaltest': normaltest_p > 0.05
            }
        return pd.DataFrame(distribution_results).T

    def correlation_analysis(self):
        """Pearson / Spearman / Kendall correlation matrices of numeric columns."""
        pearson_corr = self.df[self.numerical_cols].corr(method='pearson')
        spearman_corr = self.df[self.numerical_cols].corr(method='spearman')
        kendall_corr = self.df[self.numerical_cols].corr(method='kendall')
        return {'pearson': pearson_corr, 'spearman': spearman_corr, 'kendall': kendall_corr}

    def generate_report(self):
        """Print every section and return all results in one dict."""
        print("描述性统计报告")
        print("=" * 50)
        print("\n基本统计量:")
        basic_stats_df = self.basic_stats()
        print(basic_stats_df.round(4))
        # Guard so cat_stats_df is always defined, even without object columns.
        cat_stats_df = None
        if len(self.categorical_cols) > 0:
            print("\n分类变量统计:")
            cat_stats_df = self.categorical_stats()
            print(cat_stats_df.round(4))
        print("\n分布检验:")
        dist_test_df = self.distribution_test()
        print(dist_test_df.round(4))
        print("\n相关性分析 (Pearson):")
        corr_results = self.correlation_analysis()
        print(corr_results['pearson'].round(4))
        return {
            'basic_stats': basic_stats_df,
            'categorical_stats': cat_stats_df if len(self.categorical_cols) > 0 else None,
            'distribution_tests': dist_test_df,
            'correlations': corr_results
        }
# Build and print the descriptive-statistics report for the Iris frame.
desc_stats = DescriptiveStatistics(iris_df)
report = desc_stats.generate_report()
class AdvancedStatistics:
    """Advanced statistics: multi-method outlier detection, a battery of
    normality tests, and t / bootstrap confidence intervals.

    NOTE(review): the modified z-score constants (0.6745, cutoff 3.5), the
    reconstructed method bodies and the report wording come from a garbled
    source — verify against the original article.
    """

    def __init__(self, df):
        self.df = df
        self.numerical_cols = df.select_dtypes(include=[np.number]).columns

    def outlier_detection(self, method='multiple'):
        """Detect outliers per numeric column with IQR, z-score and modified z-score."""
        outlier_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            outliers = {}
            # IQR 方法
            Q1 = np.percentile(data, 25)
            Q3 = np.percentile(data, 75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            iqr_outliers = data[(data < lower_bound) | (data > upper_bound)]
            outliers['iqr'] = {
                'count': len(iqr_outliers),
                'percentage': (len(iqr_outliers) / len(data)) * 100,
                'values': iqr_outliers.tolist()
            }
            # Z-score 方法
            z_scores = np.abs(stats.zscore(data))
            zscore_outliers = data[z_scores > 3]
            outliers['zscore'] = {
                'count': len(zscore_outliers),
                'percentage': (len(zscore_outliers) / len(data)) * 100,
                'values': zscore_outliers.tolist()
            }
            # Modified z-score (robust to outliers); 0.6745 scales MAD to
            # the std of a normal distribution, 3.5 is the usual cutoff.
            # Caution: mad == 0 (constant-majority data) yields inf scores.
            median = np.median(data)
            mad = stats.median_abs_deviation(data)
            modified_z_scores = 0.6745 * (data - median) / mad
            mod_z_outliers = data[np.abs(modified_z_scores) > 3.5]
            outliers['modified_zscore'] = {
                'count': len(mod_z_outliers),
                'percentage': (len(mod_z_outliers) / len(data)) * 100,
                'values': mod_z_outliers.tolist()
            }
            outlier_results[col] = outliers
        return outlier_results

    def normality_tests(self):
        """Run Shapiro, D'Agostino K2, Anderson-Darling and KS tests per column."""
        normality_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            tests = {}
            # Shapiro-Wilk is only reliable for moderate sample sizes.
            if len(data) < 5000:
                shapiro_stat, shapiro_p = stats.shapiro(data)
                tests['shapiro'] = {
                    'statistic': shapiro_stat,
                    'p_value': shapiro_p,
                    'is_normal': shapiro_p > 0.05
                }
            k2_stat, k2_p = stats.normaltest(data)
            tests['dagostino_k2'] = {
                'statistic': k2_stat,
                'p_value': k2_p,
                'is_normal': k2_p > 0.05
            }
            anderson_result = stats.anderson(data, dist='norm')
            tests['anderson'] = {
                'statistic': anderson_result.statistic,
                'critical_values': anderson_result.critical_values,
                'significance_levels': anderson_result.significance_level,
                # Index 2 corresponds to the 5% significance level.
                'is_normal': anderson_result.statistic < anderson_result.critical_values[2]
            }
            ks_stat, ks_p = stats.kstest(data, 'norm', args=(np.mean(data), np.std(data)))
            tests['kolmogorov_smirnov'] = {
                'statistic': ks_stat,
                'p_value': ks_p,
                'is_normal': ks_p > 0.05
            }
            normality_results[col] = tests
        return normality_results

    def confidence_intervals(self, confidence=0.95):
        """t-based and bootstrap CIs for the mean of every numeric column."""
        ci_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            n = len(data)
            mean = np.mean(data)
            std_err = stats.sem(data)
            ci = stats.t.interval(confidence, n - 1, loc=mean, scale=std_err)
            bootstrap_ci = self._bootstrap_ci(data, confidence=confidence)
            ci_results[col] = {
                'n': n,
                'mean': mean,
                'std_err': std_err,
                't_interval': ci,
                'bootstrap_interval': bootstrap_ci,
                'interval_width': ci[1] - ci[0]
            }
        return ci_results

    def _bootstrap_ci(self, data, confidence=0.95, n_bootstrap=1000):
        """Percentile bootstrap CI for the mean (resampling with replacement)."""
        bootstrap_means = []
        for _ in range(n_bootstrap):
            bootstrap_sample = np.random.choice(data, size=len(data), replace=True)
            bootstrap_means.append(np.mean(bootstrap_sample))
        alpha = (1 - confidence) / 2
        lower = np.percentile(bootstrap_means, alpha * 100)
        upper = np.percentile(bootstrap_means, (1 - alpha) * 100)
        return (lower, upper)

    def generate_advanced_report(self):
        """Print all sections and return outliers, normality results and CIs."""
        print("高级统计分析报告")
        print("=" * 50)
        print("\n异常值检测:")
        outlier_results = self.outlier_detection()
        for col, methods in outlier_results.items():
            print(f"\n列: {col}")
            for method, result in methods.items():
                print(f"  {method}: {result['count']} 个异常值 ({result['percentage']:.2f}%)")
        print("\n正态性检验:")
        normality_results = self.normality_tests()
        for col, tests in normality_results.items():
            print(f"\n列: {col}")
            for test_name, result in tests.items():
                is_normal = result.get('is_normal', None)
                status = '正态' if is_normal else '非正态'
                print(f"  {test_name}: {status}")
        print("\n置信区间:")
        ci_results = self.confidence_intervals()
        for col, result in ci_results.items():
            print(f"\n列: {col}")
            print(f"  均值: {result['mean']:.4f}")
            print(f"  t 置信区间: {result['t_interval']}")
            print(f"  bootstrap 置信区间: {result['bootstrap_interval']}")
        return {
            'outliers': outlier_results,
            'normality': normality_results,
            'confidence_intervals': ci_results
        }
# Build and print the advanced report (outliers, normality, CIs) for Iris.
advanced_stats = AdvancedStatistics(iris_df)
advanced_report = advanced_stats.generate_advanced_report()


微信公众号「极客日志」,在微信中扫描左侧二维码关注。展示文案:极客日志 zeeklog
使用加密算法(如AES、TripleDES、Rabbit或RC4)加密和解密文本明文。 在线工具,加密/解密文本在线工具,online
生成新的随机RSA私钥和公钥pem证书。 在线工具,RSA密钥对生成器在线工具,online
基于 Mermaid.js 实时预览流程图、时序图等图表,支持源码编辑与即时渲染。 在线工具,Mermaid 预览与可视化编辑在线工具,online
解析常见 curl 参数并生成 fetch、axios、PHP curl 或 Python requests 示例代码。 在线工具,curl 转代码在线工具,online
将字符串编码和解码为其 Base64 格式表示形式即可。 在线工具,Base64 字符串编码/解码在线工具,online
将字符串、文件或图像转换为其 Base64 表示形式。 在线工具,Base64 文件转换器在线工具,online