PythonAI算法
Python 数据统计指南:从基础配置到高级分析
Python 数据统计涵盖环境配置、数据获取、清洗预处理及描述性与高级统计分析。介绍 pandas、numpy、scipy 等核心库的使用,展示从 CSV/Excel/SQL/API 加载数据的方法,包含缺失值处理、异常值检测、分类编码及标准化技术。通过 Iris 数据集示例,演示基本统计量计算、分布检验、相关性分析及置信区间构建,提供完整的数据分析代码框架。

本文将按上述流程依次展开:先介绍环境配置与多种数据源的加载方式,再给出数据清洗与转换的通用工具类,最后结合 Iris 数据集演示描述性统计与高级统计分析的完整代码。


Python 在数据统计领域的强大主要得益于其丰富的库生态系统:
# 核心数据分析库
import pandas as pd
import numpy as np
# 数据可视化库
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# 统计分析库
import scipy.stats as stats
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
# 机器学习库
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
# 其他实用库
import warnings
warnings.filterwarnings('ignore')
# 推荐使用 conda 或 pip 安装必要包
"""
pip install pandas numpy matplotlib seaborn plotly
pip install scipy statsmodels scikit-learn
pip install jupyter notebook
# 交互式环境
"""
# Configure Matplotlib so CJK text renders (SimHei font).
plt.rcParams['font.sans-serif'] = ['SimHei']
# Without this, Chinese axis labels show as empty boxes.
plt.rcParams['axes.unicode_minus'] = False
# Keep the ASCII minus sign rendering correctly with a CJK font active.
# Default plot styling for the rest of the article.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
import pandas as pd
import numpy as np
import sqlite3
import requests
import json
class DataLoader:
    """Load tabular data from CSV / Excel / SQLite / HTTP JSON APIs.

    Every successfully loaded frame is cached in ``data_sources`` keyed by
    its source kind ('csv', 'excel', 'sql', 'api').

    NOTE(review): ``load_sql`` and ``load_api`` were reconstructed from a
    garbled source — confirm the method names and cache keys against the
    original article.
    """

    def __init__(self):
        # Cache of the most recent frame per source kind.
        self.data_sources = {}

    def load_csv(self, file_path, **kwargs):
        """Load a CSV file; returns the DataFrame, or None on failure."""
        try:
            df = pd.read_csv(file_path, **kwargs)
            self.data_sources['csv'] = df
            print(f"成功加载 CSV 文件,数据形状:{df.shape}")
            return df
        except Exception as e:
            print(f"加载 CSV 文件失败:{e}")
            return None

    def load_excel(self, file_path, sheet_name=0):
        """Load one sheet of an Excel workbook; None on failure."""
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
            self.data_sources['excel'] = df
            print(f"成功加载 Excel 文件,数据形状:{df.shape}")
            return df
        except Exception as e:
            print(f"加载 Excel 文件失败:{e}")
            return None

    def load_sql(self, db_path, query):
        """Run *query* against a SQLite database file and return the result frame."""
        try:
            conn = sqlite3.connect(db_path)
            try:
                df = pd.read_sql_query(query, conn)
            finally:
                # Always release the file handle, even if the query fails.
                conn.close()
            self.data_sources['sql'] = df
            print(f"成功加载 SQL 数据,数据形状:{df.shape}")
            return df
        except Exception as e:
            print(f"加载 SQL 数据失败:{e}")
            return None

    def load_api(self, url, params=None):
        """GET a JSON API and convert the payload to a DataFrame; None on failure."""
        try:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                data = response.json()
                df = pd.DataFrame(data)
                self.data_sources['api'] = df
                print(f"成功加载 API 数据,数据形状:{df.shape}")
                return df
            else:
                print(f"API 请求失败,状态码:{response.status_code}")
                return None
        except Exception as e:
            print(f"加载 API 数据失败:{e}")
            return None
# Demo: load the classic Iris dataset into a DataFrame.
loader = DataLoader()
from sklearn.datasets import load_iris

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
# NOTE(review): the target column name was lost in the garbled source
# (`iris_df[...] = iris.target`); 'target' is the conventional choice — confirm.
iris_df['target'] = iris.target
def explore_data(df, sample_size=5):
    """Print a broad overview of *df* and return its key summary objects.

    Prints shape, dtypes, head/tail previews, the numeric summary, a
    missing-value table and the unique-value count of every object column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore.
    sample_size : int
        Number of rows shown in the head/tail previews.

    Returns
    -------
    dict with keys 'shape', 'dtypes' and 'missing_info'.
    """
    print("=" * 50)
    print("数据集基本信息探索")
    print("=" * 50)
    # Basic shape information.
    print(f"数据形状:{df.shape}")
    print(f"行数:{df.shape[0]}")
    print(f"列数:{df.shape[1]}")
    # Column dtypes.
    print("\n数据类型信息:")
    print(df.dtypes)
    # Quick previews.
    print(f"\n前{sample_size}行数据:")
    print(df.head(sample_size))
    print(f"\n后{sample_size}行数据:")
    print(df.tail(sample_size))
    # Numeric summary.
    print("\n数值列统计摘要:")
    print(df.describe())
    # Missing-value table (count and percentage per column).
    print("\n缺失值统计:")
    missing_info = pd.DataFrame({
        '缺失数量': df.isnull().sum(),
        '缺失比例': df.isnull().sum() / len(df) * 100
    })
    print(missing_info)
    # Unique values of the categorical (object) columns.
    # NOTE(review): this tail was reconstructed from a garbled source.
    print("\n分类列唯一值统计:")
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        print(f"{col}: {df[col].nunique()} 个唯一值")
    return {
        'shape': df.shape,
        'dtypes': df.dtypes,
        'missing_info': missing_info
    }
# Run the full exploration report on the Iris frame; keep the summary dict.
info = explore_data(iris_df)
class DataCleaner:
    """Clean a DataFrame: missing values, duplicate rows and outliers.

    Works on an internal copy (the caller's frame is never mutated) and
    records every cleaning step in ``cleaning_log``.

    NOTE(review): several method bodies were reconstructed from a garbled
    source — verify fill strategies and log wording against the original.
    """

    def __init__(self, df):
        # Copy so the caller's frame is never mutated.
        self.df = df.copy()
        self.cleaning_log = []

    def detect_missing_values(self):
        """Return (per-column missing stats frame, columns with >50% missing)."""
        missing_stats = pd.DataFrame({
            'missing_count': self.df.isnull().sum(),
            'missing_percentage': (self.df.isnull().sum() / len(self.df)) * 100,
            'data_type': self.df.dtypes
        })
        # Columns that are mostly empty are candidates for dropping.
        high_missing_cols = missing_stats[missing_stats['missing_percentage'] > 50].index.tolist()
        self.cleaning_log.append({'step': '缺失值检测', 'details': f"发现 {len(high_missing_cols)} 个高缺失率列 (>50%)"})
        return missing_stats, high_missing_cols

    def handle_missing_values(self, strategy='auto', custom_strategy=None):
        """Fill missing values and drop mostly-empty columns.

        strategy='auto': numeric columns get the median, others the mode.
        strategy='custom': per-column fill values taken from *custom_strategy*.
        Returns the cleaned frame (also stored back into ``self.df``).
        """
        df_clean = self.df.copy()
        missing_stats, high_missing_cols = self.detect_missing_values()
        # Drop columns that are mostly missing — imputing them would be noise.
        if high_missing_cols:
            df_clean = df_clean.drop(columns=high_missing_cols)
            self.cleaning_log.append({'step': '删除高缺失率列', 'columns': high_missing_cols})
        for col in df_clean.columns:
            if df_clean[col].isnull().sum() > 0:
                method = None
                if strategy == 'auto':
                    if df_clean[col].dtype in ['int64', 'float64']:
                        # Median is robust to the outliers handled later.
                        fill_value = df_clean[col].median()
                        # Plain assignment instead of chained-assignment
                        # fillna(inplace=True), which breaks under pandas CoW.
                        df_clean[col] = df_clean[col].fillna(fill_value)
                        method = '中位数填充'
                    else:
                        # Mode can be empty when the column is all-NaN.
                        fill_value = df_clean[col].mode()[0] if not df_clean[col].mode().empty else 'Unknown'
                        df_clean[col] = df_clean[col].fillna(fill_value)
                        method = '众数填充'
                elif strategy == 'custom' and custom_strategy:
                    if col in custom_strategy:
                        fill_value = custom_strategy[col]
                        df_clean[col] = df_clean[col].fillna(fill_value)
                        method = '自定义填充'
                self.cleaning_log.append({
                    'step': '缺失值填充',
                    'column': col,
                    'method': method,
                    'filled_count': self.df[col].isnull().sum()
                })
        self.df = df_clean
        return df_clean

    def remove_duplicates(self):
        """Drop exact duplicate rows; log how many were removed."""
        initial_count = len(self.df)
        self.df = self.df.drop_duplicates()
        removed_count = initial_count - len(self.df)
        self.cleaning_log.append({
            'step': '删除重复行',
            'removed': removed_count,
            'remaining': len(self.df)
        })
        return self.df

    def handle_outliers(self, method='iqr', threshold=3):
        """Tame outliers in every numeric column.

        method='iqr': clip values to the 1.5*IQR fences (winsorize).
        method='zscore': replace points with |z| > *threshold* by the median.
        """
        df_clean = self.df.copy()
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        outliers_info = {}
        for col in numeric_cols:
            outlier_count = 0
            if method == 'iqr':
                Q1 = df_clean[col].quantile(0.25)
                Q3 = df_clean[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)]
                outlier_count = len(outliers)
                # Clip to the fences instead of dropping rows.
                df_clean[col] = np.where(df_clean[col] < lower_bound, lower_bound, df_clean[col])
                df_clean[col] = np.where(df_clean[col] > upper_bound, upper_bound, df_clean[col])
            elif method == 'zscore':
                z_scores = np.abs(stats.zscore(df_clean[col]))
                outlier_count = len(df_clean[z_scores > threshold])
                median = df_clean[col].median()
                # Replace extreme points with the robust center.
                df_clean[col] = np.where(z_scores > threshold, median, df_clean[col])
            outliers_info[col] = outlier_count
        self.cleaning_log.append({'step': '异常值处理', 'method': method, 'outliers': outliers_info})
        self.df = df_clean
        return df_clean

    def get_cleaning_report(self):
        """Pretty-print the accumulated cleaning log."""
        print("数据清洗报告")
        print("=" * 50)
        for log in self.cleaning_log:
            print(f"步骤: {log['step']}")
            for key, value in log.items():
                if key != 'step':
                    print(f"  {key}: {value}")
            print()
# Build a synthetic frame to exercise DataCleaner.
# NOTE(review): the literal seed, sizes and injected values below were
# reconstructed from a garbled source — conventional choices, not confirmed.
np.random.seed(42)
test_data = pd.DataFrame({
    'A': np.random.normal(0, 1, 100),
    'B': np.random.normal(50, 10, 100),
    'C': np.random.choice(['X', 'Y', 'Z'], 100),
    'D': np.random.exponential(1, 100)
})
# Inject missing values and a few extreme points to clean up.
test_data.loc[10:20, 'A'] = np.nan
test_data.loc[30:35, 'B'] = np.nan
test_data.loc[5, 'D'] = 100
test_data.loc[15, 'B'] = 200
cleaner = DataCleaner(test_data)
cleaned_data = cleaner.handle_missing_values()
cleaned_data = cleaner.remove_duplicates()
cleaned_data = cleaner.handle_outliers()
cleaner.get_cleaning_report()
class DataTransformer:
    """Encode, scale and derive features on a DataFrame copy.

    All transformations are logged in ``transformation_log``.

    NOTE(review): ``scale_numerical`` and ``create_features`` were
    reconstructed from a garbled source — verify parameter defaults and
    feature names against the original article.
    """

    def __init__(self, df):
        # Copy so the caller's frame is never mutated.
        self.df = df.copy()
        self.transformation_log = []

    def encode_categorical(self, columns=None, method='onehot'):
        """Encode categorical columns.

        method='onehot': expand each column into dummy columns.
        method='label': integer-encode categories (sklearn LabelEncoder).
        method='target': replace categories by the mean of a 'target' column,
        only when that column exists.
        """
        df_encoded = self.df.copy()
        if columns is None:
            categorical_cols = df_encoded.select_dtypes(include=['object']).columns
        else:
            categorical_cols = columns
        for col in categorical_cols:
            encoding_type = None
            if method == 'onehot':
                # One-Hot 编码
                dummies = pd.get_dummies(df_encoded[col], prefix=col)
                df_encoded = pd.concat([df_encoded, dummies], axis=1)
                df_encoded.drop(col, axis=1, inplace=True)
                encoding_type = "One-Hot 编码"
            elif method == 'label':
                # 标签编码 — import kept local so onehot use needs no sklearn.
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                df_encoded[col] = le.fit_transform(df_encoded[col])
                encoding_type = "标签编码"
            elif method == 'target':
                # 目标编码(需要目标变量)
                if 'target' in df_encoded.columns:
                    target_mean = df_encoded.groupby(col)['target'].mean()
                    # Replace each category with its mean target value.
                    df_encoded[col] = df_encoded[col].map(target_mean)
                    encoding_type = "目标编码"
            self.transformation_log.append({'step': '分类编码', 'column': col, 'method': encoding_type})
        self.df = df_encoded
        return df_encoded

    def scale_numerical(self, columns=None, method='standard'):
        """Scale numeric columns; returns (scaled frame, fitted scaler or None)."""
        from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
        df_scaled = self.df.copy()
        if columns is None:
            numerical_cols = df_scaled.select_dtypes(include=[np.number]).columns
        else:
            numerical_cols = columns
        scaler = None
        scaling_type = None
        if method == 'standard':
            scaler = StandardScaler()
            scaling_type = '标准化 (z-score)'
        elif method == 'minmax':
            scaler = MinMaxScaler()
            scaling_type = '归一化 (min-max)'
        elif method == 'robust':
            scaler = RobustScaler()
            scaling_type = '鲁棒缩放'
        if scaler:
            df_scaled[numerical_cols] = scaler.fit_transform(df_scaled[numerical_cols])
            self.transformation_log.append({
                'step': '数值缩放',
                'columns': list(numerical_cols),
                'method': scaling_type
            })
        self.df = df_scaled
        return df_scaled, scaler

    def create_features(self):
        """Add pairwise-interaction, z-score and rank features for numeric columns."""
        df_featured = self.df.copy()
        numerical_cols = df_featured.select_dtypes(include=[np.number]).columns
        from sklearn.preprocessing import PolynomialFeatures
        if len(numerical_cols) >= 2:
            # Interaction terms of the first two numeric columns only.
            poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
            poly_features = poly.fit_transform(df_featured[numerical_cols[:2]])
            poly_feature_names = poly.get_feature_names_out(numerical_cols[:2])
            # Keep the original index so concat aligns row-wise.
            poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_featured.index)
            df_featured = pd.concat([df_featured, poly_df], axis=1)
            self.transformation_log.append({
                'step': '特征构造',
                'method': '二阶交互特征',
                'features': list(poly_feature_names)
            })
        for col in numerical_cols:
            df_featured[f'{col}_zscore'] = stats.zscore(df_featured[col])
            df_featured[f'{col}_rank'] = df_featured[col].rank()
        self.transformation_log.append({
            'step': '特征构造',
            'method': 'z-score 与排名特征',
            'features': [f'{col}_zscore' for col in numerical_cols] + [f'{col}_rank' for col in numerical_cols]
        })
        self.df = df_featured
        return df_featured
# Demo: standard-scale the Iris features, then derive extra statistical features.
transformer = DataTransformer(iris_df)
transformed_data, scaler = transformer.scale_numerical(method='standard')
transformer.create_features()
class DescriptiveStatistics:
    """Descriptive statistics for a DataFrame: summaries, distribution
    checks and correlation matrices.

    NOTE(review): the bodies of ``categorical_stats``, ``distribution_test``
    and ``generate_report`` were reconstructed from a garbled source —
    verify key names and print wording against the original.
    """

    def __init__(self, df):
        self.df = df
        self.numerical_cols = df.select_dtypes(include=[np.number]).columns
        self.categorical_cols = df.select_dtypes(include=['object']).columns

    def basic_stats(self):
        """Per-column summary statistics for numeric columns (rows = columns)."""
        stats_summary = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            stats_summary[col] = {
                'count': len(data),
                'mean': np.mean(data),
                'median': np.median(data),
                'std': np.std(data),
                'variance': np.var(data),
                'min': np.min(data),
                'max': np.max(data),
                'range': np.max(data) - np.min(data),
                'q1': np.percentile(data, 25),
                'q3': np.percentile(data, 75),
                'iqr': np.percentile(data, 75) - np.percentile(data, 25),
                'skewness': stats.skew(data),
                'kurtosis': stats.kurtosis(data),
                # Coefficient of variation; undefined for zero mean.
                'cv': (np.std(data) / np.mean(data)) * 100 if np.mean(data) != 0 else np.inf
            }
        return pd.DataFrame(stats_summary).T

    def categorical_stats(self):
        """Per-column summary for object columns: counts, mode, entropy."""
        cat_stats = {}
        for col in self.categorical_cols:
            data = self.df[col].dropna()
            value_counts = data.value_counts()
            cat_stats[col] = {
                'count': len(data),
                'unique': len(value_counts),
                'top': value_counts.index[0] if len(value_counts) > 0 else None,
                'freq': value_counts.iloc[0] if len(value_counts) > 0 else 0,
                'top_percentage': (value_counts.iloc[0] / len(data)) * 100 if len(value_counts) > 0 else 0,
                # Shannon entropy of the category distribution.
                'entropy': stats.entropy(value_counts)
            }
        return pd.DataFrame(cat_stats).T

    def distribution_test(self):
        """Shapiro-Wilk and D'Agostino normality tests per numeric column."""
        distribution_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            # Shapiro-Wilk is only reliable for moderate sample sizes.
            shapiro_stat, shapiro_p = stats.shapiro(data) if len(data) < 5000 else (np.nan, np.nan)
            normaltest_stat, normaltest_p = stats.normaltest(data)
            distribution_results[col] = {
                'shapiro_stat': shapiro_stat,
                'shapiro_p': shapiro_p,
                'normaltest_stat': normaltest_stat,
                'normaltest_p': normaltest_p,
                'is_normal_shapiro': shapiro_p > 0.05 if not np.isnan(shapiro_p) else None,
                'is_normal_normaltest': normaltest_p > 0.05
            }
        return pd.DataFrame(distribution_results).T

    def correlation_analysis(self):
        """Pearson / Spearman / Kendall correlation matrices of numeric columns."""
        pearson_corr = self.df[self.numerical_cols].corr(method='pearson')
        spearman_corr = self.df[self.numerical_cols].corr(method='spearman')
        kendall_corr = self.df[self.numerical_cols].corr(method='kendall')
        return {'pearson': pearson_corr, 'spearman': spearman_corr, 'kendall': kendall_corr}

    def generate_report(self):
        """Print every section and return all results in one dict."""
        print("描述性统计报告")
        print("=" * 50)
        print("\n基本统计量:")
        basic_stats_df = self.basic_stats()
        print(basic_stats_df.round(4))
        # Guard so cat_stats_df is always defined, even without object columns.
        cat_stats_df = None
        if len(self.categorical_cols) > 0:
            print("\n分类变量统计:")
            cat_stats_df = self.categorical_stats()
            print(cat_stats_df.round(4))
        print("\n分布检验:")
        dist_test_df = self.distribution_test()
        print(dist_test_df.round(4))
        print("\n相关性分析 (Pearson):")
        corr_results = self.correlation_analysis()
        print(corr_results['pearson'].round(4))
        return {
            'basic_stats': basic_stats_df,
            'categorical_stats': cat_stats_df if len(self.categorical_cols) > 0 else None,
            'distribution_tests': dist_test_df,
            'correlations': corr_results
        }
# Build and print the descriptive-statistics report for the Iris frame.
desc_stats = DescriptiveStatistics(iris_df)
report = desc_stats.generate_report()
class AdvancedStatistics:
    """Advanced statistics: multi-method outlier detection, a battery of
    normality tests, and t / bootstrap confidence intervals.

    NOTE(review): the modified z-score constants (0.6745, cutoff 3.5), the
    reconstructed method bodies and the report wording come from a garbled
    source — verify against the original article.
    """

    def __init__(self, df):
        self.df = df
        self.numerical_cols = df.select_dtypes(include=[np.number]).columns

    def outlier_detection(self, method='multiple'):
        """Detect outliers per numeric column with IQR, z-score and modified z-score."""
        outlier_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            outliers = {}
            # IQR 方法
            Q1 = np.percentile(data, 25)
            Q3 = np.percentile(data, 75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            iqr_outliers = data[(data < lower_bound) | (data > upper_bound)]
            outliers['iqr'] = {
                'count': len(iqr_outliers),
                'percentage': (len(iqr_outliers) / len(data)) * 100,
                'values': iqr_outliers.tolist()
            }
            # Z-score 方法
            z_scores = np.abs(stats.zscore(data))
            zscore_outliers = data[z_scores > 3]
            outliers['zscore'] = {
                'count': len(zscore_outliers),
                'percentage': (len(zscore_outliers) / len(data)) * 100,
                'values': zscore_outliers.tolist()
            }
            # Modified z-score (robust to outliers); 0.6745 scales MAD to
            # the std of a normal distribution, 3.5 is the usual cutoff.
            # Caution: mad == 0 (constant-majority data) yields inf scores.
            median = np.median(data)
            mad = stats.median_abs_deviation(data)
            modified_z_scores = 0.6745 * (data - median) / mad
            mod_z_outliers = data[np.abs(modified_z_scores) > 3.5]
            outliers['modified_zscore'] = {
                'count': len(mod_z_outliers),
                'percentage': (len(mod_z_outliers) / len(data)) * 100,
                'values': mod_z_outliers.tolist()
            }
            outlier_results[col] = outliers
        return outlier_results

    def normality_tests(self):
        """Run Shapiro, D'Agostino K2, Anderson-Darling and KS tests per column."""
        normality_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            tests = {}
            # Shapiro-Wilk is only reliable for moderate sample sizes.
            if len(data) < 5000:
                shapiro_stat, shapiro_p = stats.shapiro(data)
                tests['shapiro'] = {
                    'statistic': shapiro_stat,
                    'p_value': shapiro_p,
                    'is_normal': shapiro_p > 0.05
                }
            k2_stat, k2_p = stats.normaltest(data)
            tests['dagostino_k2'] = {
                'statistic': k2_stat,
                'p_value': k2_p,
                'is_normal': k2_p > 0.05
            }
            anderson_result = stats.anderson(data, dist='norm')
            tests['anderson'] = {
                'statistic': anderson_result.statistic,
                'critical_values': anderson_result.critical_values,
                'significance_levels': anderson_result.significance_level,
                # Index 2 corresponds to the 5% significance level.
                'is_normal': anderson_result.statistic < anderson_result.critical_values[2]
            }
            ks_stat, ks_p = stats.kstest(data, 'norm', args=(np.mean(data), np.std(data)))
            tests['kolmogorov_smirnov'] = {
                'statistic': ks_stat,
                'p_value': ks_p,
                'is_normal': ks_p > 0.05
            }
            normality_results[col] = tests
        return normality_results

    def confidence_intervals(self, confidence=0.95):
        """t-based and bootstrap CIs for the mean of every numeric column."""
        ci_results = {}
        for col in self.numerical_cols:
            data = self.df[col].dropna()
            n = len(data)
            mean = np.mean(data)
            std_err = stats.sem(data)
            ci = stats.t.interval(confidence, n - 1, loc=mean, scale=std_err)
            bootstrap_ci = self._bootstrap_ci(data, confidence=confidence)
            ci_results[col] = {
                'n': n,
                'mean': mean,
                'std_err': std_err,
                't_interval': ci,
                'bootstrap_interval': bootstrap_ci,
                'interval_width': ci[1] - ci[0]
            }
        return ci_results

    def _bootstrap_ci(self, data, confidence=0.95, n_bootstrap=1000):
        """Percentile bootstrap CI for the mean (resampling with replacement)."""
        bootstrap_means = []
        for _ in range(n_bootstrap):
            bootstrap_sample = np.random.choice(data, size=len(data), replace=True)
            bootstrap_means.append(np.mean(bootstrap_sample))
        alpha = (1 - confidence) / 2
        lower = np.percentile(bootstrap_means, alpha * 100)
        upper = np.percentile(bootstrap_means, (1 - alpha) * 100)
        return (lower, upper)

    def generate_advanced_report(self):
        """Print all sections and return outliers, normality results and CIs."""
        print("高级统计分析报告")
        print("=" * 50)
        print("\n异常值检测:")
        outlier_results = self.outlier_detection()
        for col, methods in outlier_results.items():
            print(f"\n列: {col}")
            for method, result in methods.items():
                print(f"  {method}: {result['count']} 个异常值 ({result['percentage']:.2f}%)")
        print("\n正态性检验:")
        normality_results = self.normality_tests()
        for col, tests in normality_results.items():
            print(f"\n列: {col}")
            for test_name, result in tests.items():
                is_normal = result.get('is_normal', None)
                status = '正态' if is_normal else '非正态'
                print(f"  {test_name}: {status}")
        print("\n置信区间:")
        ci_results = self.confidence_intervals()
        for col, result in ci_results.items():
            print(f"\n列: {col}")
            print(f"  均值: {result['mean']:.4f}")
            print(f"  t 置信区间: {result['t_interval']}")
            print(f"  bootstrap 置信区间: {result['bootstrap_interval']}")
        return {
            'outliers': outlier_results,
            'normality': normality_results,
            'confidence_intervals': ci_results
        }
# Build and print the advanced report (outliers, normality, CIs) for Iris.
advanced_stats = AdvancedStatistics(iris_df)
advanced_report = advanced_stats.generate_advanced_report()


微信公众号「极客日志」,在微信中扫描左侧二维码关注。展示文案:极客日志 zeeklog
使用加密算法(如AES、TripleDES、Rabbit或RC4)加密和解密文本明文。 在线工具,加密/解密文本在线工具,online
生成新的随机RSA私钥和公钥pem证书。 在线工具,RSA密钥对生成器在线工具,online
基于 Mermaid.js 实时预览流程图、时序图等图表,支持源码编辑与即时渲染。 在线工具,Mermaid 预览与可视化编辑在线工具,online
解析常见 curl 参数并生成 fetch、axios、PHP curl 或 Python requests 示例代码。 在线工具,curl 转代码在线工具,online
将字符串编码和解码为其 Base64 格式表示形式即可。 在线工具,Base64 字符串编码/解码在线工具,online
将字符串、文件或图像转换为其 Base64 表示形式。 在线工具,Base64 文件转换器在线工具,online