import requests
import json
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from urllib.parse import urlparse
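# Third-party dependencies (PyPI names differ slightly from the import names):
#   pip install requests beautifulsoup4 lxml fake-useragent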
class WeChatArticleSpider:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {
            "User-Agent": self.ua.random,
            "Referer": "https://mp.weixin.qq.com/",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # Paste the Cookie from a logged-in WeChat Official Account session here
            "Cookie": "your WeChat Official Account Cookie",
        }
        self.timeout = 10  # per-request timeout in seconds
        self.delay = 2     # polite delay between requests in seconds
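
    # Note: the User-Agent is randomized once, at construction time. For long
    # sessions you can re-randomize it per request by refreshing
    # self.headers["User-Agent"] = self.ua.random before each call.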
    def get_article_html(self, article_url):
        """
        Fetch the HTML source of an article page.
        :param article_url: Official Account article URL
        :return: page HTML text, or None on failure
        """
        try:
            time.sleep(self.delay)  # throttle requests to reduce the risk of being blocked
            response = requests.get(
                url=article_url,
                headers=self.headers,
                timeout=self.timeout,
                allow_redirects=True
            )
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch page: {e}")
            return None
    def parse_static_content(self, html):
        """
        Parse static content: title, author, publish time, body text.
        :param html: page HTML text
        :return: dict of static fields
        """
        soup = BeautifulSoup(html, "lxml")
        static_data = {}
        title_tag = soup.find("h1", class_="rich_media_title")
        static_data["title"] = title_tag.get_text(strip=True) if title_tag else "Unknown title"
        author_tag = soup.find("a", class_="rich_media_meta_link")
        static_data["author"] = author_tag.get_text(strip=True) if author_tag else "Unknown author"
        time_tag = soup.find("em", class_="rich_media_meta rich_media_meta_text")
        static_data["publish_time"] = time_tag.get_text(strip=True) if time_tag else "Unknown time"
        content_tag = soup.find("div", class_="rich_media_content")
        if content_tag:
            # Drop scripts, styles and embedded iframes before extracting text
            for useless_tag in content_tag.find_all(["script", "style", "iframe"]):
                useless_tag.decompose()
            static_data["content"] = content_tag.get_text(strip=True, separator="\n")
        else:
            static_data["content"] = "Failed to parse body text"
        return static_data
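
    # Caveat: the class names above match the mp.weixin.qq.com article markup
    # at the time of writing and may change. If fields start falling back to
    # the "Unknown ..." defaults, re-inspect the page; for instance, the
    # publish-time element currently also carries id="publish_time", so
    # soup.find("em", id="publish_time") can serve as a fallback selector
    # (an assumption about the current markup, not a guarantee).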
    def parse_dynamic_data(self, html):
        """
        Parse dynamic data: read count and "Wow" count, extracted from a JS
        variable embedded in the page (if present).
        :param html: page HTML text
        :return: dict of dynamic fields
        """
        dynamic_data = {"read_count": 0, "like_count": 0}
        try:
            js_start = html.find("var msgBizInfo = ")
            if js_start != -1:
                # Slice out the object literal up to and including the closing
                # brace; this assumes the embedded object is valid JSON
                js_end = html.find("};", js_start) + 1
                js_str = html[js_start:js_end].replace("var msgBizInfo = ", "")
                biz_info = json.loads(js_str)
                dynamic_data["read_count"] = biz_info.get("read_num", 0)
                dynamic_data["like_count"] = biz_info.get("like_num", 0)
        except Exception as e:
            print(f"Failed to parse dynamic data: {e}")
        return dynamic_data
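
    # Read/"Wow" counts are usually NOT embedded in the publicly served HTML;
    # the WeChat client fetches them through a private interface. The sketch
    # below follows the commonly documented (unofficial) getappmsgext endpoint.
    # The parameter names and the appmsg_token/key credentials (captured from a
    # logged-in WeChat client, e.g. via a debugging proxy) are assumptions and
    # may change at any time.
    def fetch_dynamic_via_api(self, biz, mid, idx, sn, appmsg_token, key):
        """Hedged sketch: query read/"Wow" counts via the unofficial stats API."""
        api_url = "https://mp.weixin.qq.com/mp/getappmsgext"
        params = {"__biz": biz, "mid": mid, "idx": idx, "sn": sn,
                  "appmsg_token": appmsg_token, "key": key}
        try:
            resp = requests.post(api_url, headers=self.headers, params=params,
                                 data={"is_only_read": 1}, timeout=self.timeout)
            stat = resp.json().get("appmsgstat", {})
            return {"read_count": stat.get("read_num", 0),
                    "like_count": stat.get("like_num", 0)}
        except Exception as e:
            print(f"Stats API request failed: {e}")
            return {"read_count": 0, "like_count": 0}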
    def run(self, article_url):
        """
        Main crawling flow.
        :param article_url: Official Account article URL
        :return: dict with the full article data, or None on failure
        """
        parsed_url = urlparse(article_url)
        if not parsed_url.scheme or not parsed_url.netloc:
            print("Malformed URL")
            return None
        html = self.get_article_html(article_url)
        if not html:
            return None
        static_data = self.parse_static_content(html)
        dynamic_data = self.parse_dynamic_data(html)
        # Merge static and dynamic fields into a single record
        article_data = {**static_data, **dynamic_data}
        return article_data
if __name__ == "__main__":
    spider = WeChatArticleSpider()
    target_url = "https://mp.weixin.qq.com/s/xxxxxx"
    result = spider.run(target_url)
    if result:
        print("===== WeChat Official Account article crawl result =====")
        print(f"Title: {result['title']}")
        print(f"Author: {result['author']}")
        print(f"Publish time: {result['publish_time']}")
        print(f"Read count: {result['read_count']}")
        print(f"'Wow' count: {result['like_count']}")
        print(f"Body (first 200 chars): {result['content'][:200]}...")
        # Save the result to a local JSON file (json is already imported above)
        with open("wechat_article.json", "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
        print("Data saved to wechat_article.json")
    else:
        print("Crawl failed")
    def batch_crawl(spider, url_list):
        """Batch-crawl multiple Official Account articles with one spider instance."""
        all_results = []
        for url in url_list:
            data = spider.run(url)
            if data:
                all_results.append(data)
        return all_results

    url_list = [
        "https://mp.weixin.qq.com/s/xxxxxx1",
        "https://mp.weixin.qq.com/s/xxxxxx2",
    ]
    batch_result = batch_crawl(spider, url_list)
    print(f"Batch crawl finished; fetched {len(batch_result)} articles in total")