Python 爬虫实战：反爬绕过与数据解析常见问题总结

第一个坑：User-Agent 被识别为爬虫

刚开始写爬虫时，直接用 requests.get(url) 发送请求，结果返回 403 Forbidden。

问题原因：requests 默认的 User-Agent 是 python-requests/版本号，网站一眼就能识别这是爬虫。

解决方案：自定义请求头，伪装成浏览器。

import requests

# Example target; replace it with the page you actually want to fetch.
url = 'https://example.com'

# Browser-like request headers. The default python-requests User-Agent is
# what gets the bare requests.get(url) call rejected with 403.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
# The timeout keeps the call from blocking forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)

经验：Headers 越完整越像真人在访问，最少也要加上 User-Agent。

第二个坑：IP 被封禁

搞定了 User-Agent 开始爬数据，结果爬了 100 多条后，又返回 403 了。这次更惨，换浏览器也不行——IP 被封了！

问题原因：同一个 IP 短时间内请求太频繁，触发了网站的反爬机制。

解决方案：使用代理 IP 池 + 请求间隔。

import requests
import time
import random

# Proxy pool (sample addresses; real ones must be purchased or collected
# from free lists).
# Each key is the scheme of the *target* URL; each value is the proxy's own
# URL. Assuming these are ordinary HTTP proxies, they are addressed with
# http:// for HTTPS targets as well -- an https:// proxy URL would make the
# client attempt a TLS handshake with the proxy itself.
proxies = [
    {'http': 'http://123.45.67.89:8080', 'https': 'http://123.45.67.89:8080'},
    {'http': 'http://98.76.54.32:8080', 'https': 'http://98.76.54.32:8080'},
]

def fetch_with_proxy(url):
    """Fetch ``url`` through a proxy picked at random from ``proxies``.

    Args:
        url: Address of the page to request.

    Returns:
        The ``requests.Response`` when the request completes, or ``None``
        when it raises a ``requests.RequestException`` (connection error,
        proxy error, timeout, ...).
    """
    # Use a different, randomly selected proxy for every request.
    proxy = random.choice(proxies)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        return requests.get(url, headers=headers, proxies=proxy, timeout=10)
    except requests.RequestException as e:
        # Only network-level failures are treated as "request failed";
        # programming errors are left to propagate.
        print(f"请求失败：{e}")
        return None
    finally:
        # Random 1-3 second pause after every attempt, failed ones included,
        # so a run of errors does not turn into back-to-back requests.
        time.sleep(random.uniform(1, 3))

# Usage example: request list pages 1 through 10 via the proxy pool.
for page in range(1, 11):
    url = f"https://example.com/list?page={page}"
    response = fetch_with_proxy(url)
    # fetch_with_proxy returns None when the request fails; skip those pages.
    if not response:
        continue
    print(f"第 {page} 页爬取成功")

经验：免费代理不稳定，生产环境建议用付费代理服务。请求间隔是关键，别太贪心。

第三个坑：CSS 选择器解析不到数据

终于绕过了反爬，拿到 HTML 了。结果用 BeautifulSoup 解析时，什么都拿不到！

from bs4 import BeautifulSoup

# The span carries class="name" so the class-based lookup below has
# something to match; without it find() returns None and .text fails.
html = '<div><span class="name">商品名称</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# Wrong: class is a reserved word in Python, so it cannot be used as a
# keyword argument.
# name = soup.find(class='name').text  # SyntaxError

# Right: use the class_ parameter instead.
name = soup.find(class_='name').text
print(name)
# Output: 商品名称

问题原因：class 是 Python 关键字，BeautifulSoup 用 class_ 参数代替。

解决方案：

from bs4 import BeautifulSoup

# Complete example: parse a product list.
# Every element carries the class the lookups below search for; without the
# class attributes both approaches find nothing.
html = '''
<div class="product-list">
    <div class="product-item">
        <span class="name">iPhone 15</span>
        <span class="price">5999</span>
    </div>
    <div class="product-item">
        <span class="name">华为 Mate 60</span>
        <span class="price">6999</span>
    </div>
</div>
'''
soup = BeautifulSoup(html, 'html.parser')

# Approach 1: the class_ parameter.
items = soup.find_all(class_='product-item')
for item in items:
    name = item.find(class_='name').text
    price = item.find(class_='price').text
    print(f"商品：{name}, 价格：{price}")

# Approach 2: CSS selectors (more concise).
names = soup.select('.product-item .name')
prices = soup.select('.product-item .price')
for name, price in zip(names, prices):
    print(f"商品：{name.text}, 价格：{price.text}")

经验：BeautifulSoup 的 class_ 参数容易忘，CSS 选择器 .class 更直观。

第四个坑：JSON 数据中文乱码

这次爬到了一个 API 接口，返回的是 JSON 数据。结果打印出来全是 \u4e2d\u6587 这种 Unicode 转义符，看得我头皮发麻。

import requests

# The timeout keeps the example from hanging forever on an unresponsive API.
response = requests.get('https://api.example.com/data', timeout=10)
data = response.json()

# Parse the JSON directly.
# Problem: the Chinese text is not displayed correctly.
# NOTE(review): the exact garbled form depends on the encoding requests
# assumed for this response -- confirm against the real output.
print(data)
# {'name': '\u4e2d\u6587', 'city': '\u5317\u4eac'}

# Fix: set the correct encoding before decoding the body.
response.encoding = 'utf-8'  # the key step
data = response.json()
print(data)
# {'name': '中文', 'city': '北京'}

问题原因：当响应头的 Content-Type 是 text/* 却没有指明 charset 时，requests 会默认按 ISO-8859-1 解码响应内容，导致中文乱码。

完整解决方案：

import requests
import json

def fetch_json(url, timeout=10):
    """Download ``url`` and return its body parsed as JSON.

    Args:
        url: Address of the JSON endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON value, or ``None`` when the body is not valid JSON
        (the first 200 characters of the raw text are printed in that case).
    """
    response = requests.get(url, timeout=timeout)
    # Option 1: let requests detect the encoding from the body.
    response.encoding = response.apparent_encoding
    # Option 2: force UTF-8.
    # response.encoding = 'utf-8'
    # Option 3: for GBK-encoded responses.
    # response.encoding = 'gbk'

    try:
        return response.json()
    except ValueError:
        # ValueError is the shared base class of the JSON decode errors
        # raised by the stdlib json module, simplejson and requests, so this
        # works whichever backend requests is using.
        print("JSON 解析失败，原始文本：", response.text[:200])
        return None

# Usage example: print the name of every user returned by the API.
url = 'https://api.example.com/users'
data = fetch_json(url)
# fetch_json returns None when parsing fails; an empty result is skipped too.
for user in data or ():
    print(f"用户名：{user.get('name','N/A')}")

经验：拿到 JSON 数据后，先 print(response.text[:200]) 看看原始内容，确认编码再解析。

第五个坑：异步加载的数据抓不到

最坑爹的情况来了：网页用 JavaScript 动态加载数据，requests 拿到的是空壳 HTML！

import requests
from bs4 import BeautifulSoup

url = 'https://example.com/dynamic-page'
# The timeout keeps the demo from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')

# Problem: the page comes back empty.
items = soup.find_all(class_='product-item')
print(f"找到 {len(items)} 个商品")
# Output: 找到 0 个商品
# The data is loaded dynamically by JavaScript, so the original HTML does
# not contain these elements at all.

问题原因：现代网页大量使用 AJAX、Vue、React 等技术，数据在浏览器中通过 JavaScript 动态渲染，requests 只能拿到最初的 HTML 骨架。

解决方案：使用 Selenium 或 Playwright 模拟浏览器行为。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Configure Chrome to run in headless mode.
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=options)

try:
    # Open the page.
    driver.get('https://example.com/dynamic-page')
    # Wait up to 10 seconds for the first JavaScript-rendered item to appear.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'product-item'))
    )
    # Scroll to the bottom a few times to trigger lazy loading.
    for _ in range(3):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(1)

    # Extract the data.
    items = driver.find_elements(By.CLASS_NAME, 'product-item')
    print(f"找到 {len(items)} 个商品")
    for item in items:
        name = item.find_element(By.CLASS_NAME, 'name').text
        price = item.find_element(By.CLASS_NAME, 'price').text
        print(f"商品：{name}, 价格：{price}")
finally:
    # Always shut the browser down, even when waiting or parsing fails.
    driver.quit()

或者用 Playwright（更现代的选择）：

import asyncio
from playwright.async_api import async_playwright

async def main():
    """Render the dynamic page in headless Chromium and print each product's name and price."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto('https://example.com/dynamic-page')
            # Wait until the JavaScript-rendered items exist in the DOM.
            await page.wait_for_selector('.product-item')
            # Extract the data.
            items = await page.query_selector_all('.product-item')
            for item in items:
                # eval_on_selector is the Python ElementHandle method that
                # evaluates a JS expression on the first match inside item.
                name = await item.eval_on_selector('.name', 'el => el.textContent')
                price = await item.eval_on_selector('.price', 'el => el.textContent')
                print(f"商品：{name}, 价格：{price}")
        finally:
            # Close the browser even if navigation or extraction fails.
            await browser.close()


asyncio.run(main())

经验：遇到动态加载的页面，先右键查看网页源代码，如果搜不到数据，就是 JavaScript 渲染的，趁早用 Selenium/Playwright。

Python 爬虫实战：反爬绕过与数据解析常见问题总结