Python 实现携程景区评论数据爬取与分析

1. 前言

本文记录如何使用 Python 爬取携程景区的评论数据，并进行简单的分析与存储。

2. 爬虫实现

2.1 定位接口

打开携程网，找到某个景区点击跳转到详情页面。

文章配图

按 F12 打开开发者工具，然后点击评论区的“下一页”，观察 Network 面板中是否有新的网络请求。如果没有，再看页面地址栏是否发生变化。前者是动态更新（页面通过 XHR/Fetch 调用后端接口获取数据，本例中是 post 请求），后者是静态更新（浏览器重新请求 html 页面，数据直接包含在页面中，通常是 get 请求）。

通过搜索评论内容定位到评论数据是通过调 getCommentCollapseList 接口返回的。

文章配图

那么我们现在已经知道数据在哪个接口中，接下来只需在本地模拟调用这个请求即可，这里我是用 Python 的 requests 库实现。你问我怎么知道调这个请求需要携带哪些参数？

那我告诉你一个快速又便捷的方法：在开发者工具中右键该请求，选择复制为 cURL (bash)，然后粘贴到 https://curlconverter.com/ 网站，它会自动输出完整的请求调用代码。

import requests
# Cookies, headers, query parameters and payload below were generated from the
# browser request (copied as cURL and converted); replace the session-specific
# values with ones from a live session before running.
cookies = {
    'GUID': '09031069217559688465',
    'MKT_CKID': '1751274744072.9fx30.ncpi',
    '_RSG': 'Ce4EW5dni37P3spnPcTGtA',
    # ... (other cookie fields)
}
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'cache-control': 'no-cache',
    'content-type': 'application/json',
    'cookieorigin': 'https://you.ctrip.com',
    'origin': 'https://you.ctrip.com',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://you.ctrip.com/',
    'sec-ch-ua': '"Chromium";v="142", "Microsoft Edge";v="142", "Not_A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0',
    'x-ctx-currency': 'CNY',
    'x-ctx-locale': 'zh-CN',
    'x-ctx-ubt-pageid': '290510',
    'x-ctx-ubt-pvid': '13',
    'x-ctx-ubt-sid': '57',
    'x-ctx-ubt-vid': '1752483344037.225tvy',
}
params = {
    '_fxpcqlniredt': '09031069217559688465',
    'x-traceID': '09031069217559688465-1762138434490-2103112',
}
json_data = {
    'arg': {
        'channelType': 2,
        'collapseType': 0,
        'commentTagId': 0,
        'pageIndex': 1,
        'pageSize': 10,
        'poiId': 80633,
        'sourceType': 1,
        'sortType': 3,
        'starType': 0,
    },
    'head': {
        'cid': '09031069217559688465',
        'ctok': '',
        'cver': '1.0',
        'lang': '01',
        'sid': '8888',
        'syscode': '09',
        'auth': '',
        'xsid': '',
        'extension': [],
    },
}
response = requests.post(
    'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList',
    params=params,
    cookies=cookies,
    headers=headers,
    json=json_data,
    # requests has no default timeout; without one a stalled connection
    # blocks the script forever.
    timeout=10,
)
# Surface HTTP-level failures explicitly instead of attempting to decode an
# error page as JSON.
response.raise_for_status()
print(response.json())

打印数据看看评论内容是否在返回结果中。

文章配图

可以看到返回结果中有我们想要的数据，那接下来就是写数据分析和存储代码了。

分析以及存储代码：

文章配图

import requests
import json
import pandas as pd
from datetime import datetime

def crawlComment():
    """POST the comment query to Ctrip's getCommentCollapseList endpoint.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the server answers with an error
            status code.
    """
    # The four literals below are placeholders: the concrete values are
    # omitted here and must be replaced with those of a valid session
    # before this function is run.
    cookies = { ... }
    headers = { ... }
    params = { ... }
    json_data = { ... }
    response = requests.post(
        'https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList',
        params=params,
        cookies=cookies,
        headers=headers,
        json=json_data,
        # requests has no default timeout; without one a stalled connection
        # blocks the caller forever.
        timeout=10,
    )
    # Fail loudly on an HTTP error rather than decoding an error page.
    response.raise_for_status()
    return response.json()

def extract_comments_from_json(json_data):
    """Flatten the comment items of an API response into a list of dicts.

    Args:
        json_data: Decoded response body; comments are read from
            ``json_data['result']['items']``.

    Returns:
        A list with one dict per comment. Empty when the response carries
        no ``result``/``items``.

    NOTE(review): the listing was damaged from the ``tourist_type`` entry
    onward (keys, field names and defaults were stripped). Everything after
    ``publish_time`` is a reconstruction; confirm the API field names
    against a real response.
    """
    # ``or {}`` / ``or []`` also cover fields that are present but null.
    result = json_data.get('result') or {}
    comments = []
    for item in result.get('items') or []:
        user_info = item.get('userInfo') or {}
        comment_info = {
            'comment_id': item.get('commentId'),
            'user_nick': user_info.get('userNick', ''),
            'user_member': user_info.get('userMember', ''),
            'score': item.get('score', 0),
            'content': item.get('content', ''),
            'publish_time': format_timestamp(item.get('publishTime', '')),
            # --- reconstructed entries (see NOTE above) ---
            'tourist_type': get_tourist_type_display(item.get('touristType', 0)),
            'ip_location': item.get('ipLocatedName', ''),
            'useful_count': item.get('usefulCount', 0),
            'reply_count': item.get('replyCount', 0),
            'from_type': item.get('fromTypeText', ''),
            'image_count': len(item.get('images') or []),
            'recommend_items': item.get('recommendItems') or [],
            'detailed_scores': get_detailed_scores(item.get('scores') or []),
        }
        comments.append(comment_info)
    return comments

 ():
      timestamp_str:
         
    :
         timestamp_str.startswith():
            timestamp_str = timestamp_str.replace(, ).replace(, )
            millis = (timestamp_str.split()[])
            dt = datetime.fromtimestamp(millis / )
             dt.strftime()
    :
        
     timestamp_str

 ():
    tourist_type_map = {
        : ,
        : ,
        : ,
        : ,
        : ,
        : 
    }
     tourist_type_map.get(tourist_type, )

 ():
    scores_detail = {}
     score_item  scores_list:
        name = score_item.get(, )
        score = score_item.get(, )
         name:
            scores_detail[name] = score
     scores_detail

 ():
      comments:
         {}
    total_comments = (comments)
    avg_score = (comment[]  comment  comments) / total_comments
    score_distribution = {}
     comment  comments:
        score = comment[]
        score_range = 
        score_distribution[score_range] = score_distribution.get(score_range, ) + 
    tourist_type_dist = {}
     comment  comments:
        tourist_type = comment[]
        tourist_type_dist[tourist_type] = tourist_type_dist.get(tourist_type, ) + 
    location_dist = {}
     comment  comments:
        location = comment[]  
        location_dist[location] = location_dist.get(location, ) + 
    avg_content_length = ((comment[])  comment  comments) / total_comments
    analysis = {
        : total_comments,
        : (avg_score, ),
        : score_distribution,
        : tourist_type_dist,
        : location_dist,
        : (avg_content_length, ),
        : (comment[]  comment  comments),
        : (comment[]  comment  comments)
    }
     analysis

 ():
    df = pd.DataFrame(comments)
    csv_filename = 
    df.to_csv(csv_filename, index=, encoding=)
    txt_filename = 
     (txt_filename, , encoding=)  f:
        f.write()
        f.write( *  + )
        f.write()
        f.write()
        f.write()
        f.write()
        f.write()
        f.write()
        f.write()
         score_range, count  analysis[].items():
            f.write()
        f.write()
         tourist_type, count  analysis[].items():
            f.write()
        f.write()
         location, count  analysis[].items():
            f.write()
        f.write( +  *  + )
        f.write()
         i, comment  (comments, ):
            f.write()
            f.write()
            f.write()
            f.write()
            f.write()
            f.write()
            f.write()
            f.write()
             comment[]:
                f.write()
                scores_str = .join([  k, v  comment[].items()])
                f.write(scores_str + )
             comment[]:
                f.write()
            f.write()
            f.write( *  + )
     csv_filename, txt_filename

 ():
    ()
    comments = extract_comments_from_json(json_data)
    ()
      comments:
        ()
        
    analysis = analyze_comments_data(comments)
    ()
    ()
    ()
    ()
    ()
     score_range, count  analysis[].items():
        ()
    csv_file, txt_file = save_comments_to_files(comments, analysis)
    ()
    ()
    ()
     comments, analysis

 __name__ == :
    json_data = crawlComment()
    comments, analysis = process_ctrip_comments(json_data)

关于分页

如果只获取了 10 条评论，是因为默认一页只有 10 条数据（即 json_data 中的 pageSize 为 10）。想要爬取其他页的数据，可以修改请求入参 json_data 中的页码参数 pageIndex，其可选范围是 1~300，也就是说一个景区最多能爬 300 × 10 = 3000 条评论数据。

文章配图

3. 总结

本教程演示了从接口定位、请求模拟到数据解析与存储的完整流程。注意实际使用时请确保 Cookie 有效，并遵守相关法律法规及目标网站的 robots 协议。

Python 实现携程景区评论数据爬取与分析

1. 前言

2. 爬虫实现

2.1 定位接口

3. 总结

更多推荐文章

相关免费在线工具

Python 实现携程景区评论数据爬取与分析

1. 前言

2. 爬虫实现

2.1 定位接口

3. 总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具