import requests
import json
import time
import random
import pandas as pd
from fake_useragent import UserAgent
from jsonpath import jsonpath
from urllib.parse import urlencode
class XiaoHongShuCrawler:
    """Crawler for Xiaohongshu ("Little Red Book") notes via the web API.

    Uses a logged-in session cookie supplied by the caller to search notes
    by keyword and fetch per-note details. Parsed records accumulate in
    ``self.all_note_data`` and can be exported to CSV with ``save_data``.
    """

    def __init__(self, cookie):
        """
        Initialize the crawler session.

        :param cookie: Cookie string from a logged-in Xiaohongshu web session.
        """
        self.ua = UserAgent()
        # Browser-like headers; the cookie carries the login state.
        self.headers = {
            'User-Agent': self.ua.random,
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.xiaohongshu.com/',
            'Cookie': cookie,
            'Origin': 'https://www.xiaohongshu.com',
            'X-Requested-With': 'XMLHttpRequest',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin'
        }
        self.all_note_data = []  # accumulated note dicts across crawl runs
        self.base_api_url = 'https://edith.xiaohongshu.com/api/sns/web/v1/feed'

    @staticmethod
    def _first(json_data, path, default=None):
        """Return the first jsonpath match of *path* in *json_data*, or *default*.

        Evaluates each jsonpath expression exactly once (the original code ran
        every expression twice: once for the truthiness test, once for the
        value).
        """
        matches = jsonpath(json_data, path)
        return matches[0] if matches else default

    @staticmethod
    def _to_int(value):
        """Best-effort int conversion; returns 0 for missing/non-numeric values.

        Counts may come back as display strings (e.g. '1.2万'); previously a
        failing int() aborted parsing of the whole note via the broad except.
        """
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def get_note_detail(self, note_id):
        """
        Fetch and parse the detail payload of a single note.

        :param note_id: note ID string
        :return: dict of parsed note fields, or None on any failure
        """
        try:
            params = {
                'note_id': note_id,
                'source': 'web',
                'timestamp': int(time.time() * 1000)
            }
            # Randomized pause to stay under anti-crawl rate limits.
            time.sleep(random.uniform(3, 8))
            response = requests.get(
                url=f'{self.base_api_url}/detail?{urlencode(params)}',
                headers=self.headers,
                timeout=20
            )
            response.raise_for_status()
            json_data = response.json()
            if json_data.get('success') is not True:
                print(f"笔记{note_id}接口返回失败:{json_data.get('msg', '未知错误')}")
                return None
            note_data = {
                'note_id': note_id,
                'note_title': self._first(json_data, '$..title', '无标题'),
                'note_content': self._first(json_data, '$..content', '无正文'),
                'author_name': self._first(json_data, '$..nickname', '未知作者'),
                'like_count': self._to_int(self._first(json_data, '$..like_count')),
                'collect_count': self._to_int(self._first(json_data, '$..collect_count')),
                'comment_count': self._to_int(self._first(json_data, '$..comment_count')),
            }
            tags = jsonpath(json_data, '$..tags[*].name') or []
            note_data['tag_list'] = ' | '.join(tags) if tags else '无标签'
            publish_ts = self._first(json_data, '$..create_time', 0)
            note_data['publish_time'] = (
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(publish_ts))
                if publish_ts else '未知时间'
            )
            note_data['note_url'] = f'https://www.xiaohongshu.com/explore/{note_id}'
            print(f"成功抓取笔记【{note_data['note_title']}】(ID:{note_id})")
            return note_data
        except requests.exceptions.RequestException as e:
            print(f"笔记{note_id}请求失败:{e}")
            return None
        except Exception as e:
            # Defensive: a malformed payload should skip this note, not kill the run.
            print(f"笔记{note_id}解析失败:{e}")
            return None

    def crawl_note_list(self, keyword='穿搭', page_num=3, page_size=20):
        """
        Search notes by keyword and crawl each hit's detail payload.

        :param keyword: search keyword (default: '穿搭', "outfit")
        :param page_num: number of result pages to walk
        :param page_size: notes requested per page
        """
        try:
            for page in range(1, page_num + 1):
                params = {
                    'keyword': keyword,
                    'page': page,
                    'page_size': page_size,
                    'sort': 'popular',
                    'timestamp': int(time.time() * 1000)
                }
                # Longer pause between search pages than between detail fetches.
                time.sleep(random.uniform(5, 10))
                response = requests.get(
                    url=f'{self.base_api_url}/search?{urlencode(params)}',
                    headers=self.headers,
                    timeout=20
                )
                response.raise_for_status()
                json_data = response.json()
                if json_data.get('success') is not True:
                    print(f"第{page}页搜索失败:{json_data.get('msg', '未知错误')}")
                    continue
                note_ids = jsonpath(json_data, '$..note_id') or []
                if not note_ids:
                    # Empty page means we've exhausted the results.
                    print(f"第{page}页未获取到笔记 ID,结束抓取")
                    break
                print(f"第{page}页获取到{len(note_ids)}篇笔记 ID")
                for note_id in note_ids:
                    note_data = self.get_note_detail(note_id)
                    if note_data:
                        self.all_note_data.append(note_data)
        except Exception as e:
            # Top-level boundary for a crawl run: report and stop cleanly.
            print(f"搜索列表抓取失败:{e}")

    def save_data(self, save_path='xiaohongshu_fashion_notes.csv'):
        """
        Deduplicate collected notes and write them to CSV.

        :param save_path: output CSV path (utf-8-sig so Excel opens it cleanly)
        :return: the deduplicated DataFrame, or None when nothing was collected
        """
        if not self.all_note_data:
            print("无有效笔记数据可保存")
            return None
        df = pd.DataFrame(self.all_note_data)
        # Keep the latest record per note_id (later fetches supersede earlier ones).
        df = df.drop_duplicates(subset=['note_id'], keep='last')
        # Flatten newlines so each note occupies a single visual row in the CSV.
        df['note_content'] = df['note_content'].str.replace('\n', ' ')
        df.to_csv(save_path, index=False, encoding='utf-8-sig')
        print(f"小红书穿搭笔记数据已保存至:{save_path}")
        print(f"共抓取{len(df)}篇有效笔记")
        return df
if __name__ == '__main__':
    # Replace with a real cookie captured from a logged-in browser session.
    XHS_COOKIE = '你的小红书登录 Cookie 字符串'
    crawler = XiaoHongShuCrawler(cookie=XHS_COOKIE)
    crawler.crawl_note_list(keyword='穿搭', page_num=3, page_size=20)
    result_df = crawler.save_data()
    # save_data() returns None when no notes were collected; the original
    # unconditionally called result_df.head() and crashed with AttributeError.
    if result_df is not None:
        print("\n=== 小红书穿搭笔记数据抓取结果(前 5 条)===")
        print(result_df.head().to_string(index=False))