from DrissionPage import ChromiumPage
from datetime import datetime
import csv
import time
import re
def extract_video_info(page):
    """Extract the video title and its '#'-prefixed hashtags.

    Args:
        page: page object exposing ``ele(selector, timeout=...)``
            (e.g. a DrissionPage ChromiumPage).

    Returns:
        tuple[str, list[str]]: the title with hashtags stripped (or the
        raw title when stripping would leave it empty) and the list of
        hashtags found. Falls back to ('未知标题', []) on any failure.
    """
    try:
        heading = page.ele('tag:h1', timeout=5)
        raw_title = heading.text.strip() if heading else '未知标题'
        hashtag_re = re.compile(r'#\S+')
        found_tags = hashtag_re.findall(raw_title)
        # Remove the hashtags; keep the raw title if nothing remains.
        stripped = hashtag_re.sub('', raw_title).strip()
        return (stripped if stripped else raw_title), found_tags
    except Exception as exc:
        print(f"提取视频信息失败:{exc}")
        return '未知标题', []
def extract_author_info(page):
    """Extract the author's name, follower count, and like count.

    Precision-matched version: the author name is located only via the
    exact class + data attribute structure (no fallback selectors).

    Args:
        page: page object exposing ``ele``/``eles`` (DrissionPage-style).

    Returns:
        dict: keys '作者名称', '粉丝数', '获赞数'; each keeps its default
        ('未知' / '0' / '0') when the corresponding element is missing.
    """
    result = {'作者名称': '未知', '粉丝数': '0', '获赞数': '0'}
    try:
        name_node = page.ele('xpath://div[@class="q5XQ42ql" and @data-click-from="title"]', timeout=3)
        if name_node:
            name_text = name_node.text.strip()
            if name_text:
                result['作者名称'] = name_text
        # Spans whose text mentions followers or likes; the count is
        # either embedded in the same span or in its next sibling.
        spans = page.eles('xpath://span[contains(text(), "粉丝") or contains(text(), "获赞") or contains(text(), "赞")]')
        for span in spans:
            label = span.text.strip()
            if '粉丝' in label:
                count = label.replace('粉丝', '').strip()
                if count.isdigit() or '万' in label or '亿' in label:
                    result['粉丝数'] = count
                else:
                    sibling = span.next()
                    if sibling and sibling.text.strip():
                        result['粉丝数'] = sibling.text.strip()
            elif '获赞' in label or '赞' in label:
                count = label.replace('获赞', '').replace('赞', '').strip()
                if count.isdigit() or '万' in label or '亿' in label:
                    result['获赞数'] = count
                else:
                    sibling = span.next()
                    if sibling and sibling.text.strip():
                        result['获赞数'] = sibling.text.strip()
    except Exception as exc:
        print(f"提取作者信息失败:{exc}")
    return result
def main():
    """Scrape Douyin video comments into douyin_comments.csv.

    Flow: prompt for a video URL (default demo link on empty input),
    open it in Chromium, capture the 'comment/list/' XHR responses via
    the network listener, and page through comments by clicking the
    next-page element, writing one CSV row per comment.
    """
    video_url = input("请输入抖音视频链接(回车使用默认链接):").strip()
    if not video_url:
        # Fall back to a demo link when the user just presses Enter.
        video_url = 'https://v.douyin.com/y5R-HvKi_vE'
    # utf-8-sig so Excel auto-detects the encoding; newline='' per the
    # csv module's requirement for writer objects.
    with open('douyin_comments.csv', mode='w', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=[
            '视频标题', '视频标签', '作者名称', '作者粉丝数', '作者获赞数',
            '昵称', '地区', '日期', '评论'
        ])
        csv_writer.writeheader()
        dp = ChromiumPage()
        success_page_count = 0
        # Defaults used if extraction fails before the loop starts.
        video_title = '未知标题'
        video_tags = []
        author_info = {'作者名称': '未知', '粉丝数': '0', '获赞数': '0'}
        try:
            # Start listening for the comment-list XHR BEFORE navigating,
            # so the first response is not missed.
            dp.listen.start('comment/list/')
            dp.get(video_url)
            time.sleep(8)  # let the page render before scraping metadata
            video_title, video_tags = extract_video_info(dp)
            author_info = extract_author_info(dp)
            print(f"\n=== 开始爬取 ===")
            print(f"视频标题:{video_title}")
            print(f"视频标签:{','.join(video_tags) if video_tags else '无'}")
            print(f"作者名称:{author_info['作者名称']}")
            print(f"作者粉丝数:{author_info['粉丝数']}")
            print(f"作者获赞数:{author_info['获赞数']}")
            print(f"================\n")
            page_num = 1
            has_next_page = True
            while has_next_page:
                print(f'正在采集第 {page_num} 页的数据内容')
                # Wait for the next comment-list network packet.
                resp = dp.listen.wait(timeout=15)
                if not resp:
                    # No packet yet: try to trigger loading by scrolling,
                    # then wait once more before giving up.
                    print(f"第 {page_num} 页等待数据包超时,尝试滚动加载...")
                    dp.scroll.to_bottom()
                    time.sleep(2)
                    resp = dp.listen.wait(timeout=5)
                    if not resp:
                        print(f"第 {page_num} 页仍无数据包,终止爬取")
                        break
                try:
                    json_data = resp.response.body
                    comments = json_data.get('comments', [])
                    if not comments:
                        print(f"第 {page_num} 页无评论数据,终止爬取")
                        break
                    for index in comments:
                        try:
                            # create_time is a Unix timestamp; 0 means unknown.
                            create_time = index.get('create_time', 0)
                            if create_time == 0:
                                date = '未知时间'
                            else:
                                date = str(datetime.fromtimestamp(create_time))
                            # Prefer ip_label; fall back to client_info.province.
                            region = index.get('ip_label', '')
                            if not region:
                                ip_client_info = index.get('client_info', {})
                                region = ip_client_info.get('province', '未知')
                        except KeyError as e:
                            print(f"处理单个评论数据出现异常,异常信息:{e},跳过该评论")
                            continue
                        dit = {
                            '视频标题': video_title,
                            '视频标签': ','.join(video_tags),
                            '作者名称': author_info['作者名称'],
                            '作者粉丝数': author_info['粉丝数'],
                            '作者获赞数': author_info['获赞数'],
                            '昵称': index.get('user', {}).get('nickname', '未知'),
                            '地区': region,
                            '日期': date,
                            '评论': index.get('text', ''),
                        }
                        try:
                            csv_writer.writerow(dit)
                            print(dit)
                        except Exception as e:
                            print(f"写入 CSV 文件出现异常,异常信息:{e},跳过该数据")
                    success_page_count += 1
                    # Pagination control located by its CSS class; this
                    # class name is site-generated and may change.
                    next_page = dp.ele('css:.Rcc71LyU', timeout=3)
                    if not next_page:
                        print("未找到下一页元素,终止爬取")
                        break
                    try:
                        dp.scroll.to_see(next_page)
                        time.sleep(1)
                        next_page.click()
                        page_num += 1
                        time.sleep(3)  # let the next page of comments load
                    except Exception as e:
                        print(f"滚动/点击下一页按钮失败,异常信息:{e},终止爬取")
                        break
                except Exception as e:
                    print(f"第 {page_num} 页数据处理出现异常,异常信息:{e},终止爬取")
                    break
            print(f"\n=== 爬取结束 ===")
            print(f"共采集了 {success_page_count} 页评论数据")
            print(f"数据已保存到:douyin_comments.csv")
        except Exception as e:
            print(f"爬取过程中出现致命异常:{e}")
        finally:
            # Always close the browser, even on fatal errors.
            dp.quit()
            print("浏览器已关闭")
if __name__ == '__main__':
    main()