Python 爬虫核心技术原理与实战解析

Python 爬虫核心技术原理与实战解析 | 极客日志

GET /path/to/resource HTTP/1.1
Host: www.example.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
...

import requests

# Fetch a page and print its body. Network failures and HTTP error
# statuses are reported on stdout instead of propagating.
url = 'http://www.example.com'
try:
    response = requests.get(url, timeout=5)
    # Turn 4xx/5xx answers into exceptions so that an error page is not
    # printed as if it were the requested content.
    response.raise_for_status()
except requests.RequestException as e:
    # RequestException is the common base of connection errors, timeouts
    # and HTTP status errors raised by requests.
    print(f"Request failed: {e}")
else:
    # Only reached when the request succeeded.
    print(response.text)

HTTP/1.1 200 OK
Content-Type: text/html; charset=utf-8
Content-Length: 1234
...
<html>
...
</html>

import requests

# Fetch a page and show the three main parts of the HTTP response:
# status code, headers and body.
url = 'http://www.example.com'
try:
    # Without a timeout requests waits indefinitely for a stalled server.
    response = requests.get(url, timeout=5)
except requests.RequestException as e:
    # Covers connection failures and timeouts raised by requests.
    print(f"Error: {e}")
else:
    # No raise_for_status() here on purpose: the status code itself is
    # part of what this snippet displays, including error statuses.
    print(response.status_code)
    print(response.headers)
    print(response.text)

import re

# Sample markup containing a single anchor element.
html = '<a href="http://example.com">Example</a>'

# Group 1 captures the href value, group 2 the visible link text.
anchor_pattern = re.compile(r'<a href="([^"]+)">([^<]+)</a>')
links = anchor_pattern.findall(html)

# findall returns one (href, text) tuple per match.
for target, label in links:
    print(target, label)

from lxml import etree

# Sample markup containing a single anchor element.
html = '<a href="http://www.example.com">Example</a>'

# Parse the markup into an element tree and select every <a> element.
tree = etree.HTML(html)
links = tree.xpath('//a')

# Print each anchor's href attribute followed by its text.
for anchor in links:
    target = anchor.get('href')
    label = anchor.text
    print(target, label)

from bs4 import BeautifulSoup

# Sample markup containing a single anchor element.
html = '<a href="http://www.example.com">Example</a>'

# Parse with the parser named 'html.parser' and collect every <a> tag.
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')

# Print each anchor's href attribute followed by its text.
for anchor in links:
    target = anchor.get('href')
    label = anchor.text
    print(target, label)

# Select only anchors that carry an href attribute. Subscripting a tag
# with link['href'] raises KeyError when the attribute is absent, so
# anchors without one (e.g. <a name="top">) are filtered out up front.
links = soup.select('a[href]')
for link in links:
    # href and text stay module-level names; the CSV snippet that follows
    # in this file reads them.
    href = link['href']
    text = link.get_text()
    print(href, text)

import csv

# Rows to persist: a header row followed by one record built from the
# href/text values extracted earlier in the file.
header_row = ['url', 'text']
data = [header_row, [href, text]]

# newline='' is passed to open() so the csv module controls line endings.
with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows(data)

import scrapy

class MySpider(scrapy.Spider):
    """Skeleton spider whose start_urls list holds http://www.example.com."""

    name = 'example.com'
    start_urls = ['http://www.example.com']

    def parse(self, response):
        """Handle a downloaded response.

        This is the place to process the response, extract data and issue
        further requests; the skeleton does none of that yet.
        """
        return None

import requests
from bs4 import BeautifulSoup

# Download a page with requests and hand the body to BeautifulSoup.
url = 'http://www.example.com'
# Without a timeout requests waits indefinitely for a stalled server.
response = requests.get(url, timeout=5)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were
# the requested document.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Process the page and extract data here.

from selenium import webdriver

# Drive a real Chrome browser to load the page.
# NOTE(review): newer Selenium releases expect the driver path to be
# supplied through a Service object rather than positionally — confirm
# against the installed Selenium version.
driver = webdriver.Chrome('path/to/chromedriver')
try:
    driver.get('http://www.example.com')
    # Process the page and extract data here.
finally:
    # Always shut the browser down, even when loading or processing
    # raises; otherwise the browser process is left running.
    driver.quit()

# Send a browser-like User-Agent with the request.
# The trailing token is "Safari/537.36"; the original sample had it
# truncated to "Safari/537.3".
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
# Without a timeout requests waits indefinitely for a stalled server.
response = requests.get(url, headers=headers, timeout=5)

import requests

# Log in and then fetch a page that requires the login cookies.
login_url = 'https://example.com/login'
data_url = 'https://example.com/data'

# Placeholder credentials; replace with real values before running.
credentials = {
    'username': 'your_username',
    'password': 'your_password'
}

# A Session keeps one cookie jar across requests, including cookies set on
# intermediate redirect responses during login. Reusing only
# response.cookies from the final login response would lose those.
with requests.Session() as session:
    response = session.post(login_url, data=credentials, timeout=10)
    # Stop here if the login request itself failed.
    response.raise_for_status()

    # Cookies stored by the login are sent automatically on this request.
    response = session.get(data_url, timeout=10)
    response.raise_for_status()
    data = response.text

import requests
from bs4 import BeautifulSoup
import time

# Scrape article titles, authors and links from the jianshu.com front page
# and store them in jianshu_articles.csv.
import csv
import sys
from urllib.parse import urljoin

# Send the HTTP request.
url = 'https://www.jianshu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
try:
    response = requests.get(url, headers=headers, timeout=10)
    # Treat 4xx/5xx answers as failures instead of parsing an error page.
    response.raise_for_status()
except requests.RequestException as e:
    print(f"Failed to fetch page: {e}")
    # sys.exit is used instead of the site-provided exit(), and a non-zero
    # status tells the caller that the run failed.
    sys.exit(1)
html = response.text

# Parse the HTML content.
soup = BeautifulSoup(html, 'html.parser')

# Extract the data.
articles = soup.select('.note-list li')
data = []

# This loop only walks the already-downloaded document and issues no
# requests, so it carries no per-item delay. Rate limiting belongs between
# HTTP requests.
for article in articles:
    title_elem = article.select_one('a.title')
    author_elem = article.select_one('.name')
    if title_elem is None or author_elem is None:
        # Skip list items that are not article entries.
        continue

    link = title_elem.get('href')
    if not link:
        # A title anchor without a target cannot yield a usable link.
        continue

    # get_text also collects text from nested tags, where .string would
    # be None.
    title = title_elem.get_text(strip=True)
    author = author_elem.get_text(strip=True)
    # urljoin resolves relative links against the site URL and leaves
    # absolute links intact.
    href = urljoin(url, link)

    data.append([title, author, href])

# Store the data.
with open('jianshu_articles.csv', 'w', newline='', encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Author', 'Link'])
    writer.writerows(data)
print("Data saved successfully.")

Python 爬虫核心技术原理与实战解析

导言

1. HTTP 请求与响应

1.1 HTTP 请求

1.2 HTTP 响应

2. 网页解析技术

2.1 正则表达式

2.2 XPath

2.3 BeautifulSoup

2.4 提取数据

2.5 数据存储与再处理

3. 爬虫框架

3.1 Scrapy

3.2 BeautifulSoup + requests

3.3 Selenium

4. 其他关键技术

4.1 User-Agent 伪装

4.2 反爬虫策略与解决方法

4.3 网页登录与 Session 管理

4.4 Robots 协议与合规性

5. 实例：爬取简书网站文章信息

结语

更多推荐文章

相关免费在线工具

Python 爬虫核心技术原理与实战解析

导言

1. HTTP 请求与响应

1.1 HTTP 请求

1.2 HTTP 响应

2. 网页解析技术

2.1 正则表达式

2.2 XPath

2.3 BeautifulSoup

2.4 提取数据

2.5 数据存储与再处理

3. 爬虫框架

3.1 Scrapy

3.2 BeautifulSoup + requests

3.3 Selenium

4. 其他关键技术

4.1 User-Agent 伪装

4.2 反爬虫策略与解决方法

4.3 网页登录与 Session 管理

4.4 Robots 协议与合规性

5. 实例：爬取简书网站文章信息

结语

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具