
Python Web Scraping for Beginners: Collecting Web Data from Scratch

Preface: Why Learn Web Scraping?

In today's era of big data, web data collection (scraping) has become an essential skill in data analysis and artificial intelligence. Whether for market research, competitor analysis, or academic work, scraping helps us gather the information we need efficiently. This article walks Python beginners through the core techniques of web scraping, step by step.

1. Getting Ready to Scrape

1.1 Setting Up the Environment

# Create a virtual environment (recommended)
python -m venv spider_env
source spider_env/bin/activate   # Linux/Mac
spider_env\Scripts\activate      # Windows

# Install the required libraries
pip install requests beautifulsoup4 lxml pandas
# Optional: more powerful parsing and browser-automation libraries
pip install parsel selenium playwright

1.2 Core Scraping Libraries

  • requests: sends HTTP requests
  • BeautifulSoup: parses HTML/XML
  • lxml: a fast parsing backend
  • pandas: data processing and storage
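
The short sketch below shows how these four libraries typically fit together in one pass: requests fetches the page, BeautifulSoup (with lxml as its parser) extracts the fields, and pandas collects them for inspection or export. It targets quotes.toscrape.com, one of the practice sites recommended at the end of this article, and its CSS selectors assume that site's current markup.

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch a practice page built specifically for scraping exercises
resp = requests.get("http://quotes.toscrape.com/", timeout=10)
resp.raise_for_status()

# Parse with the lxml backend and pull out each quote and its author
soup = BeautifulSoup(resp.text, "lxml")
rows = [
    {"quote": q.select_one("span.text").text,
     "author": q.select_one("small.author").text}
    for q in soup.select("div.quote")
]

# Hand the result to pandas for a quick look or a CSV export
df = pd.DataFrame(rows)
print(df.head())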

2. Your First Scraper: Fetching a Web Page

2.1 A Simple GET Request

import requests
from bs4 import BeautifulSoup

def get_webpage(url):
    """Fetch the raw HTML of a page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raise if the request failed
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Try it out on the Douban Movie Top 250 list
url = "https://movie.douban.com/top250"
html = get_webpage(url)
if html:
    print(f"Fetched {len(html)} characters")

2.2 Parsing the HTML

def parse_movie_info(html):
    """Parse movie information out of the page."""
    if not html:
        return []

    soup = BeautifulSoup(html, 'lxml')
    movies = []

    # Find every movie entry
    items = soup.find_all('div', class_='item')

    for item in items[:5]:  # keep only the first 5 movies
        # Title
        title_elem = item.find('span', class_='title')
        title = title_elem.text if title_elem else "No title"

        # Rating
        rating_elem = item.find('span', class_='rating_num')
        rating = rating_elem.text if rating_elem else "No rating"

        # One-line blurb
        quote_elem = item.find('span', class_='inq')
        quote = quote_elem.text if quote_elem else "No blurb"

        movies.append({
            'title': title,
            'rating': rating,
            'quote': quote
        })

    return movies

# Run the parser
movies = parse_movie_info(html)
for i, movie in enumerate(movies, 1):
    print(f"{i}. {movie['title']} - Rating: {movie['rating']}")
    print(f"   Blurb: {movie['quote']}\n")

3. Hands-On Project: Scraping Weather Forecast Data

3.1 Analyzing the Target Site

We will use China Weather Network (weather.com.cn) as the example: http://www.weather.com.cn/weather/101010100.shtml
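
The last path segment of that URL is a numeric city code. As a small, hypothetical convenience (the codes below are the ones listed in the comment of the full script in 3.2), you can keep a lookup table and build the URL from it:

# City codes taken from the comment in the full script below
CITY_CODES = {
    'Beijing': '101010100',
    'Shanghai': '101020100',
    'Guangzhou': '101280101',
}

def build_weather_url(city):
    """Build the 7-day forecast URL for a city in CITY_CODES."""
    return f"http://www.weather.com.cn/weather/{CITY_CODES[city]}.shtml"

print(build_weather_url('Beijing'))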

3.2 Full Scraper Code

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

class WeatherSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'http://www.weather.com.cn/'
        }

    def get_weather(self, city_code='101010100'):
        """Fetch the weather data for the given city."""
        url = f'http://www.weather.com.cn/weather/{city_code}.shtml'

        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.encoding = 'utf-8'

            if response.status_code == 200:
                return self.parse_weather(response.text)
            else:
                print(f"Request failed with status code {response.status_code}")
                return None

        except Exception as e:
            print(f"Error: {e}")
            return None

    def parse_weather(self, html):
        """Parse the weather data."""
        soup = BeautifulSoup(html, 'lxml')
        weather_data = []

        # The 7-day forecast list
        days = soup.select('ul.t.clearfix > li')

        for day in days[:7]:  # keep 7 days of data
            # Date
            date_elem = day.select_one('h1')
            date = date_elem.text if date_elem else "unknown"

            # Weather conditions
            wea_elem = day.select_one('p.wea')
            weather = wea_elem.text if wea_elem else "unknown"

            # Temperatures
            temp_elem = day.select_one('p.tem')
            if temp_elem:
                high_temp = temp_elem.select_one('span').text if temp_elem.select_one('span') else "unknown"
                low_temp = temp_elem.select_one('i').text if temp_elem.select_one('i') else "unknown"
            else:
                high_temp = low_temp = "unknown"

            # Wind direction and strength
            win_elem = day.select_one('p.win')
            if win_elem:
                wind = win_elem.select_one('i').text if win_elem.select_one('i') else "unknown"
            else:
                wind = "unknown"

            weather_data.append({
                'date': date,
                'weather': weather,
                'high_temp': high_temp,
                'low_temp': low_temp,
                'wind': wind
            })

        return weather_data

    def save_to_csv(self, data, filename='weather.csv'):
        """Save the data to a CSV file."""
        if data:
            df = pd.DataFrame(data)
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Data saved to {filename}")
            return True
        return False

# Usage example
if __name__ == "__main__":
    spider = WeatherSpider()

    # City codes: Beijing 101010100, Shanghai 101020100, Guangzhou 101280101
    weather_data = spider.get_weather('101010100')

    if weather_data:
        print("7-day forecast for Beijing:")
        print("=" * 60)
        for day in weather_data:
            print(f"{day['date']:10} {day['weather']:10} "
                  f"temperature: {day['low_temp']}~{day['high_temp']} "
                  f"wind: {day['wind']}")

        # Save the data
        spider.save_to_csv(weather_data)

4. Advanced Techniques and Caveats

4.1 Handling Dynamically Loaded Content

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def dynamic_crawler(url):
    """Use Selenium to render a JavaScript-heavy page."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')     # headless mode
    options.add_argument('--disable-gpu')

    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)
        # Wait until the target element has loaded
        wait = WebDriverWait(driver, 10)
        element = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "content"))
        )

        # Grab the rendered page source
        html = driver.page_source
        return html
    finally:
        driver.quit()

4.2 Coping with Anti-Scraping Measures

import random
import time

import requests

class AntiAntiSpider:
    def __init__(self):
        self.session = requests.Session()
        self.delay_range = (1, 3)  # random delay of 1-3 seconds

    def random_delay(self):
        """Sleep for a random interval so requests are not sent too fast."""
        time.sleep(random.uniform(*self.delay_range))

    def rotate_user_agent(self):
        """Rotate the User-Agent header."""
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ]
        return random.choice(user_agents)

    def make_request(self, url, max_retries=3):
        """Send a request with a simple retry mechanism."""
        for attempt in range(max_retries):
            try:
                headers = {'User-Agent': self.rotate_user_agent()}
                response = self.session.get(url, headers=headers, timeout=15)

                if response.status_code == 200:
                    return response
                elif response.status_code == 429:  # too many requests
                    print("Rate limit hit, waiting before retrying...")
                    time.sleep(10)
                else:
                    print(f"Request failed with status code {response.status_code}")

            except Exception as e:
                print(f"Request error: {e}")

            self.random_delay()

        return None

4.3 Data Storage Options

import json
import sqlite3
from datetime import datetime

class DataStorage:
    """Helpers for persisting scraped data."""

    @staticmethod
    def save_json(data, filename):
        """Save the data as JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"JSON data saved to {filename}")

    @staticmethod
    def save_sqlite(data, db_name='spider_data.db'):
        """Save the data to a SQLite database."""
        conn = sqlite3.connect(db_name)
        cursor = conn.cursor()

        # Create the table if it does not exist yet
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS weather (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                date TEXT,
                weather TEXT,
                high_temp TEXT,
                low_temp TEXT,
                wind TEXT,
                crawl_time TIMESTAMP
            )
        ''')

        # Insert the rows produced by WeatherSpider.parse_weather
        for item in data:
            cursor.execute('''
                INSERT INTO weather (date, weather, high_temp, low_temp, wind, crawl_time)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                item['date'],
                item['weather'],
                item['high_temp'],
                item['low_temp'],
                item['wind'],
                datetime.now()
            ))

        conn.commit()
        conn.close()
        print(f"Data saved to database {db_name}")

5. Ethics and Legal Considerations

5.1 Scraping Etiquette

  • Respect robots.txt: check the target site's robots.txt file
  • Throttle your requests: avoid putting unnecessary load on the server
  • Respect copyright: do not scrape copyright-protected content
  • Protect privacy: do not collect personal or sensitive information

5.2 Checking robots.txt

import urllib.robotparser

def check_robots_permission(url, user_agent='*'):
    """Check whether robots.txt allows fetching this URL."""
    rp = urllib.robotparser.RobotFileParser()
    base_url = '/'.join(url.split('/')[:3])  # scheme + host
    rp.set_url(base_url + "/robots.txt")
    rp.read()

    return rp.can_fetch(user_agent, url)

# Usage example
url = "https://www.example.com/data"
if check_robots_permission(url):
    print("Crawling is allowed")
else:
    print("Crawling is disallowed; please respect the site's rules")

6. Complete Project: A Book Information Scraper

import requests
from bs4 import BeautifulSoup
import csv
import time

class BookSpider:
    def __init__(self):
        self.base_url = "http://books.toscrape.com/"
        self.books_data = []

    def scrape_all_books(self):
        """Scrape information about every book."""
        page = 1
        while True:
            url = f"{self.base_url}catalogue/page-{page}.html"
            print(f"Scraping page {page}...")

            html = self.get_page(url)
            if not html:
                break

            books = self.parse_books(html)
            if not books:
                break

            self.books_data.extend(books)
            page += 1
            time.sleep(1)  # polite delay

        return self.books_data

    def get_page(self, url):
        """Fetch a page."""
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.content
            return None
        except requests.RequestException:
            return None

    def parse_books(self, html):
        """Parse book information."""
        soup = BeautifulSoup(html, 'html.parser')
        books = soup.find_all('article', class_='product_pod')

        book_list = []
        for book in books:
            title = book.h3.a['title']
            price = book.select_one('p.price_color').text
            rating = book.p['class'][1]  # rating word, e.g. "Three"

            book_list.append({
                'title': title,
                'price': price,
                'rating': rating
            })

        return book_list

    def export_data(self, format='csv'):
        """Export the collected data."""
        if format == 'csv':
            with open('books.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=['title', 'price', 'rating'])
                writer.writeheader()
                writer.writerows(self.books_data)
            print(f"Exported {len(self.books_data)} books to books.csv")

# Run the scraper
if __name__ == "__main__":
    spider = BookSpider()
    books = spider.scrape_all_books()

    print(f"\nScraped {len(books)} books in total")
    for i, book in enumerate(books[:5], 1):
        print(f"{i}. {book['title']} - price: {book['price']} - rating: {book['rating']}")

    spider.export_data()

7. Common Problems and Solutions

Q1: What should I do about a 403 Forbidden error?

  • Add appropriate request headers (User-Agent, Referer, etc.)
  • Use proxy IPs
  • Lower the request rate
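
As a quick illustration of the first two points, here is a minimal sketch of a request that sends browser-like headers and goes through a proxy. The target URL and the proxy address are placeholders; substitute a proxy you actually have access to.

import requests

url = "https://www.example.com/data"  # placeholder target

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://www.example.com/'
}

# Placeholder proxy; replace with one you are allowed to use
proxies = {
    'http': 'http://127.0.0.1:8080',
    'https': 'http://127.0.0.1:8080'
}

response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
print(response.status_code)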

Q2: What should I do when the text comes back garbled (encoding problems)?

  • Use response.encoding = response.apparent_encoding
  • Try different encodings: utf-8, gbk, gb2312
  • Use the chardet library to detect the encoding automatically
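
For the third option, a minimal sketch with chardet (install it separately with pip install chardet) might look like this; detection returns a best guess with a confidence score, not a guarantee.

import chardet
import requests

response = requests.get("http://www.weather.com.cn/weather/101010100.shtml", timeout=10)

# Guess the encoding from the raw bytes, then decode with that guess
detected = chardet.detect(response.content)
print(detected)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99, ...}

response.encoding = detected['encoding']
text = response.text
print(text[:200])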

Q3: How can I make my scraper faster?

  • Use concurrent.futures for multithreaded scraping
  • Use aiohttp for asynchronous scraping
  • Still keep reasonable delays so you don't get blocked
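
Here is a minimal multithreading sketch using concurrent.futures; the page list targets the books.toscrape.com practice site from the project in section 6, and the worker count is deliberately small so the server is not hammered.

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

# A handful of practice pages to fetch in parallel
urls = [f"http://books.toscrape.com/catalogue/page-{i}.html" for i in range(1, 6)]

def fetch(url):
    """Fetch one page and return its size (a stand-in for real parsing)."""
    response = requests.get(url, timeout=10)
    return url, len(response.text)

# A small pool of workers fetches the pages concurrently
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(fetch, url) for url in urls]
    for future in as_completed(futures):
        url, size = future.result()
        print(f"{url} -> {size} characters")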

8. Recommended Learning Resources

8.1 Further Study

  • Official documentation:
    • requests: https://docs.python-requests.org/
    • BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
  • Libraries to grow into (a short Playwright sketch follows this list):
    • Scrapy: a full-featured scraping framework
    • Playwright: modern browser automation
    • PyQuery: a jQuery-style parsing library
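
Since Playwright is recommended above but not demonstrated elsewhere in this article, here is a minimal sketch of its synchronous API (after pip install playwright, run playwright install once to download the browsers). It fetches the rendered HTML of a practice page; treat it as a starting point rather than a full recipe.

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    # Launch a headless Chromium, load the page, and grab the rendered HTML
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("http://quotes.toscrape.com/")
    html = page.content()
    browser.close()

print(len(html))
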
8.2 Practice Sites

  • Scraper-friendly practice sites:
    • http://books.toscrape.com/
    • http://quotes.toscrape.com/
    • https://httpbin.org/

Summary

This article has walked through the complete Python scraping workflow from scratch, covering the key steps of making requests, parsing data, dealing with anti-scraping measures, and storing results. Learning to scrape takes a mix of theory and practice, so beginners are encouraged to:

  • Start with small projects and increase the complexity gradually
  • Follow scraping ethics and respect each site's rules
  • Keep learning and follow how the technology evolves
  • Practice often to build hands-on experience

Remember that scraping is not just a technical skill; it also tests your patience and attention to detail. When you run into problems, read the documentation and debug carefully, and you will get there.

Disclaimer: This article is for learning and discussion only. Please comply with applicable laws, regulations, and site policies, and do not use these techniques for illegal purposes.

I hope this tutorial helps you get started with Python web scraping! If you have any questions, feel free to leave a comment for discussion.
