Audience: developers from complete beginner to intermediate level
Goal: systematically master practical Python skills for working with HTML: parsing, cleaning, extraction, generation, and automation.
# Introduction and Environment Setup
Working with HTML in Python typically involves fetching (downloading the page), parsing (extracting structured content), cleaning (removing noise and normalizing), and generation (producing HTML or rendering from templates).
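A minimal sketch of how these steps fit together (the URL is a placeholder and the libraries are the ones installed in the next section):
| import requests |
| from bs4 import BeautifulSoup |
| |
| # Fetch: download the page |
| resp = requests.get("https://example.com", timeout=10) |
| resp.raise_for_status() |
| |
| # Parse: build a document tree |
| soup = BeautifulSoup(resp.text, "lxml") |
| |
| # Clean: drop script and style noise |
| for tag in soup(["script", "style"]): |
|     tag.decompose() |
| |
| # Generate: emit a tiny HTML summary |
| title = soup.title.get_text(strip=True) if soup.title else "" |
| print(f"<h1>{title}</h1>") |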
# Installation
| pip install requests beautifulsoup4 lxml |
If you need to handle dynamic pages:
| pip install playwright |
| python -m playwright install |
# Version Recommendations
- Python 3.9 or newer is recommended
- Recommended parser pairing: BeautifulSoup + lxml
# Verify the Installation
| python -c "import bs4, lxml, requests; print('ok')" |
# Core Concepts
- HTML document tree (DOM): nested tags form a tree structure
- Selectors: locate elements by tag, attribute, and position in the hierarchy
- Encoding: page content is not always UTF-8
- Cleaning: remove noise such as scripts, styles, and ads
# Quick Method Reference (Beginner Essentials)
- soup.find() / soup.find_all(): find elements by tag
- soup.select(): find elements with CSS selectors
- tag.get_text(): get the text content
- tag.get(): get an attribute value
- tag.decompose(): delete a node
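A quick demonstration of these calls on an inline snippet (the markup is invented for illustration):
| from bs4 import BeautifulSoup |
| |
| html = '<div class="card"><a href="/a">Link</a><script>x()</script></div>' |
| soup = BeautifulSoup(html, "lxml") |
| |
| link = soup.find("a")                  # first matching tag |
| print(soup.find_all("a"))              # list of every <a> tag |
| print(soup.select("div.card a"))       # CSS selector lookup |
| print(link.get_text(strip=True))       # -> "Link" |
| print(link.get("href"))                # -> "/a" |
| soup.find("script").decompose()        # remove the <script> node |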
# A First HTML Parser: Extracting the Title and Paragraphs
| from bs4 import BeautifulSoup |
| |
| html = """ |
| <html> |
| <head><title>Example</title></head> |
| <body> |
| <h1>Welcome</h1> |
| <p>This is the first paragraph.</p> |
| <p>This is the second paragraph.</p> |
| </body> |
| </html> |
| """ |
| |
| soup = BeautifulSoup(html, "lxml") |
| print("标题:", soup.title.get_text()) |
| for p in soup.find_all("p"): |
| print("段落:", p.get_text(strip=True)) |
# HTML Parsing Basics
# BeautifulSoup Core Objects
| soup = BeautifulSoup(html, "lxml") |
| print(type(soup)) |
| print(soup.body) |
| print(soup.body.children) |
Common node types and usage:
- BeautifulSoup: the root of the whole parsed document tree and the entry point for selectors.
- Tag: a concrete element node; supports attribute access (e.g. tag["href"]) and structural operations (e.g. unwrap()).
- NavigableString: a text node; read it via str() or get_text().
- Iteration: tag.children returns an iterator, so materialize it with list(tag.children) if you need to reuse it; tag.descendants walks every nested descendant.
- Parsers: lxml is fast and tolerant of malformed markup; "html.parser" is a fallback worth trying when parsing misbehaves.
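A small sketch illustrating these node types and the parser fallback (the markup is invented for the example):
| from bs4 import BeautifulSoup, NavigableString, Tag |
| |
| html = "<div><a href='/x'>text</a> tail</div>" |
| try: |
|     soup = BeautifulSoup(html, "lxml") |
| except Exception: |
|     soup = BeautifulSoup(html, "html.parser")  # fallback parser |
| |
| div = soup.div |
| print(isinstance(div, Tag))                     # True |
| for child in list(div.children):                # materialize the iterator |
|     if isinstance(child, NavigableString): |
|         print("text node:", str(child).strip()) |
|     else: |
|         print("tag node:", child.name, child.get("href")) |
| print([d.name for d in div.descendants if isinstance(d, Tag)]) |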
# Common Selectors
| soup.find("h1") |
| soup.find_all("p") |
| soup.find("div", class_="card") |
| soup.find_all("a", attrs={"data-id": "100"}) |
# Working with Tags and Attributes
| tag = soup.find("a") |
| print(tag["href"]) |
| print(tag.get("href")) |
| print(tag.get_text(strip=True)) |
# Text Cleaning and Normalization
| text = soup.get_text(" ", strip=True) |
| text = " ".join(text.split()) |
# Advanced Selectors
# CSS Selectors
| soup.select("div.card") |
| soup.select("ul > li") |
| soup.select("a[href^='https']") |
# XPath (lxml)
| from lxml import etree |
| |
| tree = etree.HTML(html) |
| print(tree.xpath("//h1/text()")) |
| print(tree.xpath("//a[@class='link']/@href")) |
# Combining Regular Expressions with Selectors
| import re |
| soup.find_all("a", href=re.compile(r"/item/\d+")) |
# Fetching Pages and Handling Encoding
# requests Basics
| import requests |
| |
| resp = requests.get("https://example.com", timeout=10) |
| resp.raise_for_status() |
| html = resp.text |
# Detecting and Fixing Encoding
| resp = requests.get(url) |
| resp.encoding = resp.apparent_encoding |
| html = resp.text |
# Anti-Scraping and Politeness
- Set a User-Agent header
- Throttle request frequency (time.sleep)
- Respect the site's robots.txt (a polite-fetch sketch follows the example below)
| headers = {"User-Agent": "Mozilla/5.0"} |
| requests.get(url, headers=headers, timeout=10) |
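A sketch of a polite fetch loop that checks robots.txt and throttles requests (the URLs are placeholders):
| import time |
| import requests |
| from urllib.robotparser import RobotFileParser |
| |
| rp = RobotFileParser() |
| rp.set_url("https://example.com/robots.txt") |
| rp.read() |
| |
| headers = {"User-Agent": "Mozilla/5.0"} |
| for url in ["https://example.com/1", "https://example.com/2"]: |
|     if not rp.can_fetch(headers["User-Agent"], url): |
|         continue                                # skip disallowed paths |
|     resp = requests.get(url, headers=headers, timeout=10) |
|     time.sleep(1)                               # throttle between requests |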
# Extracting Tables and Lists
# HTML Tables to Structured Data
| from bs4 import BeautifulSoup |
| |
| soup = BeautifulSoup(html, "lxml") |
| table = soup.find("table") |
| rows = [] |
| for tr in table.find_all("tr"): |
|     cells = [td.get_text(strip=True) for td in tr.find_all(["th", "td"])] |
|     if cells: |
|         rows.append(cells) |
| print(rows) |
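If pandas is installed, pandas.read_html offers a shortcut for well-formed tables (it relies on lxml or html5lib under the hood); a brief sketch:
| from io import StringIO |
| import pandas as pd |
| |
| # Returns a list of DataFrames, one per <table> found in the markup |
| tables = pd.read_html(StringIO(html)) |
| print(tables[0].head()) |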
# Lists and Card-Style Structures
| items = [] |
| for card in soup.select("div.card"): |
|     title = card.select_one(".title").get_text(strip=True) |
|     price = card.select_one(".price").get_text(strip=True) |
|     items.append({"title": title, "price": price}) |
# Cleaning HTML and Extracting Content
# Removing Noise and Extracting Body Text
| for tag in soup(["script", "style", "noscript"]): |
|     tag.decompose() |
| |
| text = soup.get_text(" ", strip=True) |
# Keeping / Removing Tags
| |
| allowed = {"p", "h1", "h2"} |
| for tag in soup.find_all(True): |
|     if tag.name not in allowed: |
|         tag.unwrap() |
# Generating HTML and Templates
# Using String Templates
| from string import Template |
| |
| html = Template("<h1>$title</h1><p>$content</p>").substitute( |
| title="标题", |
| content="内容" |
| ) |
# Rendering with Jinja2
| from jinja2 import Template |
| |
| tpl = Template(""" |
| <h1>{{ title }}</h1> |
| <ul> |
| {% for item in items %}<li>{{ item }}</li>{% endfor %} |
| </ul> |
| """) |
| |
| print(tpl.render(title="List", items=["a", "b", "c"])) |
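If the rendered values may contain untrusted input, it is safer to enable autoescaping; a minimal sketch using Jinja2's Environment (the template string is illustrative):
| from jinja2 import Environment, select_autoescape |
| |
| env = Environment(autoescape=select_autoescape()) |
| tpl = env.from_string("<p>{{ user_input }}</p>") |
| print(tpl.render(user_input="<script>alert(1)</script>"))  # the script tag is escaped |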
# Handling Dynamic Pages
# Selenium Basics
| from selenium import webdriver |
| from selenium.webdriver.common.by import By |
| |
| driver = webdriver.Chrome() |
| driver.get("https://example.com") |
| html = driver.page_source |
| print(driver.find_element(By.CSS_SELECTOR, "h1").text) |
| driver.quit() |
# Playwright (Recommended)
| from playwright.sync_api import sync_playwright |
| |
| with sync_playwright() as p: |
|     browser = p.chromium.launch(headless=True) |
|     page = browser.new_page() |
|     page.goto("https://example.com") |
|     html = page.content() |
|     print(page.locator("h1").inner_text()) |
|     browser.close() |
# Concurrent Fetching and Task Orchestration
# Thread Pools and Connection Reuse
| import requests |
| from concurrent.futures import ThreadPoolExecutor |
| |
| session = requests.Session() |
| |
| def fetch(url): |
|     r = session.get(url, timeout=10) |
|     r.raise_for_status() |
|     return r.text |
| |
| urls = ["https://example.com/1", "https://example.com/2"] |
| with ThreadPoolExecutor(max_workers=5) as ex: |
|     pages = list(ex.map(fetch, urls)) |
# Asynchronous Crawling (aiohttp)
| import aiohttp |
| import asyncio |
| |
| async def fetch(session, url): |
|     async with session.get(url) as resp: |
|         resp.raise_for_status() |
|         return await resp.text() |
| |
| async def main(urls): |
|     async with aiohttp.ClientSession() as session: |
|         tasks = [fetch(session, u) for u in urls] |
|         return await asyncio.gather(*tasks) |
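A usage sketch for driving the coroutine from synchronous code (the URLs are placeholders):
| urls = ["https://example.com/1", "https://example.com/2"] |
| pages = asyncio.run(main(urls)) |
| print(len(pages)) |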
# Storage and Export
# CSV / JSON Export
| import csv, json |
| |
| with open("data.csv", "w", newline="", encoding="utf-8") as f: |
|     writer = csv.writer(f) |
|     writer.writerows(rows) |
| |
| with open("data.json", "w", encoding="utf-8") as f: |
|     json.dump(items, f, ensure_ascii=False, indent=2) |
# Writing to a Database (SQLite)
| import sqlite3 |
| |
| conn = sqlite3.connect("data.db") |
| cur = conn.cursor() |
| cur.execute("CREATE TABLE IF NOT EXISTS items (title TEXT, price TEXT)") |
| cur.executemany("INSERT INTO items VALUES (?, ?)", [(i["title"], i["price"]) for i in items]) |
| conn.commit() |
| conn.close() |
# Browser Preview and Auto-Open (webbrowser)
# Basic Usage
| import webbrowser |
| |
| |
| webbrowser.open("https://example.com") |
Note: webbrowser.open() only asks the system to launch a browser and does not guarantee that the page loads; the True/False return value indicates whether the open request was dispatched successfully.
# New Tab / New Window
| import webbrowser |
| |
| url = "https://example.com" |
| webbrowser.open(url, new=2)  # open in a new tab where possible |
| webbrowser.open(url, new=1)  # open in a new window where possible |
# Specifying / Registering a Browser
| import webbrowser |
| |
| |
| try: |
|     browser = webbrowser.get("chrome") |
| except webbrowser.Error: |
|     browser = webbrowser.get() |
| |
| browser.open("https://example.com", new=2) |
On Windows, if you specifically need Edge, you can register it manually (adjust the path to the actual installation location):
| import webbrowser |
| from webbrowser import BackgroundBrowser |
| |
| edge_path = r"C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe" |
| webbrowser.register("edge", None, BackgroundBrowser(edge_path)) |
| webbrowser.get("edge").open("https://example.com", new=2) |
Tip: some Python versions already ship a built-in edge controller; if an error is raised, use the register approach above or switch to windows-default.
# Previewing a Local HTML File
| from pathlib import Path |
| import webbrowser |
| |
| html_path = Path("output.html").resolve() |
| webbrowser.open(html_path.as_uri(), new=2) |
You can also generate a temporary file and preview it:
| from tempfile import NamedTemporaryFile |
| from pathlib import Path |
| import webbrowser |
| |
| with NamedTemporaryFile("w", suffix=".html", delete=False, encoding="utf-8") as tmp: |
| tmp.write("<h1>预览</h1><p>Hello, webbrowser!</p>") |
| temp_uri = Path(tmp.name).resolve().as_uri() |
| |
| webbrowser.open(temp_uri, new=2) |
# Command-Line Usage
| python -m webbrowser -t "https://example.com" |
| python -m webbrowser -n "https://example.com" |
# Notes and Caveats
- Return value: open() returns True/False, which only indicates whether the browser launch was dispatched.
- Environment limits: headless environments (servers / containers) may be unable to open a browser.
- Path safety: for local files, prefer Path.as_uri() to avoid problems with spaces and encoding.
- Browser choice: get("chrome"/"firefox"/"windows-default") depends on the OS and what is installed; fall back to the default or register manually when unavailable.
- Side effects: auto-opening a browser may trigger login or privacy-related actions, so be cautious about calling it in bulk from scripts.
# Debugging and Troubleshooting Tips
- Print soup.prettify() to inspect the parsed structure (see the sketch after this list)
- If a selector does not match, check the hierarchy and whether the content is loaded dynamically
- If parsing returns nothing, check the encoding and login state first
- Use the browser DevTools to copy CSS/XPath selectors
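A short sketch of these checks, assuming soup was built earlier and .news-item is an illustrative selector:
| # Inspect the parsed structure |
| print(soup.prettify()[:500]) |
| |
| # An empty result usually means a wrong selector, dynamic loading, or bad encoding |
| matches = soup.select(".news-item") |
| if not matches: |
|     print("Selector matched nothing; compare against DevTools and the raw page source") |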
# Small Project Examples (Complete Code)
# Project 1: News List Extractor
| import requests |
| from bs4 import BeautifulSoup |
| |
| url = "https://example.com/news" |
| html = requests.get(url, timeout=10).text |
| soup = BeautifulSoup(html, "lxml") |
| |
| items = [] |
| for li in soup.select("ul.news li"): |
|     title = li.select_one("a").get_text(strip=True) |
|     link = li.select_one("a")["href"] |
|     items.append({"title": title, "link": link}) |
| |
| print(items[:5]) |
# Project 2: E-commerce Product Scraper
| import requests |
| from bs4 import BeautifulSoup |
| |
| url = "https://example.com/products" |
| html = requests.get(url, timeout=10).text |
| soup = BeautifulSoup(html, "lxml") |
| |
| products = [] |
| for card in soup.select(".product-card"): |
|     name = card.select_one(".name").get_text(strip=True) |
|     price = card.select_one(".price").get_text(strip=True) |
|     products.append({"name": name, "price": price}) |
| |
| print(products) |
# Project 3: HTML to Markdown
| from bs4 import BeautifulSoup |
| |
| html = "<h1>标题</h1><p>段落</p>" |
| soup = BeautifulSoup(html, "lxml") |
| |
| md = [] |
| for tag in soup.find_all(["h1", "p"]): |
| if tag.name == "h1": |
| md.append("# " + tag.get_text(strip=True)) |
| elif tag.name == "p": |
| md.append(tag.get_text(strip=True)) |
| |
| print("\n\n".join(md)) |
# Best Practices and Compliance
- Respect the site's robots.txt and terms of service
- Limit concurrency and request frequency to avoid putting pressure on target sites
- Sanitize HTML from external input to prevent XSS injection (see the sketch after this list)
- Do not store sensitive information or scrape without authorization
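A minimal sanitization sketch using only the standard library: escape untrusted text before embedding it in generated HTML (combine this with the tag whitelist shown earlier when richer markup must be kept):
| import html |
| |
| user_input = '<img src=x onerror="alert(1)">' |
| safe = html.escape(user_input)          # special characters become HTML entities |
| print(f"<p>{safe}</p>") |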
# Further Reading
- BeautifulSoup documentation
- lxml documentation
- requests documentation
- Playwright documentation
- W3C HTML specification