1、首先要安装BeautifulSoup:
pip3 install BeautifulSoup4
2、导入BeautifulSoup并解析示例HTML:
from bs4 import BeautifulSoup
# Sample HTML used throughout this walkthrough. The <body> and <html> tags are
# deliberately left unclosed.
s = '''<html><head><title>The Dormouse's story</title></head><body><p class="title"><b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p><script>alert(123)</script>'''

# Parse the string with the standard-library based "html.parser" backend.
bs = BeautifulSoup(s, "html.parser")

# Print the parsed document; incomplete tags are completed automatically.
print(bs)

# Print the text content of all tags.
print(bs.text)

# Treat every tag as an element and list them all, from outer to inner.
print(bs.find_all())

# Find all <a> tags.
print(bs.find_all("a"))

# Find all <body> tags; although <body> is unclosed in the input, it is
# completed automatically.
print(bs.find_all("body"))

# Print the href value of every <a> tag.
for tag in bs.find_all("a"):
    print(tag.get("href"))

# Print the tag name of every tag in the document (not only <a> tags).
for tag in bs.find_all():
    print(tag.name)

# Remove every <script> and <link> tag from the tree. This is done in its own
# pass so the name-printing loop above never reads an already-removed node.
for tag in bs.find_all(["script", "link"]):
    tag.decompose()

# Print the document as a string after the tags were removed.
print(str(bs))

# Print the text content after the tags were removed.
print(bs.text)