Python爬虫学习小结_2
<!--
  Sample page used as parsing input for the BeautifulSoup exercises that
  accompany it (presumably saved as ./baidu.html, the path the Python
  snippet opens; TODO confirm).

  Structure:
    * head: an inline script that rewrites an "https://" URL to "http://"
      through location.replace().
    * body: a noscript block (id="headsss") holding a meta refresh to
      http://www.baidu.com/, then div#s-top-left with seven links.
      The first six links carry class "mnav" (the first also has
      id="testss"); the last one has class "lesss" and name="test".
-->
<html>
<head>
<script>
location.replace(location.href.replace("https://","http://"));
</script>
</head>
<body>
<noscript id="headsss">
<meta name="meta" http-equiv="refresh" content="0;url=http://www.baidu.com/">
</noscript>
<div id="s-top-left" class="s-top-left s-isindex-wrap">
<a href="http://news.baidu.com" target="_blank" class="mnav c-font-normal c-color-t" id="testss">新闻</a>
<a href="https://www.hao123.com" target="_blank" class="mnav c-font-normal c-color-t">hao123</a>
<a href="http://map.baidu.com" target="_blank" class="mnav c-font-normal c-color-t">地图</a>
<a href="https://live.baidu.com/" target="_blank" class="mnav c-font-normal c-color-t">直播</a>
<a href="https://haokan.baidu.com/?sfrom=baidu-top" target="_blank" class="mnav c-font-normal c-color-t">视频</a>
<a href="http://tieba.baidu.com" target="_blank" class="mnav c-font-normal c-color-t">贴吧</a>
<a href="http://xueshu.baidu.com" name="test" target="_blank" class="lesss">学术</a>
</div>
</body>
</html>
# -*- coding: utf-8 -*-
# BeautifulSoup practice script: parse ./baidu.html and try the different
# ways of locating nodes. Exactly one query is active at a time; the others
# are kept as commented-out examples. Uncomment a single `results = ...`
# line (and comment out the active one) to try another query.
import re
from bs4 import BeautifulSoup

# Read the page as raw bytes and hand them to the parser. The `with` block
# closes the file handle as soon as the content has been read.
with open("./baidu.html", "rb") as f:
    html = f.read()
bs = BeautifulSoup(html, "html.parser")

# --- Inspecting single nodes -------------------------------------------
# print(bs.a)
# print(bs.head)
# print(type(bs.head))
# print(bs.a.string)
# print(type(bs.a.string))
# print(bs.a.attrs)
# print(type(bs))

# --- Traversing the document -------------------------------------------
# print(bs.head.contents[1])

# --- Searching the document --------------------------------------------
# 1.1 find_all() with a tag name
# results = bs.find_all("a")

# 1.2 find_all() with a regular expression (applied with search())
# results = bs.find_all(re.compile("a"))

# 1.3 find_all() with a predicate function
# def name_is_exists(tag):
#     return tag.has_attr("name")
#
# results = bs.find_all(name_is_exists)

# 1.4 keyword arguments
# results = bs.find_all(id="headsss")
# results = bs.find_all(class_=True)
# results = bs.find_all(href="http://news.baidu.com")

# 1.5 text argument
# results = bs.find_all(text=["新闻","地图","直播"])

# 1.6 limit argument
# results = bs.find_all("a", limit=3)

# --- 2. CSS selectors --------------------------------------------------
# results = bs.select("meta")              # by tag name
# results = bs.select(".mnav")             # by class
# results = bs.select("#testss")           # by id
# results = bs.select("a[class='lesss']")  # by attribute
# results = bs.select("div > a")           # by direct child

# Active query: general sibling combinator (".mnav ~ .lesss").
results = bs.select(".mnav ~ .lesss")

# print(results)
for item in results:
    print(item)