Python Web Scraping Study Notes, Part 2

Contents of the saved test page baidu.html (the local file that the script further down reads and parses):

<html>
<head>
	<script>
		location.replace(location.href.replace("https://","http://"));
	</script>
</head>
<body>
	<noscript id="headsss">
		<meta name="meta" http-equiv="refresh" content="0;url=http://www.baidu.com/">
	</noscript>
    <div id="s-top-left" class="s-top-left s-isindex-wrap">
		<a href="http://news.baidu.com" target="_blank" class="mnav c-font-normal c-color-t" id="testss">新闻</a>
		<a href="https://www.hao123.com" target="_blank" class="mnav c-font-normal c-color-t">hao123</a>
		<a href="http://map.baidu.com" target="_blank" class="mnav c-font-normal c-color-t">地图</a>
		<a href="https://live.baidu.com/" target="_blank" class="mnav c-font-normal c-color-t">直播</a>
		<a href="https://haokan.baidu.com/?sfrom=baidu-top" target="_blank" class="mnav c-font-normal c-color-t">视频</a>
		<a href="http://tieba.baidu.com" target="_blank" class="mnav c-font-normal c-color-t">贴吧</a>
		<a href="http://xueshu.baidu.com" name="test" target="_blank" class="lesss">学术</a>
	</div>
</body>
</html>

The script below loads baidu.html with BeautifulSoup and walks through its traversal and search APIs:

# -*- coding: utf-8 -*-

import re
from bs4 import BeautifulSoup

# Read the saved page and parse it with the standard-library HTML parser
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")
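# "html.parser" is Python's built-in parser; if the third-party lxml package is
# installed, it can be swapped in for speed. A minimal sketch (optional, not in
# the original notes):
# bs = BeautifulSoup(html, "lxml")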

# print(bs.a)              # first <a> tag in the document
# print(bs.head)           # the <head> tag
# print(type(bs.head))     # bs4.element.Tag

# print(bs.a.string)       # the tag's text content (a NavigableString)
# print(type(bs.a.string))
# print(bs.a.attrs)        # the tag's attributes as a dict
# print(type(bs))          # the whole document is a BeautifulSoup object

# Traversing the document tree

#print(bs.head.contents[1])
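# A few more traversal attributes, shown as a minimal sketch on the same bs
# object (commented out like the example above):
# print(bs.head.contents)         # .contents: list of a tag's direct children
# for child in bs.body.children:  # .children: generator over direct children
#     print(child)
# print(bs.a.parent.name)         # .parent: climb back up to the enclosing tag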


# Searching the document tree

#1.1 find_all()
# results = bs.find_all("a")   # every <a> tag, returned as a list-like ResultSet

#1.2 Regular-expression search: a compiled pattern is matched against tag names
results = bs.find_all(re.compile("a"))
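# Note: the pattern is matched against tag *names* with re.search(), so "a"
# also hits <head> and <meta>. A stricter sketch that keeps only <a> tags
# (not in the original notes):
# results = bs.find_all(re.compile("^a$"))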

#1.3 Searching with a filter function
# def name_is_exists(tag):
#     return tag.has_attr("name")
#
# results = bs.find_all(name_is_exists)
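# The same filter can be written inline with a lambda; a sketch equivalent to
# name_is_exists above (not in the original notes):
# results = bs.find_all(lambda tag: tag.has_attr("name"))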

#1.4 Keyword (kwargs) arguments
# results = bs.find_all(id="headsss")
# results = bs.find_all(class_=True)
# results = bs.find_all(href="http://news.baidu.com")
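# Keyword filters can also be combined with a tag name; e.g. every <a> that
# opens in a new tab (a sketch against this baidu.html, not in the original notes):
# results = bs.find_all("a", target="_blank")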

#1.5 The text argument
# results = bs.find_all(text=["新闻","地图","直播"])
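# Since BeautifulSoup 4.4 the same filter is also spelled string=, and a regex
# works too. A sketch (not in the original notes):
# results = bs.find_all(text=re.compile(r"\d"))  # strings that contain a digit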


#1.6 The limit argument
# results = bs.find_all("a", limit=3)


#2 CSS selectors
# results = bs.select("meta")              # by tag name
# results = bs.select(".mnav")             # by class
# results = bs.select("#testss")           # by id
# results = bs.select("a[class='lesss']")  # by attribute
# results = bs.select("div > a")           # by direct child of a tag
results = bs.select(".mnav ~ .lesss")      # by sibling: .lesss elements that follow a .mnav sibling


# print(results)
for item in results:
    print(item)
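
# Each match is a Tag object, so its text and attributes can be pulled out
# directly; a sketch reusing the results list above (not in the original notes):
# for item in results:
#     print(item.get_text(), item.get("href"))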