
适用场景:会员帐号下的文章、视频,老网站的数据下载
可以爬取网页的数据:文章标题,内容,联系方式;使用Selenium模拟浏览器行为:点击,提交,浏览。
目标网站: https://pic.netbian.com/4kmeinv/
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Site root used to resolve the (usually root-relative) <img src> values.
BASE_URL = "http://pic.netbian.com/"
# Listing page whose thumbnails are downloaded.
LIST_URL = "http://pic.netbian.com/4kmeinv/"
# Directory the images are written to (created on demand).
SAVE_DIR = "./piclibs"
# Seconds to wait on each HTTP request before giving up instead of hanging.
REQUEST_TIMEOUT = 10
HEADERS = {
    'user-agent': 'Mozilla/5.0(Windows NT 6.1;Win64;x64) AppleWebKit/537.36(KHTML, likeGecko) Chrome/85.0.4183.121Safari/537.36'
}

# Characters that are path separators or are rejected in file names on
# common file systems, plus ASCII control characters.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def build_image_url(src: str, base: str = BASE_URL) -> str:
    """Resolve an ``<img src>`` value against the site root.

    Root-relative ("/uploads/a.jpg"), relative ("uploads/a.jpg") and already
    absolute URLs are all handled; plain string concatenation would produce a
    doubled slash for the first form and a broken URL for the last.
    """
    return urljoin(base, src)


def safe_filename(title: str, suffix: str = ".jpg") -> str:
    """Turn an image title into a file name that is safe to create on disk.

    Unsafe characters are replaced with "_", surrounding spaces and dots are
    stripped, and an empty result falls back to "untitled".
    """
    cleaned = UNSAFE_FILENAME_CHARS.sub("_", title).strip(" .")
    return (cleaned or "untitled") + suffix


def main() -> None:
    """Download every thumbnail on the listing page into ``SAVE_DIR``.

    Raises:
        requests.RequestException: if the listing page itself cannot be
            fetched. Failures on individual images are reported and skipped.
    """
    response = requests.get(url=LIST_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # GBK is a superset of GB2312, so it decodes everything the previous
    # 'gb2312' setting did plus extended characters that would otherwise be
    # replaced in the image titles.
    response.encoding = 'gbk'
    tree = etree.HTML(response.text)
    li_list = tree.xpath("//div[@class='slist']/ul/li")

    os.makedirs(SAVE_DIR, exist_ok=True)

    for li in li_list:
        srcs = li.xpath('./a/img/@src')
        if not srcs:
            # Entry without an image (e.g. an ad slot): nothing to download.
            continue
        li_src = build_image_url(srcs[0])

        alts = li.xpath('./a/img/@alt')
        # Fall back to the file name in the URL when the image has no alt text.
        title = alts[0] if alts else os.path.splitext(os.path.basename(srcs[0]))[0]
        li_name = safe_filename(title)

        try:
            img_response = requests.get(
                url=li_src, headers=HEADERS, timeout=REQUEST_TIMEOUT
            )
            # Do not save an HTML error page under a .jpg name.
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print(li_name, '下载失败', exc)
            continue

        img_path = os.path.join(SAVE_DIR, li_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_response.content)
        print(li_name, '下载成功')


if __name__ == "__main__":
    main()

