15、Python爬虫

目录[-]

2018年4月更新

https://github.com/kennethreitz/requests-html GitHub源码

在2018年4月，出现了一个关于爬虫的新技术，经过使用，发现比原来的urllib、requests、正则、bs4都要方便，补充代码

特点：

1.支持JavaScript

2.支持CSS选择器（即jQuery风格的选择器）

3.模拟用户代理，如同真实的浏览器（那么headers就不用了）

4.自动跟踪重定向

5.连接池和cookie持久性

6.支持Xpath

pip install requests-html

基本语法：

1.一切的开始，引用库，得到session

from requests_html import HTMLSession

# One session is reused for every request in the examples below.
session = HTMLSession()
# `url` is a placeholder for the address of the page you want to fetch.
r = session.get(url)

2.得到所有页面上的链接

r.html.links

3.绝对链接（自动补全了link地址）

r.html.absolute_links

4.使用CSS_selector查找一个元素

# first=True returns a single element instead of a list of matches.
about = r.html.find('#about', first=True)

5.使用Xpath查找一个元素，但是仅限于一个元素

r.html.xpath('/html/body/div[1]/a')

6.在页面上搜索文字

r.html.search('Python是一种{}语言')[0]

about.text——抓取元素的文本内容；about.attrs——抓取元素的属性；about.html——渲染元素的HTML；about.find('a')——选择元素中的元素；about.absolute_links——选择元素中的链接

CSS selector的用法：想找到h2.news_entry下面a标签（a href）的内容，就用

h2.news_entry>a

# -*- coding:utf-8 -*-
"""Print link, title and summary of each entry on the cnblogs recommended-news page."""
from requests_html import HTMLSession
import sys
import io

session = HTMLSession()
r = session.get('https://news.cnblogs.com/n/recommend')

# Re-wrap stdout so that printing uses gb18030 instead of the console default.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

titles = r.html.find('h2.news_entry>a')
summaries = r.html.find('div.entry_summary>a')

# Titles and summaries are paired by position on the page.
for title, summary in zip(titles, summaries):
    print(title.absolute_links)
    print(title.text)
    print(summary.text + '\n')

总结：缺点在于，没有代码补全，用起来难度一般。

# -*- coding:utf-8 -*-
"""Download the cat pictures listed on pages 1-4 of win4000.com into log_path."""
from requests_html import HTMLSession
import urllib.request
import time
import os.path

# Raw string: backslashes in a Windows path must not be read as escapes.
log_path = r'F:\AV'
# Create the target folder if it is missing; do nothing when it already exists.
os.makedirs(log_path, exist_ok=True)

session = HTMLSession()
urls = ['http://www.win4000.com/zt/mao_{}.html'.format(i) for i in range(1, 5)]

for url in urls:
    r = session.get(url)
    time.sleep(1)
    hrefs2 = r.html.find('div.tab_box>div>ul.clearfix>li>a')
    # The last five links are skipped, as in the original script.
    for href2 in hrefs2[:-5]:
        name = href2.text
        print(name)
        # Search again inside the matched <a> element for its <img> children.
        for i in href2.find('img'):
            image_url = i.attrs['data-original']  # download address of the picture (str)
            print(image_url)
            # Build the file name from log_path instead of repeating the folder literal.
            urllib.request.urlretrieve(image_url, os.path.join(log_path, '%s.jpg' % name))

print('爬虫结束，请去 %s' % (log_path) + ' 查看图片')

总结：爬取图片，这一个比上一个强多了尤其是for i in href2.find('img'):这一句，在找到元素后可以接着找该元素下的元素内容

用途：经常src图片地址是在下一层标签藏着，有了这个就可以轻松找到对应标签。

"""Download every picture of every album listed on an ivsky.com category page.

Each album gets its own folder under F:\\AV\\pachong, named after the album title.
"""
from requests_html import HTMLSession
import time
from retrying import retry
import urllib.request
import os
import os.path
import shutil

url = 'http://www.ivsky.com/tupian/zhiwuhuahui/'
session = HTMLSession()
r = session.get(url)
picture = r.html.find('ul.ali>li>div>a')

list_name = []
list_link = []
for i in picture:
    # absolute_links is a set, e.g. {'http://www.ivsky.com/tupian/xianrenzhang_v47505/'}
    url_set = i.absolute_links
    title = i.attrs['title']
    list_name.append(title)
    for url_str in url_set:
        list_link.append(url_str)
print(list_name)
print(list_link)

list_path = []
for album_name in list_name:
    path = r'F:\AV\pachong\%s' % (album_name)
    list_path.append(path)
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        # Best effort, as before, but report the failure instead of hiding it.
        print('could not create %s: %s' % (path, e))

# A running counter is appended to every file name: pictures with the same
# alt text would otherwise overwrite each other.
name_extra = 0
for link, folder in zip(list_link, list_path):
    # Reuse the one session; the original built a second session and never used it.
    r2 = session.get(link)
    picture_url = r2.html.find('ul.pli>li>div>a>img')
    for n in picture_url:
        name_extra += 1
        name_final = n.attrs['alt'] + str(name_extra)
        url_final = n.attrs['src']
        # Save straight into the album folder rather than downloading and then moving.
        img = os.path.join(folder, '%s.jpg' % name_final)
        try:
            urllib.request.urlretrieve(url_final, img)
        except (OSError, ValueError) as e:
            # OSError covers network and file errors; ValueError covers URLs
            # urllib cannot open. Skip this picture and carry on.
            print('skipped %s: %s' % (url_final, e))
            continue
        time.sleep(0.5)

print('运行结束，图片已全部下载完毕。')

爬虫retry函数，有用的，真是奇妙，还以为并没有用

pip install retrying

from retrying import retry


# Give up after 5 attempts; wait 2 seconds between attempts (wait_fixed is in milliseconds).
@retry(stop_max_attempt_number=5, wait_fixed=2000)
def do_something():
    pass  # put the operation that may fail here


do_something()

直接给函数加上装饰器即可：函数抛出异常时会自动从头重新执行（例如小说下载会重新开始）。

from retrying import retry


@retry()  # with no arguments, retrying keeps retrying until the call succeeds
def start():
    ...  # the crawler body goes here

1.爬取百度新闻的部分内容（有些新闻实在是没办法爬的到，几乎每一小段都有不同的编写方式，实在是不好爬取）

#coding=utf-8
"""Save the title and link of Baidu News headlines to a local text file."""
from bs4 import BeautifulSoup
import lxml
import re
import requests

# Fetch the page source as text.
response = requests.get('https://news.baidu.com/').text
soup = BeautifulSoup(response, 'lxml')

# Keyword filters combine: keep tags that have a `mon` attribute (any value)
# AND target="_blank".
tag = soup.find_all(mon=True, target="_blank")

with open(r'F:\untitled\.idea\text.txt', 'w', encoding='utf-8') as f:
    for m in tag:
        title = m.get_text()
        link = m.get("href")
        # One "title link" pair per line.
        f.write("{} {}\n".format(title, link))

运行结果如下：

——————————————————————————————————————

2.3D福利彩票开奖，每一期的日期、期号、第一个第二个第三个号码总共5个数字。

"""Scrape the 3D lottery result pages and append one line per table row to a text file."""
import urllib
import re
import xlwt
import xlrd
from bs4 import BeautifulSoup
import requests

# Step 1: every result page to crawl.
urls = ['http://kaijiang.zhcw.com/zhcw/html/3d/list_{}.html'.format(i) for i in range(1, 242)]

# Compiled once, outside the loop.
# NOTE(review): in the last cell the first two `(.*?)` groups have nothing to
# anchor them and always capture an empty string, while the third captures the
# whole cell. The pattern presumably wrapped each number in a tag that was lost
# when this post was published - confirm against the page HTML.
zhengze = re.compile(r'<tr>.*?<td align="center">(.*?)</td>'
                     r'.*?<td align="center">(.*?)</td>.*?'
                     r'<td align="center" style="padding-left:20px;">'
                     r'(.*?).*?(.*?).*?(.*?)</td>', re.S)

# Step 2: open the output once (append mode) and write every match.
with open(r'F:\untitled\.idea\ext222.txt', 'a') as month_file:
    for url in urls:
        response = requests.get(url).text
        for row in zhengze.findall(response):
            # Each match is a tuple of captured fields; write them space-separated.
            for field in row:
                month_file.write(str(field))
                month_file.write(' ')
            month_file.write('\n')

————————————————————————————————————————

3.爬取全部苹果emoji的图片，并把所有文件名保存到.txt文件中

#coding=utf-8
"""Download every emoji image from the Unicode emoji list as 1.jpg, 2.jpg, ..."""
import urllib.request
from bs4 import BeautifulSoup
import lxml


def get_content(url):
    """Return the raw bytes of the page at *url*."""
    with urllib.request.urlopen(url) as html:
        return html.read()


def get_images(content):
    """Save every <img class="imga"> found in *content* into the current directory.

    Files are numbered from 1 in document order.
    """
    oSoup = BeautifulSoup(content, "lxml")
    all_images = oSoup.find_all('img', class_="imga")
    for x, img in enumerate(all_images, start=1):
        print(img['src'])
        image_name = "%s.jpg" % x
        urllib.request.urlretrieve(img['src'], image_name)


url = "http://www.unicode.org/emoji/charts-5.0/emoji-list.html"
content = get_content(url)
get_images(content)

#——————————————————————上段保存的是emoji图片

#coding=utf-8
"""Write the link target of every emoji code cell to 111.txt, one per line."""
import urllib.request
from bs4 import BeautifulSoup
import sys
import os
import lxml

url = "http://www.unicode.org/emoji/charts-5.0/emoji-list.html"
with urllib.request.urlopen(url) as html:
    content = html.read()

oSoup = BeautifulSoup(content, 'lxml')
all_names = oSoup.find_all('td', class_="code")

with open("111.txt", 'w') as f:
    for name in all_names:
        # href of the first <a> inside the cell
        a = name.select('a')[0]['href']
        f.write(a)
        f.write('\n')

# This script saves the emoji titles.
# The image script and this one were written separately, but both work.

————————————————————————————————————————

4.爬取有道翻译，翻译后的值是什么（Json相关）

"""Post a word to the Youdao translation endpoint and print the translated text."""
from urllib import request
import requests
import json
from urllib import parse

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

# The original assigned the key 'from' twice, so 'AUTO' was overwritten by 'fanyi.web'.
# NOTE(review): the second key is presumably 'keyfrom', and 'vertion' presumably
# a typo for 'version' - confirm against the form data shown in the browser.
Form_Data = {
    'i': 'jack',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '1524473261709',
    'sign': '2b43308921f84f305f43e38fd1657c1b',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION',
    'typoResult': 'false',
}

# urlencode turns the dict into a standard form body; urlopen needs bytes.
data = parse.urlencode(Form_Data).encode('utf-8')

# Passing `data` makes this a POST request.
with request.urlopen(url, data) as response:
    html = response.read().decode('utf-8')

translate_results = json.loads(html)
# Pick the translated text out of the JSON reply.
translate_results = translate_results['translateResult'][0][0]['tgt']
print('翻译结果是：%s' % translate_results)

1.这里使用Form_Data数据，是xhr，XMLHttpRequest，后台与服务器交换数据，可以在不刷新网页的情况下加载更多数据。

2.多个翻译结果也可以在同一个json中调用，返回结果也是多个

5.爬取图片，图片的名字和下载地址可以一一对应，取用的是“帅啊”网的内容

#coding='utf-8'
"""Download the pictures listed on shuaia.net; each file is named after the image alt text."""
from bs4 import BeautifulSoup
import requests
import lxml
import random
import re
import time
from urllib.request import urlretrieve

head = {}
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
]
# Pick one User-Agent at random for the whole run.
head['User-Agent'] = random.choice(USER_AGENTS)

# index_2 .. index_9 plus the front page: nine pages in total.
url_total = ['http://www.shuaia.net/index_{}.html'.format(i) for i in range(2, 10)]
url_total.append('http://www.shuaia.net/index.html')

# Compiled once; captures (image URL, alt text) pairs.
zhengze = re.compile('src="(.*?)" class="attachment-weiran" alt="(.*?)">')

for url in url_total:
    response = requests.get(url, timeout=1000, headers=head)
    # Force UTF-8: the page text was garbled with the guessed encoding.
    response.encoding = 'utf-8'
    for photo_url, photo_name in zhengze.findall(response.text):
        print(photo_url, photo_name)
        # Drop the last 31 characters of the alt text, as the original did,
        # to keep only the leading name.
        photo_name = photo_name[:-31]
        urlretrieve(photo_url, 'F:/photo/%s.jpg' % photo_name)
        time.sleep(1)

print('爬取结束')

6.使用selenium，爬取百度文库的内容

"""Save the text of a Baidu Wenku document, expanding the remaining pages first when possible."""
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import lxml


def _save_text(elements, path):
    """Print the text of each element and write it to *path*, with non-breaking spaces removed."""
    with open(path, 'w', encoding='utf-8') as f:
        for element in elements:
            text = element.get_text().replace('\xa0', '')
            print(text)
            f.write(text)


options = webdriver.ChromeOptions()
# Mobile user agent: phone and PC clients are served different pages.
options.add_argument('user-agent="Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) '
                     'AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19"')
# NOTE(review): `options` is never passed to webdriver.Chrome, so the user agent
# above has no effect. Passing it would change the page served and may break the
# selectors below - decide deliberately.
driver = webdriver.Chrome('F:/chromedriver.exe')  # local chromedriver on drive F
driver.get("https://wenku.baidu.com/view/b9a940d65901020206409c5b.html?from=search")

try:
    VIP_btn = driver.find_element_by_xpath('//*[@id="html-reader-go-more"]/div[1]/p[2]/span')
    # Scroll down to the "join VIP" button.
    driver.execute_script('arguments[0].scrollIntoView();', VIP_btn)
    # The "N pages left, continue reading" control.
    page = driver.find_element_by_xpath('//*[@id="html-reader-go-more"]/div[2]/span')
    page.click()
except WebDriverException:
    # No "continue reading" button (or it could not be clicked): the page is
    # treated as already showing everything.
    expanded = False
else:
    expanded = True
    # The original called an un-imported sleep() here; the NameError was
    # swallowed by a bare except, so this path could never succeed.
    time.sleep(1)

soup = BeautifulSoup(driver.page_source, 'lxml')
if expanded:
    _save_text(soup.find_all(class_="ie-fix"), 'F:/111.txt')
    print('第一种方式，进行翻页，下载成功了！')
else:
    _save_text(soup.find_all(class_="reader-page-wrap"), 'F:/222.txt')
    print('第二种方式，不用翻页，下载成功了！')

7.爬取一个ip代理网站第一页的100个代理ip地址，不过没有判断哪些可用、哪些不可用

#coding='utf-8'
# Source page: http://www.xicidaili.com/nn/1
"""Write up to 100 "ip:port" pairs from the first page of a proxy list to F:/ip_port.txt."""
from bs4 import BeautifulSoup
import requests
import lxml
import random
import re
import time
from urllib.request import urlretrieve
from lxml import etree
from matplotlib.cbook import flatten  # used to merge the two lists below

head = {}
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
]
# Pick one User-Agent at random.
head['User-Agent'] = random.choice(USER_AGENTS)

url = 'http://www.xicidaili.com/nn/1'
response = requests.get(url, timeout=1000, headers=head)
request = BeautifulSoup(response.text, 'lxml')

# Keep only the element with id="ip_list", re-parsed from its string form.
ip_list = BeautifulSoup(str(request.find_all(id='ip_list')), 'lxml')
dom = etree.HTML(str(ip_list))
ip = dom.xpath('//td[2]/text()')    # text of every second cell
port = dom.xpath('//td[3]/text()')  # text of every third cell
# Flat [ip, port, ip, port, ...] list.
ip_port = list(flatten(zip(ip, port)))

# Open the file once. The original reopened it with 'w' on every iteration, so
# each write erased the previous one and only the last address survived.
with open('F:/ip_port.txt', 'w') as f:
    # Slicing before zip caps the output at 100 and avoids an IndexError when
    # the page lists fewer rows.
    for address, port_number in zip(ip[:100], port[:100]):
        ip_port222 = address + ':' + port_number
        print(ip_port222)
        f.write(ip_port222 + '\n')

9.豆瓣网站，抓取Top250电影的信息（片名、评分、评价人数、链接地址）

#coding='utf-8'
"""Print name, score, rating count and link of each film on the first Douban Top 250 page."""
from bs4 import BeautifulSoup
import requests
import lxml
import random
import re
import time
from urllib.request import urlretrieve

head = {}
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
]
# Pick one User-Agent at random.
head['User-Agent'] = random.choice(USER_AGENTS)

url = 'http://movie.douban.com/top250?format=text'
response = requests.get(url, timeout=1000, headers=head)
# The original set `response.coding`, which is not an attribute requests reads,
# so it had no effect; `encoding` is what controls how `.text` is decoded.
response.encoding = 'utf-8'
request = BeautifulSoup(response.text, 'lxml')

# Everything we need for one film sits inside its <div class="info">.
for tag in request.find_all('div', class_='info'):
    # find() returns the first match only: the first <span class="title">.
    name = tag.find('span', class_='title').get_text()
    score = tag.find('span', class_='rating_num').get_text()
    # The star block holds four <span>s; the rating count is the last one.
    people = tag.find('div', class_='star')
    people_span = people.find_all('span')
    peoplecount = people_span[3].get_text()
    href = tag.find('a').get('href')
    print('电影名：'+name+" "+'电影评分: '+score+'评价人数：'+peoplecount+'链接地址：'+href)

最终结果如下：

如果想要找到多个页签，有对应的规律，这里是第一页，页签是0；第二页，页签是25；第三页，页签是50以此类推，所以就按照下面的写，然后把上面的嵌套循环即可。

# One URL per page of the Top 250: the start offset grows by 25 per page
# (0, 25, ..., 225).
urls = [
    'https://movie.douban.com/top250?start=%d&filter=' % offset
    for offset in range(0, 250, 25)
]

# Biquge novel site
# -*- coding: utf-8 -*-
from requests_html import HTMLSession
import time
from retrying import retry  # retry module: re-runs the function when it raises (e.g. on a timeout)


@retry()
def xiazai():
    """Download every chapter of one Biquge novel into F:\\<book title>.txt.

    The output file is opened once in write mode, so when @retry restarts the
    function after an error the file is rebuilt from the first chapter. The
    original appended instead, which duplicated chapters already written.
    """
    session = HTMLSession()
    r = session.get('https://www.biqugex.com/book_25317/')

    # Step 1: book title.
    shuming = r.html.find('div.info>h2')[0].text

    # Step 2: chapter names and links. The first six list entries are skipped,
    # as in the original.
    list_name = []
    list_link = []
    zhangjie = r.html.find('div.listmain>dl>dd>a')
    for i in zhangjie[6:]:
        list_name.append(i.text)
        # absolute_links is a set; join it into a single string.
        list_link.append(','.join(i.absolute_links))

    # Steps 3 and 4: fetch each chapter page, clean its text and write it out.
    with open('F:\\' + shuming + '.txt', 'w', encoding='utf-8') as f:
        for chapter_name, url_zhangjie in zip(list_name, list_link):
            # One session serves every chapter; the original built a new one each time.
            r_content = session.get(url_zhangjie)
            print(chapter_name)
            for m in r_content.html.find('div.showtxt'):
                # Replace non-breaking spaces and cut the last 52 characters.
                content_neirong = m.text.replace(u'\xa0', u' ')[:-52]
                # Remove the embedded advertisement line.
                content_neirong = content_neirong.replace(r'『百度搜索↺49↰小↷说⇆网↴，更多好看小说阅读。』', '')
                f.write('{}\n'.format(content_neirong))
            time.sleep(0.5)
    print('全书下载完毕。')


if __name__ == '__main__':
    xiazai()

15、Python爬虫

我要留言 / 展开表单