A Python crawler for downloading images from multiple pages of an image site

2016-12-20 13:14 | Posted by: zhangjf | Views: 450 | Comments: 0

Abstract: a Python (2.x) crawler built on requests and lxml that walks through several index pages of an image gallery site and saves every image it finds into a local pic folder.
# coding=utf-8
import requests
import re
import os
from lxml import etree
import time
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

# Define a spider
class spider(object):
    def __init__(self):
        print u'Start crawling...'

    # getsource fetches the HTML source of a page
    def getsource(self, url):
        html = requests.get(url)
        return html.text

    # changepage builds the links for the different index pages
    def changepage(self, url, total_page):
        now_page = int(re.search(r'index_(\d+)', url, re.S).group(1))  # adjust for the target site
        page_group = []
        for i in range(now_page, total_page + 1):
            link = re.sub(r'index_\d+', 'index_%s' % i, url)  # adjust for the target site
            page_group.append(link)
        return page_group

    # getpic extracts the image URLs from one index page
    def getpic(self, source):
        selector = etree.HTML(source)
        pic_url = selector.xpath('//ul[@class="ali"]/li/div/a/img/@src')  # adjust for the target site
        return pic_url

    # savepic saves the images into the pic folder
    def savepic(self, pic_url, link):
        if not os.path.exists('pic'):
            os.makedirs('pic')
        picname = re.findall(r'(\d+)', link, re.S)  # adjust for the target site
        picnamestr = ''.join(picname)
        i = 0
        for each in pic_url:
            print 'now downloading: ' + each
            pic = requests.get(each)
            fp = open('pic/' + picnamestr + '-' + str(i) + '.jpg', 'wb')
            fp.write(pic.content)
            fp.close()
            i += 1

    # ppic ties the methods above together for a single page
    def ppic(self, link):
        print u'Processing page: ' + link
        html = self.getsource(link)
        pic_url = self.getpic(html)
        self.savepic(pic_url, link)

if __name__ == '__main__':
    time1 = time.time()
    url = 'http://www.ivsky.com/tupian/ziranfengguang/index_1.html'  # adjust for the target site
    picspider = spider()
    all_links = picspider.changepage(url, 3)  # adjust: last index page to crawl
    for link in all_links:
        picspider.ppic(link)
    time2 = time.time()
    print u'Elapsed: ' + str(time2 - time1)
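
The listing above is Python 2 only (reload(sys)/sys.setdefaultencoding and print statements). As a rough illustration of the same flow on Python 3, a minimal sketch might look like the following; the page_links, image_urls and download helpers are hypothetical names introduced here, and it assumes the same 2016-era ivsky.com URL pattern and the same //ul[@class="ali"] XPath, which the site may no longer use.

# Minimal Python 3 sketch of the same crawl flow (assumes the 2016-era
# ivsky.com URL pattern and XPath still apply -- adjust for the target site).
import re
import time
from pathlib import Path

import requests
from lxml import etree


def page_links(first_url, last_page):
    """Build index_1.html ... index_<last_page>.html from the first URL."""
    start = int(re.search(r'index_(\d+)', first_url).group(1))
    return [re.sub(r'index_\d+', 'index_%d' % n, first_url)
            for n in range(start, last_page + 1)]


def image_urls(page_url):
    """Fetch one index page and pull the image src attributes out of it."""
    html = requests.get(page_url, timeout=10).text
    return etree.HTML(html).xpath('//ul[@class="ali"]/li/div/a/img/@src')


def download(urls, page_url, folder='pic'):
    """Save each image as <digits-from-page-url>-<n>.jpg inside `folder`."""
    Path(folder).mkdir(exist_ok=True)
    prefix = ''.join(re.findall(r'\d+', page_url))
    for n, src in enumerate(urls):
        print('now downloading:', src)
        data = requests.get(src, timeout=10).content
        Path(folder, '%s-%d.jpg' % (prefix, n)).write_bytes(data)


if __name__ == '__main__':
    start = time.time()
    first = 'http://www.ivsky.com/tupian/ziranfengguang/index_1.html'
    for link in page_links(first, 3):
        print('Processing page:', link)
        download(image_urls(link), link)
    print('Elapsed:', time.time() - start)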
