The script below crawls the emoticon list pages of doutula.com with requests, parses each page with BeautifulSoup (using the lxml parser), creates one folder per emoticon set under E:\img, and downloads every image in that set. It is written for Python 3.

# -*- coding:utf-8 -*-
# pip install requests        HTTP library
# pip install beautifulsoup4  HTML parsing
# pip install lxml            parser backend for BeautifulSoup
import os

import requests
from bs4 import BeautifulSoup


class doutuSpider(object):
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
    }

    def get_url(self, url):
        # Fetch one list page and collect the links to the individual emoticon sets.
        data = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(data.content, 'lxml')
        totals = soup.find_all("a", {"class": "list-group-item"})
        for one in totals:
            sub_url = one.get('href')
            # One folder per set, named after the last segment of the link (Windows path).
            self.path = 'E:\\img' + '\\' + sub_url.split('/')[-1]
            os.makedirs(self.path, exist_ok=True)
            try:
                self.get_img_url(sub_url)
            except Exception:
                pass

    def get_img_url(self, url):
        # Fetch one emoticon-set page and download every image it contains.
        data = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(data.content, 'lxml')
        totals = soup.find_all('div', {'class': 'artile_des'})
        for one in totals:
            img = one.find('img')
            if img is None:
                continue
            img_url = img.get('src')
            try:
                self.get_img(img_url)
            except Exception:
                print(img_url)

    def get_img(self, url):
        # Download a single image into the folder created in get_url.
        filename = url.split('/')[-1]
        img_path = self.path + '\\' + filename
        img = requests.get(url, headers=self.headers)
        try:
            with open(img_path, 'wb') as f:
                f.write(img.content)
        except OSError:
            pass

    def create(self):
        # Walk through list pages 1-9 and process each one.
        for count in range(1, 10):
            url = 'https://www.doutula.com/article/list/?page={}'.format(count)
            print('downloading page {}'.format(count))
            self.get_url(url)


if __name__ == '__main__':
    doutu = doutuSpider()
    doutu.create()
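If you only want to exercise the core download step from get_img on its own, a stand-alone sketch of that step is shown below. It is not part of the original script: the save_image helper, the ./img default directory, and the example URL are all placeholders of my own choosing, and pathlib is used so the path handling also works outside Windows.

# Minimal sketch of the image-download step used in doutuSpider.get_img.
# The URL in the __main__ block is a placeholder; substitute a real image address.
import requests
from pathlib import Path


def save_image(url, base_dir="img", timeout=10):
    # Derive the file name from the last path segment of the URL.
    filename = url.split('/')[-1]
    target_dir = Path(base_dir)
    target_dir.mkdir(parents=True, exist_ok=True)  # create the folder if missing
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()                        # fail loudly on HTTP errors
    target_path = target_dir / filename
    target_path.write_bytes(resp.content)          # write the raw image bytes
    return target_path


if __name__ == '__main__':
    # Placeholder URL for illustration only.
    print(save_image("https://example.com/some-image.jpg"))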
Reposted from: http://lokwo.baihongyu.com/