# -*-coding:utf-8-*-
"""Interactive scraper that saves images linked from a meizitu.com listing page."""
import os
import re
import sys
from urllib.parse import urljoin

import requests
from lxml import etree


class Meizitu(object):
    """Collect image URLs from one listing page and download them.

    The flow, driven by :meth:`run`, is: fetch a listing page, extract the
    detail-page links from it, gather every ``<img src>`` found on those
    detail pages, then write each image to the ``images`` directory.
    """

    # Listing-page URL; ``{}`` is replaced by the page number typed by the user.
    URL_TEMPLATE = 'http://www.meizitu.com/a/more_{}.html'
    # Seconds to wait for a server before giving up; without a timeout
    # ``requests.get`` may block indefinitely on a stalled connection.
    TIMEOUT = 10
    # Directory (relative to the working directory) that receives the files.
    IMAGE_DIR = 'images'

    def __init__(self):
        # Running counter used as the file name of the next saved image.
        self.i = 1
        # Listing-page URL; holds the template until run() fills in the page.
        self.url = self.URL_TEMPLATE
        # Request headers sent with every HTTP call.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
        }
        # Matches detail-page links. Raw string so ``\d`` is a regex escape
        # rather than an invalid string escape; dots are escaped so they only
        # match a literal '.'.
        self.pattern = re.compile(r'http://www\.meizitu\.com/a/\d+\.html')
        # De-duplicated image URLs collected so far, in discovery order.
        self.img_url = []

    def get_page(self, url):
        """Fetch *url* and return the raw response body as bytes.

        Raises:
            requests.RequestException: on connection errors, malformed URLs
                or when the server does not answer within ``TIMEOUT`` seconds.
        """
        response = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        return response.content

    def parse_data(self, detail_data):
        """Return the detail-page URLs found in the listing page bytes.

        The bytes are decoded as UTF-8 with undecodable sequences dropped.
        Each URL appears once, in first-seen order, so that no detail page
        is fetched more than once.
        """
        text = detail_data.decode('utf-8', 'ignore')
        seen = set()
        links = []
        for link in self.pattern.findall(text):
            if link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def pic_info(self, image_list):
        """Collect image URLs from every detail page in *image_list*.

        URLs not already known are appended to ``self.img_url``. Relative
        ``src`` values are resolved against the page they came from so they
        can be requested later. Pages that cannot be fetched are reported on
        stderr and skipped.

        Returns:
            The list of URLs newly added by this call.
        """
        added = []
        # Set mirror of self.img_url for O(1) duplicate checks.
        known = set(self.img_url)
        for page_url in image_list:
            try:
                page_bytes = self.get_page(page_url)
            except requests.RequestException as err:
                sys.stderr.write('skip page {}: {}\n'.format(page_url, err))
                continue
            html = etree.HTML(page_bytes)
            if html is None:
                continue
            for src in html.xpath('//img/@src'):
                if not src.strip():
                    # An empty src would resolve to the page itself.
                    continue
                absolute = urljoin(page_url, src)
                if absolute in known:
                    continue
                known.add(absolute)
                self.img_url.append(absolute)
                added.append(absolute)
        return added

    def download(self):
        """Save every URL in ``self.img_url`` as ``images/<n>.jpg``.

        ``<n>`` is ``self.i``, which is advanced after each file written.
        A URL that fails to download is reported on stderr and skipped so a
        single bad link does not abort the remaining downloads.
        """
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        for url in self.img_url:
            print(url)
            try:
                data = self.get_page(url)
            except requests.RequestException as err:
                sys.stderr.write('skip image {}: {}\n'.format(url, err))
                continue
            target = os.path.join(self.IMAGE_DIR, str(self.i) + '.jpg')
            with open(target, 'wb') as f:
                f.write(data)
            self.i += 1

    def run(self):
        """Prompt for a page number, then scrape and download that page."""
        page = input('输入页码').strip()
        self.url = self.URL_TEMPLATE.format(page)
        print(self.url)
        detail_data = self.get_page(self.url)
        # Extract the detail-page URLs, then the images they contain.
        image_list = self.parse_data(detail_data)
        self.pic_info(image_list)
        self.download()


if __name__ == '__main__':
    meizi = Meizitu()
    meizi.run()