'''
@File    : test.py
@Time    : 2021/03/06 00:19:09
@Author  : Zhu Zhouyue
@Version : 1.0
@Contact : zhuzhouyue2005@outlook.com
@Desc    : None
'''
import re
import time

import requests
from bs4 import BeautifulSoup

def GetHtmlText(url):
    # Fetch a URL and return the Response object, or None on failure.
    user_agent = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.90 Safari/537.36'}
    try:
        rspon = requests.get(url, headers=user_agent)
        rspon.encoding = rspon.apparent_encoding
        rspon.raise_for_status()
    except requests.RequestException as err:
        # rspon may not exist if requests.get() itself raised,
        # so report the exception rather than a status code.
        print('Failed to fetch page:', err)
        return None
    return rspon

def GetMaxPageCount():
    # Read the paginator on the front page ("1 / N") to get the total page count.
    url = 'https://bing.ioliu.cn/'
    soup = BeautifulSoup(GetHtmlText(url).text, 'html.parser')
    tag_page = soup.find('div', {'class': 'page'})
    page_txt = None
    for tag_child in tag_page.children:
        if tag_child.name == 'span':
            page_txt = tag_child.string
    match = re.search(r'(?<=1 / )\d*', page_txt)
    max_page_count = int(match.group(0))
    time.sleep(0.5)
    return max_page_count


def SavePictureInUrl(pic_url, pic_name, pic_path):
    # Download one picture and write it to <pic_path><pic_name>.jpg.
    source = GetHtmlText(pic_url)
    if source is None:
        return
    file_name = '{}.jpg'.format(pic_name)
    with open(pic_path + file_name, 'wb') as file:
        file.write(source.content)

def GetOnePageJpg(page_count, pic_path):
    # Save every wallpaper listed on one gallery page.
    url = 'https://bing.ioliu.cn/?p={}'.format(page_count)
    soup = BeautifulSoup(GetHtmlText(url).text, 'html.parser')
    tag_container = soup.find_all('div', {'class': 'container'})
    tag_item = tag_container[1]  # the second container holds the picture cards
    url_photo = 'https://bing.ioliu.cn'
    for tag_pic in tag_item.children:
        # Picture title, stripped of characters that are illegal in file names.
        tag_title = tag_pic.find('h3')
        text_title = tag_title.string
        text_title = ''.join(re.findall(r'[^\*"/:?\\|<>]', text_title, re.S))
        # Date shown on the card, used as a filename prefix.
        tag_calendar = tag_pic.find('p', {'class': 'calendar'})
        tag_em = tag_calendar.find('em')
        text_calendar = tag_em.string
        text_pic_name = text_calendar + '__' + text_title
        # The download link is relative, so prepend the site root.
        tag_download = tag_pic.find('a', {'class': 'ctrl download'})
        url_pic = url_photo + tag_download['href']
        SavePictureInUrl(url_pic, text_pic_name, pic_path)
        print('.', end='', flush=True)
    time.sleep(5)  # pause between pages to avoid hammering the site


def GetAllPageJpg(pic_path):
    # Walk every gallery page and report overall progress.
    max_page_count = GetMaxPageCount()
    for page_index in range(1, max_page_count + 1):
        GetOnePageJpg(page_index, pic_path)
        print('\r', 'Fetching, completed: {:.2f} %'.format(
            page_index / max_page_count * 100), end='', flush=True)

def main():
    pic_path = '/Volumes/ZZY/bing/'  # destination directory; must already exist
    GetAllPageJpg(pic_path)


if __name__ == '__main__':
    main()
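
# A minimal sketch for a quick test run, assuming the site layout is unchanged:
# download only the first gallery page into a local directory ('./bing_test/'
# is an assumed path, created here for illustration) instead of crawling
# everything. Replace the call to main() above with something like:
#
#   import os
#   os.makedirs('./bing_test/', exist_ok=True)
#   GetOnePageJpg(1, './bing_test/')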