Python实现Bing壁纸爬虫

第一次学习爬虫的记录

爬虫是 Python 比较常见、也比较强大的一类应用，可以从网站页面上爬取数据以及文件。

首先导入需要的库

import requests
import re
from bs4 import BeautifulSoup
import time

根据传入的url请求网站，并返回得到的数据

def GetHtmlText(url, timeout=30):
    """Request ``url`` with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The ``requests.Response`` on success, with ``encoding`` set to the
        encoding detected from the content, or ``None`` when the request
        failed or the server answered with an HTTP error status.
    """
    user_agent = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
    try:
        rspon = requests.get(url, headers=user_agent, timeout=timeout)
        rspon.raise_for_status()
    except requests.RequestException as err:
        # Connection errors and timeouts carry no response object, so fall
        # back to printing the exception instead of a status code.
        failed = getattr(err, 'response', None)
        # Compare with None explicitly: a Response with an error status is falsy.
        print('网页获取失败:', failed.status_code if failed is not None else err)
        return None
    rspon.encoding = rspon.apparent_encoding
    return rspon

获取主页信息和获取最大页数

def GetMaxPageCount():
    """Return the total number of pages announced on the site's home page.

    The function looks inside ``<div class="page">`` for a ``<span>`` whose
    text contains ``1 / N`` and returns ``N``.

    Returns:
        The page count as an ``int``, or ``0`` when the home page cannot be
        fetched or no matching pager text is found.
    """
    url = 'https://bing.ioliu.cn/'
    rspon = GetHtmlText(url)
    if rspon is None:
        return 0
    soup = BeautifulSoup(rspon.text, "html.parser")
    tag_page = soup.find('div', {'class': 'page'})
    if tag_page is None:
        return 0
    max_page_count = 0
    for tag_child in tag_page.children:
        page_txt = tag_child.string if tag_child.name == 'span' else None
        if page_txt is None:
            continue
        # \d+ (not \d*) so an empty match can never reach int().
        match = re.search(r'(?<=1 / )\d+', page_txt)
        if match:
            max_page_count = int(match.group(0))
    return max_page_count

根据传入的url链接获取图片的二进制数据，并且根据传入的路径和文件名将文件写入到对应的路径中。

def SavePictureInUrl(pic_url, pic_name, pic_path):
    """Download ``pic_url`` and write it to ``<pic_path><pic_name>.jpg``.

    Args:
        pic_url: Address of the picture to download.
        pic_name: File name without the ``.jpg`` extension.
        pic_path: Directory prefix. It is concatenated with the file name
            as is, so it has to end with a path separator.

    Returns:
        None. Nothing is written when the download fails.
    """
    source = GetHtmlText(pic_url)
    if source is None:
        return
    file_name = '{}.jpg'.format(pic_name)
    # Binary mode; the context manager closes the file even if write() raises.
    with open(pic_path + file_name, "wb") as pic_file:
        pic_file.write(source.content)

从返回的网页数据中获取每张图片的相关信息以及图片下载的url，然后调用相关函数下载图片

def GetOnePageJpg(page_count, pic_path):
    """Download every picture listed on one page of the site.

    Each picture is saved through ``SavePictureInUrl`` under the name
    ``<date>__<title>``, where the title has the characters
    ``* " / : ? \\ | < >`` removed.

    Args:
        page_count: Number of the page to crawl (sent as the ``p`` query value).
        pic_path: Directory prefix handed on to ``SavePictureInUrl``.

    Returns:
        None. The page is skipped when it cannot be fetched or does not
        contain the expected second ``container`` div; single items that lack
        a title, date or download link are skipped as well.
    """
    url = 'https://bing.ioliu.cn/?p={}'.format(page_count)
    rspon = GetHtmlText(url)
    if rspon is None:
        return
    suop = BeautifulSoup(rspon.text, 'html.parser')
    tag_container = suop.find_all('div', {'class': 'container'})
    if len(tag_container) < 2:
        return
    tag_item = tag_container[1]
    url_photo = 'https://bing.ioliu.cn'
    # Walk direct child tags only: iterating .children would also yield the
    # text nodes between items, which have no h3/p/a elements to look up.
    for tag_pic in tag_item.find_all(True, recursive=False):
        tag_title = tag_pic.find('h3')
        tag_calendar = tag_pic.find('p', {'class': 'calendar'})
        tag_em = tag_calendar.find('em') if tag_calendar is not None else None
        tag_download = tag_pic.find('a', {'class': 'ctrl download'})
        href = tag_download.get('href') if tag_download is not None else None
        if tag_title is None or tag_em is None or href is None:
            continue
        # Build the file name from date and title, dropping characters that
        # are not allowed in file names.
        text_title = re.sub(r'[\*"/:?\\|<>]', '', tag_title.get_text())
        text_pic_name = tag_em.get_text() + '__' + text_title
        SavePictureInUrl(url_photo + href, text_pic_name, pic_path)
        print('.', end='', flush=True)        # progress marker
        time.sleep(5)  # wait 5 s between downloads

爬取图片

def GetAllPageJpg(pic_path):
    """Crawl all pages of the site and save their pictures under ``pic_path``.

    Args:
        pic_path: Directory prefix handed on to ``GetOnePageJpg``.

    Returns:
        None. Progress is printed as a percentage after each page.
    """
    max_page_count = GetMaxPageCount()
    # range() excludes its upper bound, so + 1 is needed to fetch the last
    # page too (pages are assumed to be numbered 1..max_page_count, matching
    # the "1 / N" pager text).
    for page_index in range(1, max_page_count + 1):
        GetOnePageJpg(page_index, pic_path)
        print('\r', '正在获取，已完成：{:.2f} %'.format(page_index/max_page_count*100), end = '', flush=True)      # progress output

调用main函数

def main():
    """Run the crawl, saving all pictures into the fixed target directory."""
    GetAllPageJpg('/Volumes/ZZY/bing/')


main()  # start the crawl

最后附上源码

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@File    :   test.py
@Time    :   2021/03/06 00:19:09
@Author  :   Zhu Zhouyue 
@Version :   1.0
@Contact :   zhuzhouyue2005@outlook.com
@Desc    :   None
'''
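# NOTE: this script takes no measures against the site's anti-crawler protection; use with care, your IP can easily get banned.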
# here put the import lib
import requests
import re
from bs4 import BeautifulSoup
import time

def GetHtmlText(url, timeout=30):
    """Request ``url`` with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The ``requests.Response`` on success, with ``encoding`` set to the
        encoding detected from the content, or ``None`` when the request
        failed or the server answered with an HTTP error status.
    """
    user_agent = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
    try:
        rspon = requests.get(url, headers=user_agent, timeout=timeout)
        rspon.raise_for_status()
    except requests.RequestException as err:
        # Connection errors and timeouts carry no response object, so fall
        # back to printing the exception instead of a status code.
        failed = getattr(err, 'response', None)
        # Compare with None explicitly: a Response with an error status is falsy.
        print('网页获取失败:', failed.status_code if failed is not None else err)
        return None
    rspon.encoding = rspon.apparent_encoding
    return rspon

def GetMaxPageCount():
    """Return the total number of pages announced on the site's home page.

    The function looks inside ``<div class="page">`` for a ``<span>`` whose
    text contains ``1 / N`` and returns ``N``.

    Returns:
        The page count as an ``int``, or ``0`` when the home page cannot be
        fetched or no matching pager text is found.
    """
    url = 'https://bing.ioliu.cn/'
    rspon = GetHtmlText(url)
    if rspon is None:
        return 0
    soup = BeautifulSoup(rspon.text, "html.parser")
    tag_page = soup.find('div', {'class': 'page'})
    if tag_page is None:
        return 0
    max_page_count = 0
    for tag_child in tag_page.children:
        page_txt = tag_child.string if tag_child.name == 'span' else None
        if page_txt is None:
            continue
        # \d+ (not \d*) so an empty match can never reach int().
        match = re.search(r'(?<=1 / )\d+', page_txt)
        if match:
            max_page_count = int(match.group(0))
    return max_page_count
    
def SavePictureInUrl(pic_url, pic_name, pic_path):
    """Download ``pic_url`` and write it to ``<pic_path><pic_name>.jpg``.

    Args:
        pic_url: Address of the picture to download.
        pic_name: File name without the ``.jpg`` extension.
        pic_path: Directory prefix. It is concatenated with the file name
            as is, so it has to end with a path separator.

    Returns:
        None. Nothing is written when the download fails.
    """
    source = GetHtmlText(pic_url)
    if source is None:
        return
    file_name = '{}.jpg'.format(pic_name)
    # Binary mode; the context manager closes the file even if write() raises.
    with open(pic_path + file_name, "wb") as pic_file:
        pic_file.write(source.content)

def GetOnePageJpg(page_count, pic_path):
    """Download every picture listed on one page of the site.

    Each picture is saved through ``SavePictureInUrl`` under the name
    ``<date>__<title>``, where the title has the characters
    ``* " / : ? \\ | < >`` removed.

    Args:
        page_count: Number of the page to crawl (sent as the ``p`` query value).
        pic_path: Directory prefix handed on to ``SavePictureInUrl``.

    Returns:
        None. The page is skipped when it cannot be fetched or does not
        contain the expected second ``container`` div; single items that lack
        a title, date or download link are skipped as well.
    """
    url = 'https://bing.ioliu.cn/?p={}'.format(page_count)
    rspon = GetHtmlText(url)
    if rspon is None:
        return
    suop = BeautifulSoup(rspon.text, 'html.parser')
    tag_container = suop.find_all('div', {'class': 'container'})
    if len(tag_container) < 2:
        return
    tag_item = tag_container[1]
    url_photo = 'https://bing.ioliu.cn'
    # Walk direct child tags only: iterating .children would also yield the
    # text nodes between items, which have no h3/p/a elements to look up.
    for tag_pic in tag_item.find_all(True, recursive=False):
        tag_title = tag_pic.find('h3')
        tag_calendar = tag_pic.find('p', {'class': 'calendar'})
        tag_em = tag_calendar.find('em') if tag_calendar is not None else None
        tag_download = tag_pic.find('a', {'class': 'ctrl download'})
        href = tag_download.get('href') if tag_download is not None else None
        if tag_title is None or tag_em is None or href is None:
            continue
        # Build the file name from date and title, dropping characters that
        # are not allowed in file names.
        text_title = re.sub(r'[\*"/:?\\|<>]', '', tag_title.get_text())
        text_pic_name = tag_em.get_text() + '__' + text_title
        SavePictureInUrl(url_photo + href, text_pic_name, pic_path)
        print('.', end='', flush=True)        # progress marker
        time.sleep(5)  # wait 5 s between downloads
def GetAllPageJpg(pic_path):
    """Crawl all pages of the site and save their pictures under ``pic_path``.

    Args:
        pic_path: Directory prefix handed on to ``GetOnePageJpg``.

    Returns:
        None. Progress is printed as a percentage after each page.
    """
    max_page_count = GetMaxPageCount()
    # range() excludes its upper bound, so + 1 is needed to fetch the last
    # page too (pages are assumed to be numbered 1..max_page_count, matching
    # the "1 / N" pager text).
    for page_index in range(1, max_page_count + 1):
        GetOnePageJpg(page_index, pic_path)
        print('\r', '正在获取，已完成：{:.2f} %'.format(page_index/max_page_count*100), end = '', flush=True)      # progress output

def main():
    """Run the crawl, saving all pictures into the fixed target directory."""
    GetAllPageJpg('/Volumes/ZZY/bing/')


main()  # start the crawl

本程序仅用于交流学习使用，严禁用此程序实施违法行为

Coding

#Python #Spider

Python实现Bing壁纸爬虫

http://blog.fantasticjoe.com/d36a95a3.html

作者

JoeZhu

发布于

2021年3月7日

更新于

2025年3月12日

许可协议

Python构建UserAgent池上一篇