• Home
  • About
    • 朱智博在Github上的Blog photo

      朱智博在Github上的Blog

      朱智博,朱智博的博客,zhuio,zhuio.github.io,

    • Learn More
    • Email
    • Github
    • Weibo
  • Posts
    • All Posts
    • All Tags
  • Projects

煎蛋妹子图爬虫

21 Jan 2017

Reading time ~1 minute

用 Python 爬取煎蛋网上的妹子图

用到的库:beautifulsoup4(需要 lxml 作为解析器)、requests、time、urllib.request.urlopen,以及标准库中的 re 和 random。

import os
import random
import re
import time
import urllib.error
import urllib.parse
import urllib.request

import bs4 as bs
import requests

def parser(page):
    """Download the images embedded in jandan.net/ooxx listing pages.

    Pages ``2000`` through ``page - 1`` are fetched in order. For every
    ``<img>`` tag whose ``src`` attribute is protocol-relative (starts with
    ``//``), the image is requested over HTTPS and written into the
    ``images/`` directory, which is created on first use.

    Args:
        page: Exclusive upper bound of the page numbers to crawl. A value
            of 2000 or less results in no requests at all.

    Returns:
        None. The results are files on disk plus progress messages printed
        to stdout.

    Raises:
        urllib.error.URLError: If a listing page itself cannot be fetched.
            Failures of individual images are reported and skipped.
    """
    # The same browser-like header is sent with every image request, so it
    # is built once instead of once per image.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    out_dir = 'images'

    for page_no in range(2000, page):
        page_url = 'http://jandan.net/ooxx/page-{}'.format(page_no)
        # Close the HTTP response as soon as the document has been parsed.
        with urllib.request.urlopen(page_url) as page_response:
            soup = bs.BeautifulSoup(page_response, 'lxml')

        for img in soup.find_all('img'):
            # Read the attribute from the parsed tag rather than running a
            # regex over the serialized tag: a greedy pattern also swallows
            # any attributes that follow ``src`` and yields a broken URL.
            src = img.get('src') or ''
            if not src.startswith('//'):
                continue
            image_url = 'https:' + src

            # Name the file after the remote file name so that distinct
            # images do not overwrite each other and the real extension is
            # kept; fall back to a random name only when the URL has none.
            file_name = os.path.basename(urllib.parse.urlsplit(image_url).path)
            if not file_name:
                file_name = str(random.randint(1, 10000)) + '.jpg'

            try:
                # Short pause between downloads to avoid hammering the host.
                time.sleep(0.05)
                req = urllib.request.Request(url=image_url, headers=headers)
                with urllib.request.urlopen(req) as image_response:
                    data = image_response.read()
            except urllib.error.URLError:
                print('except:', '有张图片失效了')
            else:
                # Created lazily so that a run with nothing to download
                # leaves the working directory untouched.
                os.makedirs(out_dir, exist_ok=True)
                with open(os.path.join(out_dir, file_name), 'wb') as f:
                    f.write(data)
            finally:
                print('图片正在生成')

# Script entry point: crawl listing pages 2000 through 2322 when this file
# is executed directly (nothing happens on import).
if __name__ == "__main__":
    parser(page=2323)


Share Tweet +1