
Use time.sleep(5) to throttle the crawl speed. The complete code is given first, followed by a detailed explanation.

import csv
from fake_useragent import UserAgent
import json

from lxml import etree

import requests

# Proxy server address.
# NOTE(review): proxyHost is empty in this file, so proxyMeta below has no
# host part; fill in the tunnel host before running.
proxyHost = ""
proxyPort = "9020"

# Proxy tunnel authentication information.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is not committed.
proxyUser = "HM89Z6WLA4F6N05D"
proxyPass = "C8CF37D06DBED9DB"

# Proxy URL of the form http://user:pass@host:port.
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

# Route both plain and TLS traffic through the same proxy tunnel.
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}

# Request headers sent with every call; the User-Agent is picked at random
# once, when this module is loaded.
headers = {
    "User-Agent": UserAgent().random,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us",
    "Connection": "keep-alive",
    "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7",
}

#-------- Get the game name and URL ------------#
# NOTE(review): directory_url is empty in this file; it must be set to the
# category/directory page before the request below can succeed.
directory_url = ''

# Download the directory page and parse it into an lxml element tree.
# A timeout keeps the script from hanging forever on a dead proxy.
Web = requests.get(directory_url, headers=headers, proxies=proxies, timeout=30).text

dom = etree.HTML(Web)

Game_urls_list = []
Game_names_list = []
# The XPath indexes div[3] .. div[12] under the "allCate" section.
for i in range(3, 13):
    Game_names = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/strong/text()'.format(i))
    Game_urls = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/@href'.format(i))

    # Collect the game names, and for each href keep only the part after the
    # first '_' (used later as the `type` query parameter).
    for Gn in Game_names:
        Game_names_list.append(Gn)
    for Gu in Game_urls:
        G_url = Gu.split('_')[1]
        Game_urls_list.append(G_url)

#---------- Save the name and URL into the dictionary ------------#
# Maps game name -> URL fragment; zip() pairs them by position.
All_game = dict(zip(Game_names_list, Game_urls_list))

#---------- Crawl every game in the dictionary in turn ----------#
# Every row from every game is appended to this single CSV file.
path = '/home/liuyang/Spider/Scrapy_Project/BS_Spider/Douyu/Info_Douyu2020-04-05-14:00.csv'

for G_name, G_type in All_game.items():
    print("=========== is creeping up ========", G_name)
    # The number of pages differs per game, so `count` tracks how many pages
    # actually had data and lets us stop early (see the check at the bottom).
    count = 1

    for page in range(1, 350):  # Observed: a game does not exceed 350 pages
        # time.sleep(5)  # enable (and `import time`) to throttle the crawl
        # NOTE(review): the URL prefix before the page number is missing from
        # this format string in the file; restore the API base URL.
        base_api = '{}&type={}'.format(page, G_type)
        try:
            # NOTE(review): verify=False disables TLS certificate checks.
            response = requests.get(base_api, headers=headers, proxies=proxies, timeout=30, verify=False).text
            # Tolerate a missing/null "data" or "list" member.
            RoomList = (json.loads(response).get('data') or {}).get('list') or []
        except (IOError, ValueError) as err:
            # Network failure or a non-JSON body: skip this page instead of
            # reusing a stale (or undefined) response.
            print("request failed, skipping page", page, err)
            continue

        if len(RoomList) > 1:
            # This page has data: count + 1.
            count += 1

            # Open the file once per page; newline='' is what the csv module
            # expects so it can control line endings itself.
            with open(path, "a", encoding='utf-8-sig', newline='') as f:
                csv_write = csv.writer(f, dialect="excel")
                for room in RoomList:
                    RoomId = room.get('rid')
                    RoomName = room.get('roomName')
                    BlogName = room.get('nickname')
                    HotSpots = room.get('hn')

                    csv_data = [G_name, RoomId, RoomName, BlogName, HotSpots]
                    csv_write.writerow(csv_data)
                    print(G_name, RoomId, RoomName, BlogName, HotSpots)
        else:
            # No data on this page: count minus 10.
            # While pages have data, page=1 gives count=2, page=2 gives
            # count=3, ... so count stays ahead of page until a page is empty.
            count -= 10
        print(count, page)

        if count < page:
            # Some games have 10+ pages and some 350+, so there is no need to
            # request the API 350 times for a game with only 10+ pages.
            break
Copy the code

If you need proxy IPs, I recommend Abuyun (ABU Cloud), which is billed by the hour at about 1 yuan per hour, a sensible choice when you are on a tight budget. For more on proxy IPs, take a look at this article.


We first crawl the name and URL of each game and store them in a dictionary, then iterate over the dictionary and enter each game's section to crawl all of its live rooms. First, we get all games from Douyu's category page:

directory_url = ''
Copy the code

Getting all live-room data for one game: starting again from Douyu's category page, click into a game, for example LOL.

Don’t have to enter the…

Solemn declaration: this project and all related articles are intended only for the exchange of technical experience. Applying these techniques for improper purposes is forbidden, and I accept no responsibility for any risk arising from their abuse.