Preface

time.sleep(5) is needed to limit the crawl speed. The complete code is given first, and a detailed explanation follows below.

import csv
import json
import time

import requests
from fake_useragent import UserAgent
from lxml import etree

# proxy server
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"

# Proxy tunnel authentication information
proxyUser = "HM89Z6WLA4F6N05D"
proxyPass = "C8CF37D06DBED9DB"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}

proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}


headers = {
    "User-Agent": UserAgent().random,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-us",
    "Connection": "keep-alive",
    "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7",
}


#-------- Get the game name and URL ------------#
directory_url = 'https://www.douyu.com/directory'

Web = requests.get(directory_url, headers=headers, proxies=proxies).text

dom = etree.HTML(Web)

Game_urls_list = []
Game_names_list = []
for i in range(3, 13):
    Game_names = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/strong/text()'.format(i))
    Game_urls = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/@href'.format(i))

# ---------- Store the game names and URLs in new lists; split the game URL (the part after "_" is used later) ----------#
    for Gn in Game_names:
        Game_names_list.append(Gn)
    for Gu in Game_urls:
        G_url = Gu.split('_')[1]
        Game_urls_list.append(G_url)

#---------- Save the name and URL into the dictionary ------------#
All_game = dict(zip(Game_names_list, Game_urls_list))


#---------- Loop over the dictionary keys in turn ----------#
for G_name in All_game.keys():
    print("=========== is creeping up ========", G_name)
    count = 1  # Because the number of pages to climb varies according to the game partition, use count to make a flexible way to climb

    for page in range(1.350) :# Observe that a game should not exceed 350 pages at most
        time.sleep(5)  # throttle the crawl speed, as noted in the preface
        base_api = 'https://m.douyu.com/api/room/list?page={}&type={}'.format(page, All_game[G_name])
        try:
            response = requests.get(base_api, headers=headers, proxies=proxies, timeout=30, verify=False).text
        except IOError:
            continue  # skip this page if the request fails, otherwise response would be undefined below

        RoomList = json.loads(response).get('data').get('list')

        if len(RoomList) > 1:
            # this page has data, so count + 1
            count += 1
            path = '/home/liuyang/Spider/Scrapy_Project/BS_Spider/Douyu/Info_Douyu2020-04-05-14:00.csv'

            for room in RoomList:
                GameName = G_name
                RoomId = room.get('rid')
                RoomName = room.get('roomName')
                BlogName = room.get('nickname')
                HotSpots = room.get('hn')

                with open(path, "a+", encoding='utf-8-sig') as f:
                    csv_write = csv.writer(f, dialect="excel")
                    csv_data = [G_name, RoomId, RoomName, BlogName, HotSpots]
                    csv_write.writerow(csv_data)
                    print(G_name, RoomId, RoomName, BlogName, HotSpots)
        else:
            # No data on this page: subtract 10 from count.
            # While pages have data, count stays ahead of page (page=1 -> count=2, page=2 -> count=3, ...),
            # so an empty page makes count fall below page and the loop breaks below.
            count -= 10
        print(count, page)

        if count < page:
            # Some games have 10+ pages and some 350+; once count falls below page,
            # stop so a short game category is not requested 350 times.
            break
		

If you need an IP proxy, Abuyun is recommended: it can be billed by the hour at about 1 yuan per hour, which is budget-friendly. For more on proxy IPs, see this article: zhuanlan.zhihu.com/p/36207770

Explanation:

We first crawl the name and URL of each game and store them in a dictionary, then take the entries out in turn and enter each game category to crawl all of its live rooms. First, we get all the games from Douyu’s directory page: www.douyu.com/directory

directory_url = 'https://www.douyu.com/directory'
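
As a minimal sketch of this first step (it restates the relevant part of the full code above; the XPath expressions and the URL split come from that code, and headers/proxies are omitted for brevity):

import requests
from lxml import etree

directory_url = 'https://www.douyu.com/directory'
dom = etree.HTML(requests.get(directory_url).text)

Game_names_list = []
Game_urls_list = []
for i in range(3, 13):
    # each div under #allCate holds one block of game categories
    names = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/strong/text()'.format(i))
    urls = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/@href'.format(i))
    Game_names_list.extend(names)
    # keep only the part of the href after "_", which is the category id used later
    Game_urls_list.extend(u.split('_')[1] for u in urls)

All_game = dict(zip(Game_names_list, Game_urls_list))
print(All_game)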

To get all the live data of a game: starting from Douyu’s directory page www.douyu.com/directory, click into a game category, for example LOL.

You do not actually have to open the page itself; the room list is served by the mobile API used as base_api in the code above:
M.douyu.com/api/room/li…
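
As a minimal sketch of a single API request (the query string follows the base_api pattern from the code above; 'LOL' is only an illustrative category id, and in practice the headers and proxies from the full code may still be needed):

import json
import requests

# base_api pattern from the full code above; page 1 of an illustrative category id
api = 'https://m.douyu.com/api/room/list?page={}&type={}'.format(1, 'LOL')
rooms = json.loads(requests.get(api, timeout=30).text).get('data').get('list')

for room in rooms:
    # the same fields the full code writes to CSV
    print(room.get('rid'), room.get('roomName'), room.get('nickname'), room.get('hn'))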


Solemn declaration: this project and all related articles are intended solely for sharing technical experience. Applying the techniques involved to improper ends is forbidden, and any risk arising from abuse of these techniques has nothing to do with the author.