Preface
time.sleep(5) is needed to throttle the crawl speed. The complete code is attached first, and a detailed explanation is given below.
import csv
import json
import time
import requests
from fake_useragent import UserAgent
from lxml import etree
# proxy server
proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"
# Proxy tunnel authentication information
proxyUser = "HM89Z6WLA4F6N05D"
proxyPass = "C8CF37D06DBED9DB"
proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
"host": proxyHost,
"port": proxyPort,
"user": proxyUser,
"pass": proxyPass,
}
proxies = {
"http": proxyMeta,
"https": proxyMeta,
}
headers = {"User-Agent": '{}'.format(UserAgent().random),
"Accept": "text/html,application/xhtml+xml,application/xml; Q = 0.9 * / *; Q = 0.8"."Accept-Language": "en-us"."Connection": "keep-alive"."Accept-Charset": "GB2312,utf-8; Q = 0.7 *; Q = 0.7"}
#-------- Get the game name and URL ------------#
directory_url = 'https://www.douyu.com/directory'
Web = requests.get(directory_url, headers=headers, proxies=proxies).text
dom = etree.HTML(Web)
Game_urls_list = []
Game_names_list = []
for i in range(3, 13):
    Game_names = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/strong/text()'.format(i))
    Game_urls = dom.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/@href'.format(i))
    #---------- put the game names and urls into new lists, and split the game url (the part after '_' is used later) ----------#
    for Gn in Game_names:
        Game_names_list.append(Gn)
    for Gu in Game_urls:
        G_url = Gu.split('_')[1]
        Game_urls_list.append(G_url)
#---------- Save the name and URL into the dictionary ------------#
All_game = dict(zip(Game_names_list, Game_urls_list))
#---------- loop over the dictionary keys in turn ----------#
for G_name in All_game.keys():
    print("=========== now crawling ===========", G_name)
    count = 1  # the number of pages varies by game category, so use count to stop crawling flexibly
    for page in range(1, 350):  # by observation, a single game should not exceed 350 pages
        time.sleep(5)  # throttle the crawl speed
        base_api = 'https://m.douyu.com/api/room/list?page={}&type={}'.format(page, All_game['{}'.format(G_name)])
        try:
            response = requests.get(base_api, headers=headers, proxies=proxies, timeout=30, verify=False).text
        except IOError:
            continue  # skip this page if the request fails
        RoomList = json.loads(response).get('data').get('list')
        if len(RoomList) > 1:
            # this page has data, count + 1
            count += 1
            path = '/home/liuyang/Spider/Scrapy_Project/BS_Spider/Douyu/Info_Douyu2020-04-05-14:00.csv'
            for room in RoomList:
                GameName = G_name
                RoomId = room.get('rid')
                RoomName = room.get('roomName')
                BlogName = room.get('nickname')
                HotSpots = room.get('hn')
                with open(path, "a+", encoding='utf-8-sig') as f:
                    csv_write = csv.writer(f, dialect="excel")
                    csv_data = [GameName, RoomId, RoomName, BlogName, HotSpots]
                    csv_write.writerow(csv_data)
                print(GameName, RoomId, RoomName, BlogName, HotSpots)
        else:
            # no data on this page, so subtract 10 from count;
            # while pages keep returning data, count stays ahead of page (page=1 -> count=2, page=2 -> count=3, ...),
            # so once count drops below page this category is exhausted
            count -= 10
        print(count, page)
        if count < page:
            # some games have 10+ pages and others 350+, so there is no need to request the API 350 times for a 10-page game
            break
If you need an IP proxy, I recommend Abuyun: it can be billed by the hour at 1 yuan per hour (after all, I'm on a budget). For an overview of proxy IPs you can take a look at this article: zhuanlan.zhihu.com/p/36207770
Analysis:
We first crawl the name and URL of each game and store them in a dictionary, then take the entries out in turn and enter each game category to crawl all of its live rooms. First, we get all the games from Douyu's category page: www.douyu.com/directory
directory_url = 'https://www.douyu.com/directory'
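For reference, here is a minimal self-contained sketch of this step (the XPath expressions, the range(3, 13) over category blocks and the split('_') trick are taken from the full script above; the User-Agent value and the '/g_LOL' href example are assumptions). It fetches the category page, pulls out each game's name and href, and zips them into a name-to-id dictionary.

import requests
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0"}  # assumed: any ordinary browser UA works for the category page
directory_url = 'https://www.douyu.com/directory'
html = etree.HTML(requests.get(directory_url, headers=headers, timeout=30).text)

game_names, game_ids = [], []
for i in range(3, 13):  # the category blocks sit in div[3] .. div[12], as in the full script
    game_names += html.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/strong/text()'.format(i))
    for href in html.xpath('//*[@id="allCate"]/section/div[{}]/ul/li/a/@href'.format(i)):
        game_ids.append(href.split('_')[1])  # assumed href shape like '/g_LOL' -> 'LOL'

All_game = dict(zip(game_names, game_ids))

The resulting All_game dictionary can then be iterated exactly as in the full script.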
To get all the live data for one game: start from Douyu's category page www.douyu.com/directory and click into a game, e.g. LOL. You do not even need to open the game's page itself; the room list can be requested directly from the mobile API:
m.douyu.com/api/room/li…
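As a rough illustration, one request to that mobile API could look like the sketch below (the URL pattern and the four field names rid, roomName, nickname and hn come from the full script above; the 'data' / 'list' JSON layout is assumed to match what the script parses):

import json
import requests

api = 'https://m.douyu.com/api/room/list?page=1&type=LOL'  # page 1 of the LOL category
resp = requests.get(api, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)

data = json.loads(resp.text).get('data') or {}
rooms = data.get('list') or []
for room in rooms:
    # the same four fields the full script writes to CSV
    print(room.get('rid'), room.get('roomName'), room.get('nickname'), room.get('hn'))

If the response layout ever differs, the .get() calls fall back to empty defaults instead of raising.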
Solemn declaration: this project and all related articles are intended for technical exchange only. Applying the techniques involved to improper ends is forbidden, and any risk arising from misuse of these techniques has nothing to do with the author.