Crawling Baidu's waterfall-flow (infinite-scroll) image results with Python
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlencode
import json
import os
name = input("Please enter the keyword of the pictures you want to crawl:")
number = int(input("Please enter the number of pictures to crawl:"))

# date holds the basic query parameters of the Baidu image request; they can be
# seen through F12: refresh the page and the parameters of the new acjson
# request appear in the network panel, ready to be copied out.
date = {"tn": "resultjson_com", "ipn": "rj", "ct": 201326592, "fp": "result",
        "queryWord": "name", "cl": 2, "lm": -1, "ie": "utf-8", "oe": "utf-8",
        "word": "name", "pn": 0, "rn": 30}

def get_url(date):
    url = "https://image.baidu.com/search/acjson?" + urlencode(date)  # urlencode turns the date dict into a URL query string
    # print(url)  # verify the generated link
    return url
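# Illustrative example only: with name == "cat" the generated link looks roughly like
# https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&...&word=cat&pn=0&rn=30
# (the "..." stands for the remaining encoded parameters of date).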
def get_html(url):
    html = requests.get(url)
    # print(html.status_code)  # check the response status code
    return html

def get_urllist(html):
    # .json() parses the JSON response into a dictionary; its "data" key holds
    # the entries that carry the Baidu image links.
    data = html.json()["data"]
    # print(data[1])
    return data
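# Illustrative sketch only: each element of data is a dict along the lines of
# {"middleURL": "...", "thumbURL": "...", "fromPageTitle": "..."} ("middleURL" is
# the key the script actually uses below; the other names are assumptions about
# what the F12 network panel shows). get_picture() keeps just the "middleURL"
# value of every entry.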
def get_picture(data):
    picture_urllist = []
    for i in range(len(data)):
        try:
            # each entry of data is itself a dictionary with many keys;
            # "middleURL" is the one holding the picture link, so extract it
            picture_urllist.append(data[i]["middleURL"])
        except:
            continue
    # print(picture_urllist)
    return picture_urllist
def picture_write(picture_urllist, n):
    for i in range(len(picture_urllist)):
        # try:
        path = "/home/jin/life/picture/" + name + "/" + name + str(n) + ".jpg"
        picture = requests.get(picture_urllist[i])
        with open(path, "wb") as file:
            file.write(picture.content)
        n += 1
        print("Successfully downloaded picture {}".format(n))
        if n >= number:
            print("Crawl finished")
            exit()
        # except:
        #     continue
    return n
def make_file(name):
    path = "/home/jin/life/picture/" + name
    os.makedirs(path)  # os.makedirs(path) creates a folder at the given path
    # print(path)  # check whether the folder was created
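# Note: os.makedirs(path) raises FileExistsError if the folder already exists.
# A tolerant variant (a sketch; exist_ok=True is a standard-library option that
# the original code does not use) would be:
#     os.makedirs(path, exist_ok=True)  # reuse the folder if it already exists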
def main():
    date["queryWord"] = name
    date["word"] = name
    n = 0
    make_file(name)
    # pn is the result offset; step by 30 because each request returns rn=30 entries
    for i in range(0, 10000, 30):
        date["pn"] = i
        url1 = get_url(date)
        html = get_html(url1)
        data = get_urllist(html)
        pictureurl = get_picture(data)
        n = picture_write(pictureurl, n)

main()
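Individual image links can time out or fail, which is why the try/except inside picture_write is left commented out above. A minimal sketch of a hardened download loop that restores that error handling (the timeout value and the requests.RequestException catch are assumptions of this sketch, not part of the original code):

def picture_write(picture_urllist, n):
    for picture_url in picture_urllist:
        try:
            path = "/home/jin/life/picture/" + name + "/" + name + str(n) + ".jpg"
            picture = requests.get(picture_url, timeout=10)  # assumed timeout, not in the original
            with open(path, "wb") as file:
                file.write(picture.content)
            n += 1
            print("Successfully downloaded picture {}".format(n))
            if n >= number:
                print("Crawl finished")
                exit()
        except requests.RequestException:
            continue  # skip links that fail to download
    return n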