Brief Introduction of speakers:

Data acquisition and web crawler technology introduction

2. Technical basis of web crawler

1) Urllib foundation

import urllib.request

# Open the URL, read the response into memory, decode it (ignoring minor
# decoding errors) and assign the result to data.
data = urllib.request.urlopen("http://www.baidu.com").read().decode("utf-8", "ignore")

# Check that page data was actually fetched by looking at its length.
print(len(data))

# Extract the page title.
# Import the regular-expression module; ".*?" is a non-greedy match.
import re

pat = "<title>(.*?)</title>"
# re.compile() compiles the regular expression; re.S lets "." match newlines
# as well, so a title that spans several lines is still found.
rst = re.compile(pat, re.S).findall(data)
print(rst)
# Same title extraction against another site. Expects `urllib.request`, `re`
# and `pat` to be in scope already (imported / defined earlier in the session).
data = urllib.request.urlopen("http://www.jd.com").read().decode("utf-8", "ignore")
rst = re.compile(pat, re.S).findall(data)
print(rst)
# Download the page straight to a local file.
# NOTE(review): the directory part of this path looks machine-translated from
# the author's original; point it at a directory that exists on your machine.
urllib.request.urlretrieve(
    "http://www.jd.com",
    filename="D:/my teaching/Python/ali cloud series broadcast live/second code/test.html",
)

2) Browser camouflage

import urllib.request

# Browser camouflage
url = "https://www.qiushibaike.com/"
# Build an opener.
opener = urllib.request.build_opener()
# Set the User-Agent header to the value a real browser sends.
UA = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
# Attach the header to the opener; without this step the opener still sends
# the default Python user agent.
opener.addheaders = [UA]
# Install the opener globally so that urllib.request.urlopen() uses it.
urllib.request.install_opener(opener)
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

3) User agent pool

import random
import urllib.request

# User agent pool
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]


def UA():
    """Install a global opener that sends a randomly chosen User-Agent.

    Picks one entry from ``uapools``, attaches it to a fresh opener as the
    ``User-Agent`` header and installs that opener, so every later
    ``urllib.request.urlopen()`` call uses it. Prints the chosen value.
    """
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
    print("thisua: " + str(thisua))
# Switch to a new random user agent before every request.
# Expects UA() and `url` to be defined already.
for i in range(0, 10):
    UA()
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
# Switch to a new random user agent only on every third request.
# Expects UA() and `url` to be defined already.
for i in range(0, 10):
    if i % 3 == 0:
        UA()
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")

4) First exercise – Qiushibaike ("Embarrassing Things Encyclopedia") crawler in practice

import urllib.request
import re
import random

# User agent pool
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]

# Each post body sits in <div class="content"> ... <span>TEXT</span> ... </div>.
pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'


def UA():
    """Install a global opener that sends a randomly chosen User-Agent."""
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)
    print("thisua: " + str(thisua))


def extract_posts(data):
    """Return the text of every post found in the page source ``data``."""
    # re.S lets "." match newlines, since the markup spans several lines.
    return re.compile(pat, re.S).findall(data)


def crawl(page_count=10):
    """Fetch ``page_count`` listing pages and print every post on them.

    NOTE(review): the loop bound of the original snippet was lost; 10 is a
    placeholder that mirrors the other loops in this article.
    """
    for i in range(0, page_count):
        # Rotate the user agent for every page.
        UA()
        # Build the URL for the current page (pages are numbered from 1).
        thisurl = "http://www.qiushibaike.com/8hr/page/" + str(i + 1) + "/"
        data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
        for post in extract_posts(data):
            print(post)
            print("-------")


if __name__ == "__main__":
    crawl()
import time  # then call time.sleep() later

3. Packet capture analysis
1) Introduction to Fiddler

2) Second exercise – Tencent Video comment crawler in practice

import json
import re
import urllib.request

# Request format:
# https://video.coral.qq.com/filmreviewr/c/upcomment/[video id]?commentid=[comment id]&reqnum=[number of comments per request]
# Video id
vid = "j6cgzhtkuonf6te"
# Comment id
cid = "6233603654052033588"
num = "20"
# Build the URL of the comment page to fetch.
url = "https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid + "?commentid=" + cid + "&reqnum=" + num
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type": "application/javascript",
}
titlepat = '"title":"(.*?)"'
commentpat = '"content":"(.*?)"'


def decode_escapes(raw):
    """Turn the body of a JSON string (e.g. ``\\u4f60``) into readable text.

    The original snippet used ``eval('u"' + raw + '"')`` here. ``raw`` comes
    from a remote server, so eval() would execute whatever the response
    contains; ``json.loads`` decodes the same escapes without running code.

    Raises:
        ValueError: if ``raw`` is not a valid JSON string body.
    """
    return json.loads('"' + raw + '"')


def extract_comments(data):
    """Return ``(title, content)`` pairs, still escaped, found in ``data``.

    Titles and contents are paired positionally; if the two lists differ in
    length the unmatched tail is dropped.
    """
    titleall = re.compile(titlepat, re.S).findall(data)
    commentall = re.compile(commentpat, re.S).findall(data)
    return list(zip(titleall, commentall))


def print_comments(pairs):
    """Print every pair; a pair that fails to decode is reported and skipped."""
    for title, comment in pairs:
        try:
            print("comment title is: " + decode_escapes(title))
            print("comment is: " + decode_escapes(comment))
            print("------")
        except ValueError as err:
            print(err)


def main():
    """Install the request headers, fetch one comment page and print it."""
    opener = urllib.request.build_opener()
    # addheaders expects a list of (name, value) tuples.
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    # Crawl the current comment page.
    data = urllib.request.urlopen(url).read().decode("utf-8")
    print_comments(extract_comments(data))


if __name__ == "__main__":
    main()
import json
import re
import urllib.request

# Request format:
# https://video.coral.qq.com/filmreviewr/c/upcomment/[video id]?commentid=[comment id]&reqnum=[number of comments per request]
vid = "j6cgzhtkuonf6te"
cid = "6233603654052033588"
num = "3"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type": "application/javascript",
}
titlepat = '"title":"(.*?)","abstract":"'
commentpat = '"content":"(.*?)"'
# The "last" field of a response is the comment id to request next.
lastpat = '"last":"(.*?)"'


def decode_escapes(raw):
    """Turn the body of a JSON string (e.g. ``\\u4f60``) into readable text.

    The original snippet used ``eval('u"' + raw + '"')`` here. ``raw`` comes
    from a remote server, so eval() would execute whatever the response
    contains; ``json.loads`` decodes the same escapes without running code.

    Raises:
        ValueError: if ``raw`` is not a valid JSON string body.
    """
    return json.loads('"' + raw + '"')


def extract_comments(data):
    """Return ``(title, content)`` pairs, still escaped, found in ``data``.

    Titles and contents are paired positionally; if the two lists differ in
    length the unmatched tail is dropped.
    """
    titleall = re.compile(titlepat, re.S).findall(data)
    commentall = re.compile(commentpat, re.S).findall(data)
    return list(zip(titleall, commentall))


def next_cursor(data):
    """Return the ``last`` comment id in ``data``, or None when absent."""
    found = re.compile(lastpat, re.S).findall(data)
    return found[0] if found else None


def print_comments(pairs):
    """Print every pair; a pair that fails to decode is reported and skipped."""
    for title, comment in pairs:
        try:
            print("comment title is: " + decode_escapes(title))
            print("comment is: " + decode_escapes(comment))
            print("------")
        except ValueError as err:
            print(err)


def main(pages=100):
    """Walk up to ``pages`` comment pages, following the ``last`` cursor."""
    opener = urllib.request.build_opener()
    # addheaders expects a list of (name, value) tuples.
    opener.addheaders = list(headers.items())
    urllib.request.install_opener(opener)
    cursor = cid
    for j in range(0, pages):
        print("page " + str(j))
        # Build the URL of the current comment page.
        thisurl = "https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid + "?commentid=" + cursor + "&reqnum=" + num
        data = urllib.request.urlopen(thisurl).read().decode("utf-8")
        print_comments(extract_comments(data))
        # Advance to the next page; stop when the response has no cursor
        # instead of failing on an empty match list.
        cursor = next_cursor(data)
        if cursor is None:
            break


if __name__ == "__main__":
    main()

4. Challenge cases
1) Third exercise – China Judgments Online (wenshu.court.gov.cn) crawler in practice

Unpacking tool
JS beautifier tool

Import urllib.request import re import http.cookiejar import execjs import uuid guID = uuID.uuid4 () print("guid:"+str(guid)) fh=open("./base64.js","r") js1=fh.read() fh.close() fh=open("./md5.js","r") js2=fh.read() Close () fh=open("./getkey.js","r") js3=fh.read() fh.close( cjar=http.cookiejar.CookieJar() opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar)) #Referer is often used for reverse crawling, Refer to source url opener.addheaders=[("Referer","http://wenshu.court.gov.cn/list/list/?sorttype=1&conditions=searchWord+1+AJLX++%E6%A1%88% E4%BB%B6%E7%B1%BB%E5%9E%8B:%E5%88%91%E4%BA%8B%E6%A1%88%E4%BB%B6&conditions=searchWord+2018+++%E8%A3%81%E5%88%A4%E5%B9%B4 %E4%BB%BD:2018&conditions=searchWord+%E4%B8%8A%E6%B5%B7%E5%B8%82+++%E6%B3%95%E9%99%A2%E5%9C%B0%E5%9F%9F:%E4%B8%8A%E6%B5% Request. Install_opener (opener) # Import random uapools=["Mozilla/5.0 (Windows NT 10.0;  Win64; X64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", "Mozilla / 5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.x MetaSr 1.0", "Mozilla/4.0 (Compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0) ", ] # # urllib.request.urlopen("http://wenshu.court.gov.cn/list/list/?sorttype=1&conditions=searchWord+1+AJLX++%E6%A1%88%E4%BB%B 6%E7%B1%BB%E5%9E%8B:%E5%88%91%E4%BA%8B%E6%A1%88%E4%BB%B6&conditions=searchWord+2018+++%E8%A3%81%E5%88%A4%E5%B9%B4%E4%BB% BD:2018&conditions=searchWord+%E4%B8%8A%E6%B5%B7%E5%B8%82+++%E6%B3%95%E9%99%A2%E5%9C%B0%E5%9F%9F:%E4%B8%8A%E6%B5%B7%E5%B 8% of 82 "). The read (). The decode (" utf-8 ", "ignore") # vjkl5 fields extracted from the Cookie pat = "vjkl5 = (. *?) \ s" vjkl5=re.compile(pat,re.S).findall(str(cjar)) if(len(vjkl5)>0): vjkl5=vjkl5[0] else: Vjkl5 =0 print("vjkl5:"+ STR (vjkl5)) # Js_all = js_all. 
Replace (" ce7c8849dffea151c0179187f85efc9751115a7b ", STR (vjkl5) # using the python implementation js code, Compile_js=execjs.compile (js_all) # fetch vl5x=compile_js.call("getKey") Print (" vl5x: "+ STR (vl5x) # url =" http://wenshu.court.gov.cn/List/ListContent "for loop, switch 1 to 10 pages for I in range (0, 10) : try: # made number field values from the GetCode codeurl = "http://wenshu.court.gov.cn/ValiCode/GetCode" # mentioned above, GetCode, as long as a guid field, Codedata =urllib.parse.urlencode({"guid":guid, }).encode('utf-8') codereq = urllib.request.Request(codeurl,codedata) codereq.add_header('User-Agent',random.choice(uapools)) Codedata =urllib.request.urlopen(codereq).read().decode(" utF-8 ","ignore") #print(codeData) # construct the request parameter postdata = urllib. Parse. Urlencode ({" Param ":" case types: criminal cases, the referee year: 2018, the court region: Shanghai ", "Index" : STR (I + 1), "Page" : "20", "Order" : "the court hierarchy," "Direction":"asc", "number":str(codedata), "guid":guid, "vl5x":vl5x, }).encode('utf-8') # req = urllib.request. Req.add_header (' user-agent ',random.choice(uapools)) # Get the value of the instrument ID in ListContent Data = urllib. Request. Urlopen (the req.) read (). The decode (" utf-8 ", "ignore") pat = 'document ID. *? ". *? "(. *?)." allid=re.compile(pat).findall(data) print(allid) except Exception as err: print(err)Copy the code

