Contents

Practice: Crawl images from the web to local storage

Practice: Crawl QQ numbers from the web


Practice: Crawl images from the web to local storage

Images from Yihaodian (No.1 Store)

Start by fetching the Yihaodian page source and saving it to a local HTML file:

import urllib.request
import os
import re

def imageCrawler(url, topath):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    # Save the raw page source to a local HTML file
    with open(r'a.html', 'wb') as f:
        f.write(html)


url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url, toPath)

(Screenshot: the saved a.html)

Find the src attribute of each img tag in the page source.

Then use a regular expression with a (.*?) capture group to pull out each image address, and download the images to local storage with urllib.request.urlretrieve.
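To see what the (.*?) capture does in isolation, here is a tiny self-contained check on a made-up HTML fragment (the real Yihaodian markup will differ):

import re

# A made-up fragment standing in for the real page source
sample = '<img class="lazy" src="//img.yhd.com/a.jpg"><img src="//img.yhd.com/b.jpg">'

# (.*?) is non-greedy: each match stops at the first closing quote,
# so every src is captured separately
print(re.findall(r'src="//(.*?)"', sample))
# ['img.yhd.com/a.jpg', 'img.yhd.com/b.jpg']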

Final code:

import urllib.request
import os
import re

def imageCrawler(url, topath):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    # with open(r'a.html', 'w') as f:
    #     f.write(html)

    # Capture the protocol-relative image address in each img tag;
    # the exact attribute layout depends on the page markup
    pat = r'<img[^>]+src="//(.*?)"'
    re_image = re.compile(pat, re.S)
    imagelist = re_image.findall(html)
    print(imagelist)
    print(len(imagelist))
    # print(imagelist[0])
    num = 1
    for imageurl in imagelist:
        path = os.path.join(topath, str(num) + '.jpg')
        num += 1
        # Download the image to local storage
        urllib.request.urlretrieve('http://' + imageurl, filename=path)

url = 'https://search.yhd.com/c0-0/k%25E6%2597%25B6%25E5%25B0%259A%25E8%25A3%2599%25E8%25A3%2585/'
toPath = r'C:\Users\asus\Desktop\img'
imageCrawler(url, toPath)
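One practical note: urllib.request.urlretrieve will not create missing directories, so make sure the target folder exists before running the crawler, for example:

import os

# Create the target folder if it does not exist yet
os.makedirs(r'C:\Users\asus\Desktop\img', exist_ok=True)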

Running results:


Practice: Crawl QQ numbers from the web

Extract the QQ numbers from this Douban topic page.

Code:

import urllib.request
import os
import re
import ssl
from collections import deque

def writeFile1(htmlBytes, topath):
    with open(topath, 'wb') as f:
        f.write(htmlBytes)

def writeFileStr(htmlBytes, topath):
    # Write the bytes out as a string, for eyeballing the page source
    with open(topath, 'w') as f:
        f.write(str(htmlBytes))

def gethtmlbytes(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60'
    }
    }
    req = urllib.request.Request(url, headers=headers)
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req,context=context)
    return response.read()

def qqCrawler(url, topath):
    htmlbytes = gethtmlbytes(url)
    # writeFile1(htmlbytes,r'c.html')
    # writeFileStr(htmlbytes,r'c.txt')
    htmlStr = str(htmlbytes)
    # Extract the QQ numbers: 5 to 10 digits, not starting with 0
    pat = r'[1-9]\d{4,9}'
    re_qq = re.compile(pat)
    qqlist = re_qq.findall(htmlStr)
    # Deduplicate the QQ number list
    qqlist = list(set(qqlist))
    # Write the QQ number to TXT
    f = open(topath,'a')
    for qqstr in qqlist:
        f.write(qqstr+'\n')
    f.close()
    # print(qqlist)
    # print(len(qqlist))
    # Extract the URLs contained in the page: matches http/https/ftp links;
    # findall returns a tuple per match, whose first element is the full URL
    pat = r'((https|http|ftp)://(([a-zA-Z0-9._-]+\.[a-zA-Z]{2,6})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9&%_./~-]*)?)'
    re_url = re.compile(pat)
    urllist = re_url.findall(htmlStr)
    # Deduplicate the URL list
    urllist = list(set(urllist))
    # print(urllist)
    # print(len(urllist))
    # print(urllist[10])
    return urllist

url = 'https://www.douban.com/group/topic/110094603/'
topath = r'b.txt'
# qqCrawler(url,topath)
# Set up the central controller: breadth-first over the URLs found on each page
def center(url, topath):
    queue = deque()
    queue.append(url)

    while len(queue) != 0:
        targetUrl = queue.popleft()
        urllist = qqCrawler(targetUrl,topath)
        for item in urllist:
            tempurl = item[0]
            queue.append(tempurl)

center(url,topath)
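To get a feel for what [1-9]\d{4,9} matches, here is a quick check on a made-up string; note that any run of 5 to 10 digits qualifies, so phone numbers and other IDs can slip in as false positives:

import re

# Made-up text: QQ numbers are 5-10 digits and never start with 0
text = 'Add me: 123456789, or my friend 54321. Year 2017 is too short.'

print(re.findall(r'[1-9]\d{4,9}', text))
# ['123456789', '54321'] -- '2017' has only 4 digits, so it is skipped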

Run result: it keeps discovering new pages and can crawl for a long time; just stop it manually.
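That happens because the controller never remembers which pages it has already crawled, so it can loop over the same links forever. A minimal sketch of one way to bound it, using a visited set plus a page limit (the boundedCenter name and maxPages parameter are illustrative additions, not part of the original code):

from collections import deque

def boundedCenter(url, topath, maxPages=50):
    queue = deque([url])
    visited = set()  # URLs that have already been crawled
    while queue and len(visited) < maxPages:
        targetUrl = queue.popleft()
        if targetUrl in visited:
            continue
        visited.add(targetUrl)
        for item in qqCrawler(targetUrl, topath):
            tempurl = item[0]  # the first group is the full URL
            if tempurl not in visited:
                queue.append(tempurl)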


Let's learn and improve together. If there are any mistakes, please point them out in the comments.