This article has participated in the “Newcomer Creation Ceremony” activity, starting the journey of content creation on Juejin together.

1. Use the Requests library and regular expressions to grab the content of cat’s Eye movie TOP100

(1) The Maoyan (Cat’s Eye) movie TOP100 board spans 10 pages.

maoyan.com/board/4?offset=0, maoyan.com/board/4?offset=10, …, maoyan.com/board/4?offset=90

import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
Copy the code

(2) define get_one_page(URL) method to obtain the source code of the specified web page.

def get_one_page(url):
    """Send a GET request for *url* and return the page source.

    :param url: URL of the TOP100 board page to fetch.
    :return: response body as text on HTTP 200, otherwise ``None``
        (also ``None`` on any requests-level failure).
    """
    try:
        # NOTE(review): the scraped article mangled this User-Agent
        # (inserted spaces, lost the closing paren); restored a
        # well-formed value so the site does not reject the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network errors are treated as "no page" by design.
        return None
Copy the code

(3) Define the parse_one_page(HTML) method, parse the source code, and get each piece of movie information.

def parse_one_page(html):
    """Extract movie entries from one TOP100 page using a regex.

    Yields one dict per movie with keys ``index``, ``image``, ``title``,
    ``actor``, ``time`` and ``score``.

    :param html: page source as a string (``None`` is tolerated and
        yields nothing, since ``str(None)`` matches no entries).
    """
    # One <dd>...</dd> per movie; each capture group is one field.
    # (Reconstructed: the scraped article destroyed the pattern's quoting.)
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # ranking index
        '.*?data-src="(.*?)"'           # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'      # title
        '.*?star.*?>(.*?)</p>'          # starring actors
        '.*?releasetime.*?>(.*?)</p>'   # release time
        '.*?integer.*?>(.*?)</i>'       # integer part of the score
        '.*?fraction.*?>(.*?)</i>'      # decimal part of the score
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # Raw text looks like "主演：X" — drop the 3-char prefix.
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # Raw text looks like "上映时间：X" — drop the 5-char prefix.
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            # Score arrives split into integer and fraction parts.
            'score': item[5].strip() + item[6].strip(),
        }

(4) Define the write_to_file(content) method to write movie information into a text file.

def write_to_file(content):
    """Serialize one movie dict with json.dumps and append it to result.txt.

    :param content: dict holding one movie's fields.
    """
    # Bug fix: the source had open('result.txt'.'a', ...) — a '.' where
    # the argument separator ',' belongs, which raises at runtime.
    # ensure_ascii=False keeps Chinese text human-readable in the file.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')
Copy the code

(5) Define the main(offset) method to tie all of the methods together.

def main(offset):
    """Crawl one TOP100 page selected by *offset* and persist each movie.

    (Fixed: the docstring's opening triple quote was lost in the scrape,
    which made this a syntax error.)

    :param offset: pagination offset passed to the board URL (0, 10, ... 90).
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
Copy the code

(6) Use the for loop to traverse all urls.

if __name__ == '__main__':
    # TOP100 spans 10 pages (offsets 0..90); range(9) missed the last page.
    for i in range(10):
        main(offset=i * 10)
        time.sleep(5)  # throttle requests to be polite to the site
Copy the code
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent


def get_one_page(url):
    """Request *url* and hand back the HTML on success.

    :param url: board page URL.
    :return: response text when the server answers 200; ``None`` on any
        other status code or on a requests-level exception.
    """
    try:
        # Restored the User-Agent that the scraped article garbled
        # (stray spaces, missing closing parenthesis).
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    """Extract movie dicts from one TOP100 page via regex.

    :param html: page source string (``None`` tolerated, yields nothing).
    :return: generator of dicts with keys index, image, title, actor,
        time and score.
    """
    # Reconstructed pattern — the scraped article shredded the quoting.
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # ranking index
        '.*?data-src="(.*?)"'           # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'      # title
        '.*?star.*?>(.*?)</p>'          # starring actors
        '.*?releasetime.*?>(.*?)</p>'   # release time
        '.*?integer.*?>(.*?)</i>'       # integer part of the score
        '.*?fraction.*?>(.*?)</i>'      # decimal part of the score
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # "主演：X" — drop the 3-char prefix.
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # "上映时间：X" — drop the 5-char prefix.
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            'score': item[5].strip() + item[6].strip(),
        }


def write_to_file(content):
    """Serialize one movie dict through json.dumps and append it to result.txt.

    :param content: dict of one movie's fields.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')


def main(offset):
    """Crawl the board page at *offset*, printing and persisting each movie.

    :param offset: pagination offset value for the board URL.
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # NOTE(review): this combined listing used range(1) (a single-page
    # demo run); kept as written. Use range(10) for the full TOP100.
    for i in range(1):
        main(offset=i * 10)
        time.sleep(5)
import re
import time
import requests
from requests.exceptions import RequestException
import xlwings as xw
#from fake_useragent import UserAgent

def getHTML(url):
    """Fetch *url* and return its decoded HTML, or ``None`` on failure.

    :param url: page URL.
    :return: response text when HTTP status is 200; ``None`` otherwise
        or when a requests-level exception occurs.
    """
    try:
        # Restored the User-Agent (the scrape inserted spaces and a
        # spurious leading 'the ').
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/52.0.2743.116 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        # Re-decode using the encoding requests infers from the body,
        # not just the Content-Type header (helps with Chinese pages).
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def findMaoyan(html):
    """Parse one TOP100 page and append each movie as a row to global mlist.

    :param html: page source string (``None`` tolerated; no rows appended).
    :return: the shared ``mlist``, one 6-column row per movie.
    """
    global mlist
    try:
        mlist
    except NameError:
        # Robustness: allow calling before main() created the header row.
        mlist = []
    # Reconstructed regex — the scraped article destroyed the quoting.
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # index
        '.*?data-src="(.*?)"'           # image
        '.*?name.*?a.*?>(.*?)</a>'      # title
        '.*?star.*?>(.*?)</p>'          # starring
        '.*?releasetime.*?>(.*?)</p>'   # release time
        '.*?integer.*?>(.*?)</i>'       # score, integer part
        '.*?fraction.*?>(.*?)</i>'      # score, decimal part
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        mlist.append([
            item[0],                                          # index
            item[1],                                          # image
            item[2].strip(),                                  # title
            item[3].strip()[3:] if len(item[3]) > 3 else '',  # starring ("主演：" prefix stripped)
            item[4].strip()[5:] if len(item[4]) > 5 else '',  # release time ("上映时间：" prefix stripped)
            item[5].strip() + item[6].strip(),                # score
        ])
    return mlist


def main():
    """Crawl all 10 TOP100 pages and dump the rows into Excel via xlwings."""
    global mlist
    # Header row first; findMaoyan() appends one row per movie after it.
    mlist = [['index', 'image', 'title', 'star', 'Show time', 'score']]
    for i in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(i * 10)
        html = getHTML(url)
        findMaoyan(html)
        time.sleep(1)  # throttle between pages
    # Write to an Excel file.
    wb = xw.Book()
    sht = wb.sheets('Sheet1')
    sht.range('a1').value = mlist  # add the collected data to the sheet


if __name__ == '__main__':
    main()