This article has participated in the “Newcomer Creation Ceremony” activity, starting the journey of content creation on Juejin together.

1. Use the Requests library and regular expressions to grab the content of cat’s Eye movie TOP100

(1) The Maoyan (Cat’s Eye) movie TOP100 board spans 10 pages.

maoyan.com/board/4?offset=0, maoyan.com/board/4?offset=10, …, maoyan.com/board/4?offset=90

import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
Copy the code

(2) define get_one_page(URL) method to obtain the source code of the specified web page.

def get_one_page(url):
    """Send a GET request for *url* and return the page source.

    :param url: URL of the TOP100 board page to fetch.
    :return: response body as text on HTTP 200, otherwise ``None``
        (also ``None`` on any requests-level failure).
    """
    try:
        # NOTE(review): the scraped article mangled this User-Agent
        # (inserted spaces, lost the closing paren); restored a
        # well-formed value so the site does not reject the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Network errors are treated as "no page" by design.
        return None
Copy the code

(3) Define the parse_one_page(HTML) method, parse the source code, and get each piece of movie information.

def parse_one_page(html):
    """Extract movie entries from one TOP100 page using a regex.

    Yields one dict per movie with keys ``index``, ``image``, ``title``,
    ``actor``, ``time`` and ``score``.

    :param html: page source as a string (``None`` is tolerated and
        yields nothing, since ``str(None)`` matches no entries).
    """
    # One <dd>...</dd> per movie; each capture group is one field.
    # (Reconstructed: the scraped article destroyed the pattern's quoting.)
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # ranking index
        '.*?data-src="(.*?)"'           # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'      # title
        '.*?star.*?>(.*?)</p>'          # starring actors
        '.*?releasetime.*?>(.*?)</p>'   # release time
        '.*?integer.*?>(.*?)</i>'       # integer part of the score
        '.*?fraction.*?>(.*?)</i>'      # decimal part of the score
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # Raw text looks like "主演：X" — drop the 3-char prefix.
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # Raw text looks like "上映时间：X" — drop the 5-char prefix.
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            # Score arrives split into integer and fraction parts.
            'score': item[5].strip() + item[6].strip(),
        }

(4) Define the write_to_file(content) method to write movie information into a text file.

def write_to_file(content):
    """Serialize one movie dict with json.dumps and append it to result.txt.

    :param content: dict holding one movie's fields.
    """
    # Bug fix: the source had open('result.txt'.'a', ...) — a '.' where
    # the argument separator ',' belongs, which raises at runtime.
    # ensure_ascii=False keeps Chinese text human-readable in the file.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')
Copy the code

(5) Define the main(offset) method to tie all of the methods together.

def main(offset):
    """Crawl one TOP100 page selected by *offset* and persist each movie.

    (Fixed: the docstring's opening triple quote was lost in the scrape,
    which made this a syntax error.)

    :param offset: pagination offset passed to the board URL (0, 10, ... 90).
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
Copy the code

(6) Use the for loop to traverse all urls.

if __name__ == '__main__':
    # TOP100 spans 10 pages (offsets 0..90); range(9) missed the last page.
    for i in range(10):
        main(offset=i * 10)
        time.sleep(5)  # throttle requests to be polite to the site
Copy the code
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent


def get_one_page(url):
    """Request *url* and hand back the HTML on success.

    :param url: board page URL.
    :return: response text when the server answers 200; ``None`` on any
        other status code or on a requests-level exception.
    """
    try:
        # Restored the User-Agent that the scraped article garbled
        # (stray spaces, missing closing parenthesis).
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
        }
        response = requests.get(url, timeout=30, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    """Extract movie dicts from one TOP100 page via regex.

    :param html: page source string (``None`` tolerated, yields nothing).
    :return: generator of dicts with keys index, image, title, actor,
        time and score.
    """
    # Reconstructed pattern — the scraped article shredded the quoting.
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # ranking index
        '.*?data-src="(.*?)"'           # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'      # title
        '.*?star.*?>(.*?)</p>'          # starring actors
        '.*?releasetime.*?>(.*?)</p>'   # release time
        '.*?integer.*?>(.*?)</i>'       # integer part of the score
        '.*?fraction.*?>(.*?)</i>'      # decimal part of the score
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2].strip(),
            # "主演：X" — drop the 3-char prefix.
            'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
            # "上映时间：X" — drop the 5-char prefix.
            'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
            'score': item[5].strip() + item[6].strip(),
        }


def write_to_file(content):
    """Serialize one movie dict through json.dumps and append it to result.txt.

    :param content: dict of one movie's fields.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + ',\n')


def main(offset):
    """Crawl the board page at *offset*, printing and persisting each movie.

    :param offset: pagination offset value for the board URL.
    """
    url = "http://maoyan.com/board/4?offset=" + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # NOTE(review): this combined listing used range(1) (a single-page
    # demo run); kept as written. Use range(10) for the full TOP100.
    for i in range(1):
        main(offset=i * 10)
        time.sleep(5)
import re
import time
import requests
from requests.exceptions import RequestException
import xlwings as xw
#from fake_useragent import UserAgent

def getHTML(url):
    """Fetch *url* and return its decoded HTML, or ``None`` on failure.

    :param url: page URL.
    :return: response text when HTTP status is 200; ``None`` otherwise
        or when a requests-level exception occurs.
    """
    try:
        # Restored the User-Agent (the scrape inserted spaces and a
        # spurious leading 'the ').
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/52.0.2743.116 Safari/537.36'
        }
        response = requests.get(url, timeout=30, headers=headers)
        # Re-decode using the encoding requests infers from the body,
        # not just the Content-Type header (helps with Chinese pages).
        response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def findMaoyan(html):
    """Parse one TOP100 page and append each movie as a row to global mlist.

    :param html: page source string (``None`` tolerated; no rows appended).
    :return: the shared ``mlist``, one 6-column row per movie.
    """
    global mlist
    try:
        mlist
    except NameError:
        # Robustness: allow calling before main() created the header row.
        mlist = []
    # Reconstructed regex — the scraped article destroyed the quoting.
    pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # index
        '.*?data-src="(.*?)"'           # image
        '.*?name.*?a.*?>(.*?)</a>'      # title
        '.*?star.*?>(.*?)</p>'          # starring
        '.*?releasetime.*?>(.*?)</p>'   # release time
        '.*?integer.*?>(.*?)</i>'       # score, integer part
        '.*?fraction.*?>(.*?)</i>'      # score, decimal part
        '.*?</dd>', re.S)
    items = re.findall(pattern, str(html))
    for item in items:
        mlist.append([
            item[0],                                          # index
            item[1],                                          # image
            item[2].strip(),                                  # title
            item[3].strip()[3:] if len(item[3]) > 3 else '',  # starring ("主演：" prefix stripped)
            item[4].strip()[5:] if len(item[4]) > 5 else '',  # release time ("上映时间：" prefix stripped)
            item[5].strip() + item[6].strip(),                # score
        ])
    return mlist


def main():
    """Crawl all 10 TOP100 pages and dump the rows into Excel via xlwings."""
    global mlist
    # Header row first; findMaoyan() appends one row per movie after it.
    mlist = [['index', 'image', 'title', 'star', 'Show time', 'score']]
    for i in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(i * 10)
        html = getHTML(url)
        findMaoyan(html)
        time.sleep(1)  # throttle between pages
    # Write to an Excel file.
    wb = xw.Book()
    sht = wb.sheets('Sheet1')
    sht.range('a1').value = mlist  # add the collected data to the sheet


if __name__ == '__main__':
    main()