Implementation idea:

Grab the URL of the request that the dynamic page actually makes → use regular expressions to extract the required content → parse the content → store the content in a database

Each step of the process, explained:

Grab the URL of the request the dynamic page actually makes:

A dynamic page loads its data through a background request (here, a JSONP call). Open the browser's developer tools, watch the network panel while the page loads, and copy the URL of the request that returns the data; that URL, not the page address, is what the crawler fetches.

Regular expressions:

For details on how the patterns below work, search for "Python regular expressions" (the built-in re module).
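As a quick illustration, here is how re.findall with the re.S flag can strip the jQuery(...) wrapper off a JSONP response; the sample string is made up for demonstration:

import re

# Hypothetical JSONP response; the real payload looks like jQuery<id>({...json...})
sample = 'jQuery18012({"data":"hello"})'
body = re.findall(r'.*?jQuery.*?\((.*)\)', sample, re.S)  # re.S lets . match newlines too
print body[0]  # {"data":"hello"}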

JSON:

For how to parse JSON in Python, search for "Python json" (the built-in json module).
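A minimal sketch of json.loads turning an extracted JSON string into Python objects (the sample data is made up):

import json

parsed = json.loads('{"data": {"list": [{"title": "hello"}]}}')
print parsed['data']['list'][0]['title']  # hello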

Store to database:

To set this up, search for: 1. mysql  2. mysql Python (the MySQLdb driver)
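A minimal sketch of the MySQLdb pattern the full program below relies on; the host, credentials, and table name here are placeholders:

import MySQLdb

# connect(host, user, password, database); charset should match the database
db = MySQLdb.connect('localhost', 'root', 'password', 'test1', charset='utf8')
cursor = db.cursor()
cursor.execute('INSERT INTO some_table(time,title) VALUES (%s,%s)',
               ('2015-09-01', 'hello'))
db.commit()
db.close()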

Source code and notes

Note: the code below is written for Python 2.7 (urllib2 and the print statement do not exist in Python 3).

#!/usr/bin/python
# -*- coding: utf-8 -*-   # specifies the source encoding

import urllib
import urllib2
import re
import MySQLdb
import json

class crawl:   # the methods below use self; the class name matches the calls further down

    def getHtml(self, url=None):
        # Send a browser-like User-Agent so the server does not reject the request
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0'
        header = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=header)
        response = urllib2.urlopen(request)
        html = response.read()
        return html

    def getContent(self, html, reg):
        # re.findall takes the pattern first, then the string (the original had them swapped)
        content = re.findall(reg, html, re.S)
        return content

    # Connect to MySQL
    def connectDB(self):
        host = '192.168.85.21'
        dbName = 'test1'
        user = 'root'
        password = '123456'
        # The charset here must be consistent with the database's character set
        db = MySQLdb.connect(host, user, password, dbName, charset='utf8')
        return db

    # Create the table with plain SQL
    def creatTable(self, createTableName):
        createTableSql = 'CREATE TABLE IF NOT EXISTS ' + createTableName + \
                         '(time VARCHAR(40), title VARCHAR(100), text VARCHAR(40), clicks VARCHAR(10))'
        DB_create = self.connectDB()
        cursor_create = DB_create.cursor()
        cursor_create.execute(createTableSql)
        DB_create.close()
        print 'create table ' + createTableName + ' successfully'
        return createTableName

    # Insert one row of data into the table
    def inserttable(self, insertTable, insertTime, insertTitle, insertText, insertClicks):
        # Parameterized SQL: the values are passed to execute() separately, which
        # avoids the quoting problems (and SQL injection) of the string-concatenation
        # version commented out below:
        # insertContentSql = "INSERT INTO " + insertTable + "(time,title,text,clicks) VALUES (" + insertTime + "," + insertTitle + "," + insertText + "," + insertClicks + ")"
        insertContentSql = 'INSERT INTO ' + insertTable + '(time,title,text,clicks) VALUES (%s,%s,%s,%s)'
        DB_insert = self.connectDB()
        cursor_insert = DB_insert.cursor()
        cursor_insert.execute(insertContentSql, (insertTime, insertTitle, insertText, insertClicks))
        DB_insert.commit()
        DB_insert.close()
        print 'insert contents to ' + insertTable + ' successfully'

# The real request URL found via the browser's developer tools
# (truncated in the original; urllib2 needs the full URL including the http:// scheme)
url = 'baoliao.hb.qq.com/api/report/...'

# Strip the JSONP jQuery(...) wrapper first; the remaining patterns pick out
# individual fields if you parse with regex instead of json
reg_jason = r'.*?jQuery.*?\((.*)\)'
reg_time = r'.*?"create_time":"(.*?)"'
reg_title = r'.*?"title":"(.*?)".*?'
reg_text = r'.*?"content":"(.*?)".*?'
reg_clicks = r'.*?"counter_clicks":"(.*?)"'

crawler = crawl()
html = crawler.getHtml(url)
html_jason = re.findall(reg_jason, html, re.S)

html_need = json.loads(html_jason[0])
print len(html_need)
print len(html_need['data']['list'])


table = crawler.creatTable('yh1')

for i in range(len(html_need['data']['list'])):
    item = html_need['data']['list'][i]
    creatTime = item['create_time']
    title = item['title']
    content = item['content']
    clicks = item['counter_clicks']
    crawler.inserttable(table, creatTime, title, content, clicks)
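To sanity-check the parsing steps without a network connection or a database, you can run them on a made-up payload that mirrors the fields the code expects (illustrative only):

sample = 'jQuery12345({"data":{"list":[{"create_time":"2015-09-01",' \
         '"title":"t","content":"c","counter_clicks":"8"}]}})'
body = re.findall(reg_jason, sample, re.S)[0]  # strip the jQuery(...) wrapper
data = json.loads(body)
print data['data']['list'][0]['title']  # t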