This is the third article in the python3 primer series, and you need to know the first two to learn this one, otherwise it’s a bit of a struggle
- Python crawler 001– Web crawlers, image crawlers, article crawlers, Python crawlers crawl news sites
- Python3 Mysql database 002 uses PyCharm to quickly connect to and manipulate mysql database
Let’s start by saving the news data we crawled in the first section to mysql data
First, we need to connect to the database
Configure the database connection parameters by defining a MySQLCommand class and a connectMysql method to connect to the database
# -*- coding: utf-8 -*-
# author wechat: 2501902696
import pymysql
Class used to manipulate the database
class MySQLCommand(object):
Class initialization
def __init__(self):
self.host = 'localhost'
self.port = 3306 # port
self.user = 'root' # username
self.password = "" # your password
self.db = "home" # libraries
self.table = "home_list" # table
# link database
def connectMysql(self):
try:
self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
passwd=self.password, db=self.db, charset='utf8')
self.cursor = self.conn.cursor()
except:
print('connect mysql error.')
Copy the code
Second, after connecting to the database, we need to insert data
We have two questions before we insert the data
- 1. How to deduplicate duplicate data
- 2. Where should the primary key ID of the new data start
Check whether the data exists before inserting it
def insertData(self, my_dict):
table = "home_list" Table to operate on
SQL statement url=' %s' before and after %s
sqlExit = "SELECT url FROM home_list WHERE url = ' %s '" % (my_dict['url'])
res = self.cursor.execute(sqlExit)
if res: If the number of queried data items is greater than 0, the data already exists
print("Data already exists", res)
return 0
Insert if data does not exist
try:
cols = ', '.join(my_dict.keys())# Use, split
values = '",".join(my_dict.values())
sql = "INSERT INTO home_list (%s) VALUES (%s)" % (cols, '"' + values + '"')
SQL > alter table SQL > alter table SQL
# INSERT INTO home_list (img_path, url, id, title) VALUES ("https://img.huxiucdn.com.jpg"," https://www.huxiu.com90.html"," 12"," ")
try:
result = self.cursor.execute(sql)
insert_id = self.conn.insert_id() # id returned after successful insertion
self.conn.commit()
Check whether the command is executed successfully
if result:
print("Insert successful", insert_id)
return insert_id + 1
except pymysql.Error as e:
Rollback if an error occurs
self.conn.rollback()
The primary key is unique and cannot be inserted
if "key 'PRIMARY'" in e.args[1]:
print("Data already exists, no data inserted")
else:
print("Failed to insert data due to %d: %s" % (e.args[0], e.args[1]))
except pymysql.Error as e:
print("Database error, cause %d: %s" % (e.args[0], e.args[1]))
Copy the code
Through the above code we see how to heavy
- Before each insert, we need to check whether the data already exists. If so, we will not insert. The fields of our home_list table include ID, title, URL, and IMG_PATH. The titlehe and IMG_PATH fields may be empty by analyzing the data we captured, so here we use the URL field to retrieve the data. Read the code above once you know the de-duplication principle, you should be able to understand it easily
Third, query the id value of the last data in the database to determine the start value of our new data ID
Get the id of the last item in the home_list table using the getLastId function below
Select * from id where id = 1
def getLastId(self):
sql = "SELECT max(id) FROM " + self.table
try:
self.cursor.execute(sql)
row = self.cursor.fetchone() Get the first data from the query
if row[0]:
return row[0] Return the id of the last item of data
else:
return 0 Return 0 if the table is empty
except:
print(sql + ' execute failed.')
Copy the code
The complete code for the MySQLCommand database action class is posted below
# -*- coding: utf-8 -*-
# author wechat: 2501902696
import pymysql
Class used to manipulate the database
class MySQLCommand(object):
Class initialization
def __init__(self):
self.host = 'localhost'
self.port = 3306 # port
self.user = 'root' # username
self.password = "" # your password
self.db = "home" # libraries
self.table = "home_list" # table
# link database
def connectMysql(self):
try:
self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
passwd=self.password, db=self.db, charset='utf8')
self.cursor = self.conn.cursor()
except:
print('connect mysql error.')
Check whether the data exists before inserting it
def insertData(self, my_dict):
table = "home_list" Table to operate on
SQL statement url=' %s' before and after %s
sqlExit = "SELECT url FROM home_list WHERE url = ' %s '" % (my_dict['url'])
res = self.cursor.execute(sqlExit)
if res: If the number of queried data items is greater than 0, the data already exists
print("Data already exists", res)
return 0
Insert if data does not exist
try:
cols = ', '.join(my_dict.keys())# Use, split
values = '",".join(my_dict.values())
sql = "INSERT INTO home_list (%s) VALUES (%s)" % (cols, '"' + values + '"')
SQL > alter table SQL > alter table SQL
# INSERT INTO home_list (img_path, url, id, title) VALUES ("https://img.huxiucdn.com.jpg"," https://www.huxiu.com90.html"," 12"," ")
try:
result = self.cursor.execute(sql)
insert_id = self.conn.insert_id() # id returned after successful insertion
self.conn.commit()
Check whether the command is executed successfully
if result:
print("Insert successful", insert_id)
return insert_id + 1
except pymysql.Error as e:
Rollback if an error occurs
self.conn.rollback()
The primary key is unique and cannot be inserted
if "key 'PRIMARY'" in e.args[1]:
print("Data already exists, no data inserted")
else:
print("Failed to insert data due to %d: %s" % (e.args[0], e.args[1]))
except pymysql.Error as e:
print("Database error, cause %d: %s" % (e.args[0], e.args[1]))
Select * from id where id = 1
def getLastId(self):
sql = "SELECT max(id) FROM " + self.table
try:
self.cursor.execute(sql)
row = self.cursor.fetchone() Get the first data from the query
if row[0]:
return row[0] Return the id of the last item of data
else:
return 0 Return 0 if the table is empty
except:
print(sql + ' execute failed.')
def closeMysql(self):
self.cursor.close()
self.conn.close() Create an instance of the database action class
Copy the code
Post code to insert crawler data into the database
# -*- coding: utf-8 -*-
# author wechat: 2501902696
from bs4 import BeautifulSoup
from urllib import request
import chardet
from db.MySQLCommand import MySQLCommand
url = "https://www.huxiu.com"
response = request.urlopen(url)
html = response.read()
charset = chardet.detect(html)
html = html.decode(str(charset["encoding"])) # set the encoding of the captured HTML
# use html.parser
soup = BeautifulSoup(html, 'html.parser')
Get every a node from class=hot-article-img
allList = soup.select('.hot-article-img')
Connect to database
mysqlCommand = MySQLCommand()
mysqlCommand.connectMysql()
Select * from database where id = 1; select * from database where id = 1
dataCount = int(mysqlCommand.getLastId()) + 1
for news in allList: # Walk through the list to get valid information
aaa = news.select('a')
Select only results with length greater than 0
if len(aaa) > 0:
# post link
try: Null if an exception is thrown
href = url + aaa[0]['href']
except Exception:
href = ' '
# post image URL
try:
imgUrl = aaa[0].select('img') [0] ['src']
except Exception:
imgUrl = ""
# News headlines
try:
title = aaa[0]['title']
except Exception:
title = ""
Combine each of the retrieved data into a dictionary for database insertion
news_dict = {
"id": str(dataCount),
"title": title,
"url": href,
"img_path": imgUrl
}
try:
Insert data if it already exists
res = mysqlCommand.insertData(news_dict)
if res:
dataCount=res
except Exception as e:
print("Insert data failed", str(e))Insert failed
mysqlCommand.closeMysql() Make sure the data is turned off
dataCount=0
Copy the code
If you’re not familiar with the code above, check out my first tutorial on Python crawlers – web crawlers, image crawlers, article crawlers, and Python crawlers for news sites
Python3 crawler + PYTHon3 database section is done here, take a look at the operation effect
Written on the fourth day of Python zero Basics