Disclaimer: This article is for study and research only. Do not use it for illegal purposes; you are responsible for any consequences. If anything here infringes your rights, please let me know and I will remove it. Thank you!
@[TOC](Scrapy)
1. Project scenario
Target website: www.liepin.com/zhaopin/?ke…
2. Preparation
2.1 Create the project: scrapy startproject liepin_spider
2.2 Generate the spider: scrapy genspider liepin www.liepin.com/zhaopin/
2.3 Configuring Settings, proxy, and database connections
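The original post does not reproduce settings.py, so here is a minimal sketch of the entries the rest of the code relies on. The values are placeholders and assumptions, not the original configuration; USER_AGENT_LIST is the custom list read by the downloader middleware in section 4.5, and the commented-out PROXY_LIST is only a suggestion.

```python
# settings.py (sketch) -- placeholder values, not the original configuration
BOT_NAME = 'liepin_spider'
SPIDER_MODULES = ['liepin_spider.spiders']
NEWSPIDER_MODULE = 'liepin_spider.spiders'

ROBOTSTXT_OBEY = False      # fetch the listing pages directly

# Custom User-Agent pool read by LiepinSpiderDownloaderMiddleware (section 4.5)
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15',
]

# Optional proxy pool, see the middleware sketch in section 4.5
# PROXY_LIST = ['http://user:pass@host:port']

# The MySQL/MongoDB credentials live in sql.py and pipelines.py in this project,
# so they are not configured here.
```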
3. Page analysis
3.1 Listing-page request parameters (the analysis screenshot from the original post is not reproduced here)
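What the analysis comes down to is that the listing page is driven by a handful of GET query parameters, which start_requests in section 4.1 fills in from the database. A quick illustration of the resulting request URL (the parameter values here are made-up examples):

```python
# Build the same kind of listing URL that start_requests sends (example parameter values).
from urllib.parse import urlencode

params = {
    'dqs': '010',      # region code, comes from lp_job_areas.dqs
    'key': 'python',   # search keyword, comes from lp_job_names.zwmc
    'curPage': '0',    # page number, starting at 0
    'pubTime': '1',    # published within one day
    'jobKind': '2',    # job type
}
print('https://www.liepin.com/zhaopin/?' + urlencode(params))
# https://www.liepin.com/zhaopin/?dqs=010&key=python&curPage=0&pubTime=1&jobKind=2
```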
4. Write code
4.1 Crawler code
# -*- coding: utf-8 -*-
import scrapy
from liepin_spider.items import LiepinSpiderItem
from sql import MyMysql


class LiepinSpider(scrapy.Spider):
    name = 'liepin'
    custom_settings = {
        'ITEM_PIPELINES': {
            'liepin_spider.pipelines.LiepinSpiderPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # 'liepin_spider.middlewares.LiepinSpiderDownloaderMiddleware': 100,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # 'liepin_spider.middlewares.MyRetryMiddleware': 110,
        },
        'CONCURRENT_REQUESTS': 1,   # number of concurrent requests
        'DOWNLOAD_DELAY': 1,        # request delay
        'DOWNLOAD_TIMEOUT': 5,      # request timeout
    }

    def __init__(self, s_type=None, c_type=None, *args, **kwargs):  # the arguments choose the crawl mode
        super(LiepinSpider, self).__init__(*args, **kwargs)
        self.c_type = c_type  # crawl type, 1: daily, 2: monthly
        self.s_type = s_type  # keyword/region selection type

    def start_requests(self):
        mysql = MyMysql()
        if self.c_type == '1':
            # read only the keywords that have not been crawled yet
            keys = mysql.read_many('select zwmc from lp_job_names where type = 0')
        else:
            # read all keywords from the database
            keys = mysql.read_many('select zwmc from lp_job_names')
        if self.s_type == '0':
            # all region data
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas')
        else:
            # only the regions of the selected type
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas where type = {}'.format(self.s_type))
        for key in keys:
            for quyu in quyu_info:
                print("Search keyword: " + key[0] + ", current search area: " + quyu[1])
                params = (
                    ('dqs', quyu[0]),    # region parameter
                    ('key', key[0]),     # search keyword
                    ('curPage', '0'),    # page number
                    ('pubTime', '1'),    # published within one day
                    ('jobKind', '2'),    # job type
                )
                url = 'https://www.liepin.com/zhaopin/'
                yield scrapy.FormRequest(url=url, method='GET', formdata=params, callback=self.parse,
                                         meta={"key_info": key[0], "diqu_info": quyu[1]}, dont_filter=True)
            # mark the keyword as crawled in the database
            print("Keyword: " + key[0] + " finished")
            mysql.update("UPDATE lp_job_names set type = '1' WHERE zwmc = '%s' " % key[0])
        # reset the keyword status
        mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")

    def parse(self, response):
        urls = response.xpath('//div[@class="job-info"]/h3/a/@href').getall()  # all job detail links
        key = response.meta["key_info"]
        diqu = response.meta["diqu_info"]
        if len(urls) == 0:
            print(diqu + " has no jobs for: " + key)  # no data found, skip
        else:
            for url in urls:  # loop through the job listings
                if 'https://www.liepin.com' not in url:
                    url = 'https://www.liepin.com' + url
                yield scrapy.Request(url=url, callback=self.get_data,
                                     meta={"key_info": key, "diqu_info": diqu}, dont_filter=True)
            # next page address
            next_url = 'https://www.liepin.com' + response.xpath('//div[@class="pagerbar"]/a/@href').getall()[-2]
            if 'javascript:' in next_url:  # no next page
                pass
            else:
                print(diqu + " keyword: " + key + " next page: ", next_url)
                yield scrapy.Request(url=next_url, meta={"key_info": key, "diqu_info": diqu},
                                     callback=self.parse, dont_filter=True)

    def get_data(self, response):
        items = LiepinSpiderItem()
        items['key_word'] = response.meta["key_info"]  # search keyword
        items['diqu'] = response.meta["diqu_info"]     # search area
        items['c_type'] = self.c_type
        items['zhiwei'] = response.xpath('//h1/text()').get()  # job title
        if items['zhiwei'] is None:
            pass
        else:
            items['company'] = response.xpath('//div[@class="title-info"]/h3/a/text()').get()  # company name
            items['salary'] = ' '.join(response.xpath('//p[@class="job-item-title"]/text()').getall()).strip()  # salary
            try:
                items['fb_time'] = response.xpath('//p[@class="basic-infor"]/time/@title').get() + \
                    response.xpath('//p[@class="basic-infor"]/time/text()').get().strip()  # publish time
            except:
                items['fb_time'] = ' '
            items['requirement'] = '#'.join(response.xpath('//div[@class="job-title-left"]/div[@class="job-qualifications"]/span/text()').getall())  # requirements
            items['welfare'] = '#'.join(response.xpath('//div[@class="comp-tag-box"]/ul/li/span/text()').getall())  # welfare
            items['job_description'] = ' '.join(response.xpath('//div[@class="content content-word"]/text()').getall()).strip()  # job description
            items['log_url'] = response.xpath('//div[@class="company-logo"]/a/@href').get()  # company logo address
            items['industry'] = response.xpath('//ul[@class="new-compintro"]/li[1]/a/text()').get()  # industry
            company_info = response.xpath('//ul[@class="new-compintro"]/li//text()').getall()
            items['company_size'] = items['company_addr'] = ' '
            for num in range(3, len(company_info)):
                # note: the markers below were translated; on the live page this text is Chinese
                if 'Company Size' in company_info[num]:
                    items['company_size'] = company_info[num].replace('Company Size:', ' ')  # company size
                else:
                    items['company_addr'] = company_info[num].replace('Company Address:', ' ')  # company address
            # yield items  # uncomment to send the item to the pipeline
            print(items)
4.2 Database connection code
# -*- coding: utf-8 -*-
import pymysql


class MyMysql:
    def __init__(self):
        self.host = 'xxxxx'        # ip
        self.port = 3306           # port
        self.user = 'xxxx'         # username
        self.password = 'xxxx'     # password
        self.dbname = 'xxxx'       # database name
        self.charset = 'utf8mb4'   # character set
        # connect to the database
        self.connect()

    def connect(self):
        # connect to the database and get a cursor
        self.db = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password,
                                  db=self.dbname, charset=self.charset)
        self.cursor = self.db.cursor()

    def run(self, sql):
        ret = None
        try:
            ret = self.cursor.execute(sql)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
        # finally:
        #     self.close()
        return ret

    def rollback(self):
        self.db.rollback()
        self.close()

    def close(self):
        self.cursor.close()
        self.db.close()

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except pymysql.err.IntegrityError:
            pass

    def commit(self):
        self.db.commit()
        self.close()

    def update(self, sql):
        return self.run(sql)

    def delete(self, sql):
        return self.run(sql)

    def read_one(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch one row
            ret = self.cursor.fetchone()
        except Exception as e:
            # print('query failed')
            pass
        # finally:
        #     self.close()
        return ret

    def read_many(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch all rows
            ret = self.cursor.fetchall()
        except Warning as e:
            print('Query failed')
        finally:
            pass
        return ret
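As a quick sanity check, the helper can be exercised on its own. This sketch assumes real credentials have been filled in above and that the lp_job_names table described at the end of the article exists:

```python
# Quick standalone check of the MyMysql helper.
from sql import MyMysql

mysql = MyMysql()
rows = mysql.read_many('select zwmc from lp_job_names where type = 0')
print(rows)   # a tuple of rows, e.g. (('python',), ('java',))
mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")
mysql.close()
```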
4.3 Items code
# -*- coding: utf-8 -*-
import scrapy


class LiepinSpiderItem(scrapy.Item):
    key_word = scrapy.Field()         # search keyword
    zhiwei = scrapy.Field()           # job title
    company = scrapy.Field()          # company name
    salary = scrapy.Field()           # salary
    diqu = scrapy.Field()             # region
    fb_time = scrapy.Field()          # publish time
    requirement = scrapy.Field()      # requirements
    welfare = scrapy.Field()          # welfare
    job_description = scrapy.Field()  # job description
    log_url = scrapy.Field()          # company logo address
    industry = scrapy.Field()         # industry
    company_size = scrapy.Field()     # company size
    company_addr = scrapy.Field()     # company address
    c_type = scrapy.Field()           # crawl time type
4.4 Pipelines code
# -*- coding: utf-8 -*-
import copy

import emoji
import pymysql
from pymongo import MongoClient
from twisted.enterprise import adbapi

adbparams_info = dict(
    host='xxxx',        # ip
    db='xxxx',          # database name
    user='xxxx',        # username
    password='xxxx',    # password
    charset='utf8',
    cursorclass=pymysql.cursors.DictCursor  # cursor type
)


class LiepinSpiderPipeline(object):
    # asynchronous write to MySQL
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.conn = MongoClient('xxxx', 27017)  # mongo connection, optional

    @classmethod
    def from_settings(cls, settings):
        # build the connection pool from the database parameters above
        adbparams = adbparams_info
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return the instantiated pipeline
        return cls(dbpool)

    def process_item(self, item, spider):
        # insert the item into MySQL asynchronously with Twisted
        item1 = copy.deepcopy(item)  # deep copy so concurrent items do not get mixed up
        # specify the operation method and the data to operate on
        query = self.dbpool.runInteraction(self.do_insert, item1)
        query.addErrback(self.handle_error, item1)
        # write the job title to mongo
        self.mongo_insert(item['zhiwei'])
        return item

    def mongo_insert(self, job):
        db = self.conn.crawlab_test   # the mongo database
        my_set = db.jobs              # the target collection
        data_test1 = {
            'job': job
        }
        my_set.insert_one(data_test1)

    def handle_error(self, failure, item):
        # the asynchronous insert failed
        print(failure, "Database exception")

    def do_insert(self, cursor, item):
        insert_sql = """insert into lp_job_data(key_word,zhiwei,company,salary,diqu,fb_time,requirement,welfare,
            job_description,logo_url,industry,company_size,company_addr,type,c_type)
            VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','5','%s') """ % (
            item['key_word'], item['zhiwei'], item['company'], item['salary'], item['diqu'],
            item['fb_time'], item['requirement'], item['welfare'], item['job_description'].replace("'", '"'), item['log_url'],
            item['industry'], item['company_size'], item['company_addr'], item['c_type'])
        try:
            cursor.execute(insert_sql)
        except:
            insert_sql = emoji.demojize(insert_sql)  # replace emoji
            cursor.execute(insert_sql)

    def close_spider(self, spider):
        self.conn.close()
4.5 Middleware proxy configuration code
# -*- coding: utf-8 -*-
import json, time
import random

import requests

from liepin_spider import settings


class LiepinSpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # set a random User-Agent
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        # set the proxy here yourself if you need one

    def process_exception(self, request, exception, spider):
        # on failure, switch to a new random User-Agent and resend the request
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        return request
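The middleware leaves the proxy part as "add it yourself". One way to do it, sketched under the assumption that a PROXY_LIST of http://user:pass@host:port strings is defined in settings (that name is not part of the original project), is to set request.meta['proxy'], which Scrapy's built-in HttpProxyMiddleware picks up:

```python
# Hypothetical variant of the middleware with a proxy pool.
# PROXY_LIST is an assumed settings entry, e.g. ['http://user:pass@host:port', ...].
import random

from liepin_spider import settings


class ProxyDownloaderMiddleware(object):
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(settings.USER_AGENT_LIST)
        # Scrapy's built-in HttpProxyMiddleware reads the proxy from request.meta
        request.meta['proxy'] = random.choice(settings.PROXY_LIST)
```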
4.6 Running Code
import os


def start_all():
    # all regions, all keywords
    os.system('scrapy crawl liepin -a s_type="0" -a c_type="2"')

# def start_type1():
#     os.system('scrapy crawl liepin -a s_type="1"')
#
# def start_type2():
#     os.system('scrapy crawl liepin -a s_type="2"')
#
# def start_type3():
#     os.system('scrapy crawl liepin -a s_type="3"')

def new_type():
    # 5 cities, a subset of the jobs
    os.system('scrapy crawl liepin -a s_type="4" -a c_type="1"')


if __name__ == '__main__':
    # start_type1()
    # start_type2()
    # start_type3()
    new_type()
5. Run results
5.1 A look at the result of the final run (the screenshots from the original post are not reproduced here)
5.2 The data comes out largely as expected. Finally, the job keywords, the region data, and the structure of the result table were attached in the original post; a rough reconstruction of the tables follows.
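Since the attachment itself is not included here, the sketch below reconstructs the three tables as implied by the SQL used in the code. The column names come from the queries above; the types, lengths, and defaults are assumptions.

```python
# Sketch of the table layout implied by the SQL in the spider and pipeline.
# Column names come from the code above; types, lengths and defaults are assumptions.
import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS lp_job_names (
           zwmc VARCHAR(100) NOT NULL,       -- job keyword used as the search term
           type VARCHAR(10) DEFAULT '0'      -- 0: not crawled yet, 1: crawled (reset by the spider)
       )""",
    """CREATE TABLE IF NOT EXISTS lp_job_areas (
           dqs  VARCHAR(50) NOT NULL,        -- region code sent as the dqs parameter
           city VARCHAR(50) NOT NULL,        -- human-readable city name
           type VARCHAR(10) DEFAULT '0'      -- region group selected by s_type
       )""",
    """CREATE TABLE IF NOT EXISTS lp_job_data (
           key_word VARCHAR(100), zhiwei VARCHAR(200), company VARCHAR(200),
           salary VARCHAR(100), diqu VARCHAR(100), fb_time VARCHAR(100),
           requirement VARCHAR(500), welfare VARCHAR(500), job_description TEXT,
           logo_url VARCHAR(500), industry VARCHAR(200), company_size VARCHAR(100),
           company_addr VARCHAR(500), type VARCHAR(10), c_type VARCHAR(10)
       )""",
]


def create_tables(**conn_kwargs):
    # conn_kwargs: host, port, user, password, db -- same values as in sql.py
    db = pymysql.connect(charset='utf8mb4', **conn_kwargs)
    try:
        with db.cursor() as cursor:
            for sql in DDL:
                cursor.execute(sql)
        db.commit()
    finally:
        db.close()
```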