Disclaimer: This article is for study and research purposes only; illegal use is prohibited. If it infringes on anything, please let me know and it will be removed. Thank you!



@[TOC](Scrapy)


1. Project scenario


Target website: www.liepin.com/zhaopin/?ke…

2. Preparation


2.1 Create the project: scrapy startproject liepin_spider
2.2 Generate the spider: scrapy genspider liepin 'www.liepin.com/zhaopin/'
2.3 Configure settings, the proxy, and the database connection
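The original settings are not shown here, so below is a minimal sketch of what the relevant parts of settings.py could look like. USER_AGENT_LIST is the name the downloader middleware later references; PROXY_URL and the MYSQL_* names are placeholders I am assuming for illustration (the project itself hard-codes the credentials in sql.py and pipelines.py).

# settings.py (sketch): only the parts the spider and middleware rely on
BOT_NAME = 'liepin_spider'
SPIDER_MODULES = ['liepin_spider.spiders']
NEWSPIDER_MODULE = 'liepin_spider.spiders'

ROBOTSTXT_OBEY = False

# pool of User-Agent strings; the downloader middleware picks one at random
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
]

# placeholder proxy endpoint, if you enable a proxy middleware
PROXY_URL = 'http://user:password@proxy-host:port'

# placeholder MySQL connection info, mirrored in sql.MyMysql and the pipeline
MYSQL_HOST = 'xxxx'
MYSQL_PORT = 3306
MYSQL_USER = 'xxxx'
MYSQL_PASSWORD = 'xxxx'
MYSQL_DBNAME = 'xxxx'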

3. Page analysis

3.1 See the diagram below
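The screenshot is not reproduced here; the takeaway from the analysis is that the search results page is an ordinary GET request whose query string carries the region, keyword, page number, publish-time filter, and job type. A minimal sketch of what that URL looks like, using the parameter names from the spider below (the parameter values here are placeholders):

# sketch: building the list-page URL by hand to show the query string structure
from urllib.parse import urlencode

params = {
    'dqs': '020',      # region code (placeholder value)
    'key': 'python',   # search keyword (placeholder value)
    'curPage': '0',    # page number, starting from 0
    'pubTime': '1',    # published within the last day
    'jobKind': '2',    # job type
}
print('https://www.liepin.com/zhaopin/?' + urlencode(params))
# -> https://www.liepin.com/zhaopin/?dqs=020&key=python&curPage=0&pubTime=1&jobKind=2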


4. Write code


4.1 Crawler code

# -*- coding: utf-8 -*-
import scrapy
from liepin_spider.items import LiepinSpiderItem
from sql import MyMysql


class LiepinSpider(scrapy.Spider):
    name = 'liepin'
    custom_settings = {
        'ITEM_PIPELINES': {
            'liepin_spider.pipelines.LiepinSpiderPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # 'liepin_spider.middlewares.LiepinSpiderDownloaderMiddleware': 100,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # 'liepin_spider.middlewares.MyRetryMiddleware': 110,
        },
        'CONCURRENT_REQUESTS': 1,   # number of concurrent requests
        'DOWNLOAD_DELAY': 1,        # request delay
        'DOWNLOAD_TIMEOUT': 5,      # request timeout
    }

    def __init__(self, s_type=None, c_type=None, *args, **kwargs):  # arguments select the crawl mode
        super(LiepinSpider, self).__init__(*args, **kwargs)
        self.c_type = c_type  # crawl type, 1: daily, 2: monthly
        self.s_type = s_type  # search keyword type

    def start_requests(self):
        mysql = MyMysql()
        if self.c_type == '1':
            keys = mysql.read_many('select zwmc from lp_job_names where type = 0')  # read keywords from the database, crawl only part of them
        else:
            keys = mysql.read_many('select zwmc from lp_job_names ')  # read all keywords from the database

        if self.s_type == '0':
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas')  # select all region data
        else:
            quyu_info = mysql.read_many('select dqs,city from lp_job_areas where type = {}'.format(self.s_type))  # select some region data

        for key in keys:
            for quyu in quyu_info:
                print("Search keyword: " + key[0] + ", current search area: " + quyu[1])
                params = (
                    ('dqs', quyu[0]),   # region parameter
                    ('key', key[0]),    # search keyword
                    ('curPage', '0'),
                    ('pubTime', '1'),   # within one day
                    ('jobKind', '2'),   # job type
                )
                url = 'https://www.liepin.com/zhaopin/'
                yield scrapy.FormRequest(url=url, method='GET', formdata=params, callback=self.parse,
                                         meta={"key_info": key[0], "diqu_info": quyu[1]}, dont_filter=True)
            # update the keyword status in the database
            print("Position: " + key[0] + " query completed")
            mysql.update("UPDATE lp_job_names set type = '1' WHERE zwmc = '%s' " % key[0])
        # Reset job status
        mysql.update("UPDATE lp_job_names set type = '0' WHERE type = '1'")

    def parse(self, response):
        urls = response.xpath('//div[@class="job-info"]/h3/a/@href').getall()  # all job detail links
        key = response.meta["key_info"]
        diqu = response.meta["diqu_info"]
        if len(urls) == 0:
            print(diqu + " has no position: " + key)
            pass  # no data found, skip
        else:
            for url in urls:  # loop through the job listings
                if 'https://www.liepin.com' not in url:
                    url = 'https://www.liepin.com' + url
                yield scrapy.Request(url=url, callback=self.get_data,
                                     meta={"key_info": key, "diqu_info": diqu}, dont_filter=True)

            next_url = 'https://www.liepin.com' + response.xpath('//div[@class="pagerbar"]/a/@href').getall()[-2]  # next page address

            if 'javascript:' in next_url:  # determine whether there is a next page
                pass
            else:
                print(diqu + " position: " + key + ", next page address: ", next_url)
                yield scrapy.Request(url=next_url, meta={"key_info": key, "diqu_info": diqu},
                                     callback=self.parse, dont_filter=True)

    def get_data(self, response):
        items = LiepinSpiderItem()
        items['key_word'] = response.meta["key_info"]   # search keyword
        items['diqu'] = response.meta["diqu_info"]      # search area
        items['c_type'] = self.c_type
        items['zhiwei'] = response.xpath('//h1/text()').get()  # position
        if items['zhiwei'] is None:
            pass
        else:
            items['company'] = response.xpath('//div[@class="title-info"]/h3/a/text()').get()  # company name
            items['salary'] = ' '.join(response.xpath('//p[@class="job-item-title"]/text()').getall()).strip()  # salary
            try:
                items['fb_time'] = response.xpath('//p[@class="basic-infor"]/time/@title').get() + \
                    response.xpath('//p[@class="basic-infor"]/time/text()').get().strip()  # release time
            except:
                items['fb_time'] = ' '
            items['requirement'] = '#'.join(response.xpath('//div[@class="job-title-left"]/div[@class="job-qualifications"]/span/text()').getall())  # requirements
            items['welfare'] = '#'.join(response.xpath('//div[@class="comp-tag-box"]/ul/li/span/text()').getall())  # welfare
            items['job_description'] = ' '.join(response.xpath('//div[@class="content content-word"]/text()').getall()).strip()  # job description
            items['log_url'] = response.xpath('//div[@class="company-logo"]/a/@href').get()  # company logo address
            items['industry'] = response.xpath('//ul[@class="new-compintro"]/li[1]/a/text()').get()  # industry
            company_info = response.xpath('//ul[@class="new-compintro"]/li//text()').getall()
            items['company_size'] = items['company_addr'] = ' '
            for num in range(3, len(company_info)):
                if 'Company size' in company_info[num]:
                    items['company_size'] = company_info[num].replace('Company size:', ' ')  # company size
                else:
                    items['company_addr'] = company_info[num].replace('Company address:', ' ')  # company address
            # yield items
            print(items)




4.2 Database connection code
# -*- coding: utf-8 -*-
import pymysql


class MyMysql:
    def __init__(self):
        self.host = 'xxxxx'          # ip
        self.port = 3306             # port
        self.user = 'xxxx'           # username
        self.password = 'xxxx'       # password
        self.dbname = 'xxxx'         # database name
        self.charset = 'utf8mb4'     # character set

        # connect to the database
        self.connect()

    def connect(self):
        # connect to the database and get a cursor
        self.db = pymysql.connect(host=self.host, port=self.port, user=self.user, password=self.password,
                                  db=self.dbname, charset=self.charset)
        self.cursor = self.db.cursor()

    def run(self, sql):
        ret = None
        try:
            ret = self.cursor.execute(sql)
            self.db.commit()
        except Exception as e:
            self.db.rollback()
        # finally:
        #     self.close()
        return ret

    def rollback(self):
        self.db.rollback()
        self.close()

    def close(self):
        self.cursor.close()
        self.db.close()

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except pymysql.err.IntegrityError:
            pass

    def commit(self):
        self.db.commit()
        self.close()

    def update(self, sql):
        return self.run(sql)

    def delete(self, sql):
        return self.run(sql)

    def read_one(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch the data
            ret = self.cursor.fetchone()
        except Exception as e:
            # print('query failed')
            pass
        # finally:
        #     self.close()
        return ret

    def read_many(self, sql):
        ret = None
        try:
            self.cursor.execute(sql)
            # fetch the data
            ret = self.cursor.fetchall()
        except Warning as e:
            print('Query failed')
        finally:
            pass
        return ret

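A quick usage sketch of the helper class above, assuming the lp_job_names table used by the spider already exists; it only illustrates the read/update pattern the spider relies on (the keyword 'python' is a placeholder):

# usage sketch for MyMysql; assumes the lp_job_names table exists
from sql import MyMysql

mysql = MyMysql()
rows = mysql.read_many('select zwmc from lp_job_names where type = 0')
for row in rows:
    print(row[0])  # each row is a tuple; zwmc is the first column
mysql.update("UPDATE lp_job_names set type = '1' WHERE zwmc = 'python'")  # 'python' is a placeholder keyword
mysql.close()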


4.3 Items code

# -*- coding: utf-8 -*-
import scrapy


class LiepinSpiderItem(scrapy.Item):
    key_word = scrapy.Field()         # search keyword
    zhiwei = scrapy.Field()           # position
    company = scrapy.Field()          # company name
    salary = scrapy.Field()           # salary
    diqu = scrapy.Field()             # region
    fb_time = scrapy.Field()          # release time
    requirement = scrapy.Field()      # requirements
    welfare = scrapy.Field()          # welfare
    job_description = scrapy.Field()  # job description
    log_url = scrapy.Field()          # company logo address
    industry = scrapy.Field()         # industry
    company_size = scrapy.Field()     # company size
    company_addr = scrapy.Field()     # company address
    c_type = scrapy.Field()           # crawl time type


4.4 Pipelines code
# -*- coding: utf-8 -*-

import emoji
import pymysql
from pymongo import MongoClient
from twisted.enterprise import adbapi
import copy

adbparams_info = dict(
    host='xxxx',        # ip
    db='xxxx',          # database name
    user='xxxx',        # username
    password='xxxx',    # password
    charset='utf8',
    cursorclass=pymysql.cursors.DictCursor  # set the cursor type
)


class LiepinSpiderPipeline(object):
    # asynchronous write to MySQL
    def __init__(self, dbpool):
        self.dbpool = dbpool
        self.conn = MongoClient('xxxx', 27017)  # MongoDB connection (optional)

    @classmethod
    def from_settings(cls, settings):
        # build a Twisted connection pool from the database parameters
        adbparams = adbparams_info
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return the instantiated pipeline
        return cls(dbpool)

    def process_item(self, item, spider):
        # insert the item into MySQL asynchronously with Twisted
        item1 = copy.deepcopy(item)
        # write to MySQL: specify the insert method and the data
        query = self.dbpool.runInteraction(self.do_insert, item1)
        query.addErrback(self.handle_error, item1)  # attach the error handler
        # write the position name to MongoDB
        self.mongo_insert(item['zhiwei'])
        return item

    def mongo_insert(self, job):
        db = self.conn.crawlab_test   # connect to the mongo database
        my_set = db.jobs              # target collection
        data_test1 = {
            'job': job
        }
        my_set.insert_one(data_test1)

    def handle_error(self, failure, item):
        # called when the asynchronous insert fails
        print(failure, "Database exception")

    def do_insert(self, cursor, item):
        insert_sql = """insert into lp_job_data(key_word,zhiwei,company,salary,diqu,fb_time,requirement,welfare, job_description,logo_url,industry,company_size,company_addr,type,c_type) VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','5','%s') """ % (
            item['key_word'], item['zhiwei'], item['company'], item['salary'], item['diqu'],
            item['fb_time'], item['requirement'], item['welfare'], item['job_description'].replace("'", '"'), item['log_url'],
            item['industry'], item['company_size'], item['company_addr'], item['c_type'])
        try:
            cursor.execute(insert_sql)
        except:
            insert_sql = emoji.demojize(insert_sql)  # replace emoji
            cursor.execute(insert_sql)

    def close_spider(self, spider):
        self.conn.close()



4.5 Middleware proxy configuration code
# -*- coding: utf-8 -*-

import json,time
import random
import requests
from liepin_spider import settings


class LiepinSpiderDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # set a random User-Agent header
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        # set the proxy here yourself

    def process_exception(self, request, exception, spider):
        # on an exception, pick a new random User-Agent and retry the request
        ua_random = random.choice(settings.USER_AGENT_LIST)
        request.headers['User-Agent'] = ua_random
        return request

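The comment in process_request leaves the proxy to you. A minimal sketch of one way to do it, using Scrapy's standard request.meta['proxy'] mechanism; ProxyUAMiddleware is a hypothetical name and the proxy address is a placeholder, not something from the original project:

# sketch: a downloader middleware that sets both a random User-Agent and a proxy
import random
from liepin_spider import settings


class ProxyUAMiddleware(object):
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(settings.USER_AGENT_LIST)
        # Scrapy's built-in HttpProxyMiddleware picks the proxy up from request.meta
        request.meta['proxy'] = 'http://user:password@proxy-host:port'  # placeholder proxy address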


4.6 Running Code
import os

def start_all():
    os.system('scrapy crawl liepin -a s_type="0" -a c_type="2"')  # all region data

# def start_type1():
#     os.system('scrapy crawl liepin -a s_type="1"')
#
# def start_type2():
#     os.system('scrapy crawl liepin -a s_type="2"')
#
# def start_type3():
#     os.system('scrapy crawl liepin -a s_type="3"')

def new_type():
    os.system('scrapy crawl liepin -a s_type="4" -a c_type="1"')  # 5 cities, some jobs

if __name__ == '__main__':
    # start_type1()
    # start_type2()
    # start_type3()
    new_type()


5. Run code


5.1 Let’s take a look at the result of running the code



5.2 The data is captured as expected. Finally, here are the job keywords, the region data, and the structure of the table that stores the crawl results.
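The table screenshots are not reproduced here, but the column names used in the code pin down the rough structure. Below is a sketch of table definitions consistent with the SQL in the spider and pipeline; the column types and lengths are my assumptions, not the original schema:

-- sketch of the three tables, reconstructed from the SQL in the code; types/lengths assumed
CREATE TABLE lp_job_names (
    zwmc VARCHAR(100),    -- job keyword
    type VARCHAR(2)       -- crawl status flag: 0 = not yet crawled, 1 = crawled
);

CREATE TABLE lp_job_areas (
    dqs  VARCHAR(20),     -- region code passed as the dqs parameter
    city VARCHAR(50),     -- city name
    type VARCHAR(2)       -- region group selected by s_type
);

CREATE TABLE lp_job_data (
    key_word VARCHAR(100), zhiwei VARCHAR(200), company VARCHAR(200), salary VARCHAR(100),
    diqu VARCHAR(50), fb_time VARCHAR(100), requirement VARCHAR(500), welfare VARCHAR(500),
    job_description TEXT, logo_url VARCHAR(500), industry VARCHAR(100),
    company_size VARCHAR(100), company_addr VARCHAR(200), type VARCHAR(2), c_type VARCHAR(2)
);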