Configurable crawler based on Scrapy, greatly improving work efficiency

Principle that

The rendered HTML based on Spalsh or Selenium is parsed through the configuration file and stored in the repository. Improved efficiency, can write dozens of configuration dict a day, that is, to complete the compilation of dozens of web crawlers.

Configuration file description:

{
                "industry_type": "Policy".# Industry category
                "website_type": "Central bank".# Website/wechat official account name
                "url_type": "People's Bank of China - Department of Regulations and Law - Normative Documents".# Website module
                "link": "http://www.pbc.gov.cn/tiaofasi/144941/3581332/index.html".# access link
                "article_rows_xpath": '//div[@id="r_con"]//table//tr/td/font[contains(@class, "newslist_style")]'.Extract the article list xpath object
                "title_xpath": "./a".# extract title
                "title_parse": "./@title".# extract title
                "title_link_xpath": "./a/@href".Extract the title link
                "date_re_switch": "False".Whether to use the re to extract the date and time
                "date_re_expression": "".Date and time regular expressions
                "date_xpath": "./following-sibling::span[1]".Extract date time
                "date_parse": "./text()".Extract date time
                "content": '//*[@class="content"]'.# Text HTML xpath
                "prefix": "http://www.pbc.gov.cn/".# link prefix
                "config": "{'use_selenium':'False'}"  # Other configuration: Whether to use Selenium (spalsh is used by default)
            },
Copy the code

Complete code Reference

# -*- coding: utf-8 -*-
' ''Demand list: Crawl central bank using any information site: http://www.pbc.gov.cn/tiaofasi/144941/3581332/index.html http://www.pbc.gov.cn/tiaofasi/144941/144959/index.html, the Ministry of Public Security:  https://www.mps.gov.cn/n2254314/n2254487/ https://www.mps.gov.cn/n2253534/n2253535/index.html http://www.qth.gov.cn/xxsbxt/sxdw/gajxx/ '' '
from risk_control_info.items import BIgFinanceNews
import dateparser

from w3lib.url import canonicalize_url
from urllib.parse import urljoin
import scrapy
from scrapy_splash import SplashRequest

from risk_control_info.utils import make_md5, generate_text, clean_string
import re

script = """ function main(splash, args) splash.images_enabled = false splash:set_user_agent("{ua}") assert(splash:go(args.url)) assert(splash:wait(args.wait)) return splash:html() end""".format(
    ua="Mozilla / 5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36")


class BigFinanceAllGovSpider(scrapy.Spider):
    name = 'big_finance_all_gov'
    custom_settings = {
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_DELAY': 60/360.0,'CONCURRENT_REQUESTS_PER_IP': 8,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            # 'risk_control_info. Middlewares. SplashProxyMiddleware' : 843, # proxy IP, this method is not successful
            'risk_control_info.middlewares.RandomUserAgentMiddleware': 843,
            'risk_control_info.middlewares.SeleniumMiddleware': 844}.# put in storage
        'ITEM_PIPELINES': {
            'risk_control_info.pipelines.RiskControlInfoPipeline': 401,
            'risk_control_info.pipelines.MysqlPipeline': 402},'SPIDER_MIDDLEWARES': {
            'risk_control_info.middlewares.RiskControlInfoSpiderMiddleware': 543,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
    }

    def __init__(self, **kwargs):
        super().__init__()
        self.env = kwargs.get('env'.'online')

    def start_requests(self):
        for target in self.target_info():
            if target.get('title_xpath') and target.get('title_link_xpath') \
                    and target.get('date_xpath') and target.get('article_rows_xpath'):
                self.logger.info(f"Target site configurable crawler :{target}")
                # using selenium
                if target.get("config") and eval(eval(target.get("config")).get('use_selenium')):
                    self.logger.info(f"Use Selenium request {target['link']}")
                    yield scrapy.Request(url=target['link'],
                                         meta={
                                             "target": target,
                                             "use_selenium": True
                                         },
                                         callback=self.parse,
                                         )
                else:
                    Use Splash by default
                    self.logger.info(f"Use the Splash request {target['link']}")
                    yield SplashRequest(url=target['link'],
                                        meta={"target": target},
                                        callback=self.parse,
                                        # endpoint='execute',
                                        # args={
                                        # 'lua_source': script,
                                        # 'wait': 8},
                                        endpoint='render.json',
                                        args={
                                            # 'lua_source': script,
                                            # 'proxy': f"http://{proxy_ip_dict['ip']}:{proxy_ip_dict['port']}",
                                            'wait': 10,
                                            'html': 1,
                                            'png': 1
                                        },
                                        )

    def parse(self, response):
        target = response.meta['target']
        article_rows = response.xpath(target['article_rows_xpath'])
        Walk through all text lists
        for article_row in article_rows:
            item = BIgFinanceNews()
            # handle the title
            _article_row = article_row.xpath(target['title_xpath'])  # position the title
            item['title'] = clean_string(
                generate_text(_article_row.xpath(target['title_parse']).extract_first().strip()))  # parse the title

            # handle links
            if target.get('prefix'):
                item['title_link'] = urljoin(target['prefix'], article_row.xpath(
                    target['title_link_xpath']).extract_first())
            else:
                item['title_link'] = article_row.xpath(target['title_link_xpath']).extract_first()

            # Handle the release date
            # date order rules
            date_order = "YMD"
            _title_time = article_row.xpath(target['date_xpath'])  # Location: release time
            _date_str = clean_string(
                generate_text(_title_time.xpath(target['date_parse']).extract_first()))  # Release date
            if not eval(target.get('date_re_switch')):
                item['title_time'] = dateparser.parse(_date_str, settings={'DATE_ORDER': date_order}).strftime(
                    "%Y-%m-%d")
            else:  Extract the time string using a regular expression, there is a default regular expression
                date_re_expression = target.get('date_re_expression', None)
                _expression = date_re_expression or r"(20\d{2}[-/]? \d{2}[-/]? \d{2})"
                results = re.findall(r"%s" % _expression, _date_str, re.S)
                self.logger.info(f"_date_str:{_date_str},results:{results} ")
                if results:
                    item['title_time'] = dateparser.parse(results[0], settings={'DATE_ORDER': date_order}).strftime(
                        "%Y-%m-%d")
                else:
                    item['title_time'] = None

            # Dead
            item['bi_channel'] = "gov"
            item['industry_type'] = f"{target['industry_type']}"
            item['website_type'] = f"{target['website_type']}"
            item['url_type'] = f"{target['url_type']}"
            item['title_hour'] = 0  # Original website did not publish time,0 instead
            item['source_type'] = 0  # Data source: 0 website Web, 1 wechat official account
            item['redis_duplicate_key'] = make_md5(item['title'] + canonicalize_url(item['title_link']))

            Request details page
            # using selenium
            if target.get("config") and eval(eval(target.get("config")).get('use_selenium')):
                self.logger.info(f"Use Selenium request {item['title_link']}")
                yield scrapy.Request(url=item['title_link'],
                                     meta={
                                         "target": target,
                                         "use_selenium": True,
                                         "item": item
                                     },
                                     callback=self.parse_detail,
                                     )
            else:
                # use Splash
                self.logger.info(f"Use Splash request {item['title_link']}")
                yield SplashRequest(url=item['title_link'],
                                    meta={
                                        "target": target,
                                        "item": item
                                    },
                                    callback=self.parse_detail,
                                    # endpoint='execute',
                                    # args={
                                    # 'lua_source': script,
                                    # 'wait': 8},
                                    endpoint='render.json',
                                    args={
                                        # 'lua_source': script,
                                        # 'proxy': f"http://{proxy_ip_dict['ip']}:{proxy_ip_dict['port']}",
                                        'wait': 20.'html': 1,
                                        'png': 1
                                    },
                                    )

    def parse_detail(self, response):
        self.logger.info(f"Processing details page {response.url}")
        item = response.meta['item']
        target = response.meta['target']
        print(response.xpath(target['content']))
        if response.xpath(target['content']):
            item['content'] = generate_text(response.xpath(target['content']).extract_first())
        else:
            item['content'] = ""

        yield item

    @staticmethod
    def target_info():
        ' ''Return to target site information'' '
        target_list = [
            {
                "industry_type": "Policy".# Industry category
                "website_type": "Central bank".# Website/wechat official account name
                "url_type": "People's Bank of China - Department of Regulations and Law - Normative Documents".# Website module
                "link": "http://www.pbc.gov.cn/tiaofasi/144941/3581332/index.html".# access link
                "article_rows_xpath": '//div[@id="r_con"]//table//tr/td/font[contains(@class, "newslist_style")]'.Extract the article list xpath object
                "title_xpath": "./a".# extract title
                "title_parse": "./@title".# extract title
                "title_link_xpath": "./a/@href".Extract the title link
                "date_re_switch": "False".Whether to use the re to extract the date and time
                "date_re_expression": "".Date and time regular expressions
                "date_xpath": "./following-sibling::span[1]".Extract date time
                "date_parse": "./text()".Extract date time
                "content": '//*[@class="content"]'.# Text HTML xpath
                "prefix": "http://www.pbc.gov.cn/".# link prefix
                "config": "{'use_selenium':'False'}"  # Other configuration: Whether to use Selenium (spalsh is used by default)
            },

            {
                "industry_type": "Policy".# Industry category
                "website_type": "Central bank".# Website/wechat official account name
                "url_type": "People's Bank of China - Department of Regulations and Law - Other Documents".# Website module
                "link": "http://www.pbc.gov.cn/tiaofasi/144941/144959/index.html".# access link
                "article_rows_xpath": '//div[@id="r_con"]//table//tr/td/font[contains(@class, "newslist_style")]'."title_xpath": "./a"."title_parse": "./@title"."title_link_xpath": "./a/@href"."date_re_switch": "False".Whether to use the re to extract the date and time
                "date_re_expression": "".Date and time regular expressions
                "date_xpath": "./following-sibling::span[1]"."date_parse": "./text()"."content": '//*[@class="content"]'.# Text HTML xpath
                "prefix": "http://www.pbc.gov.cn/"."config": "{'use_selenium':'False'}"
            },

            {
                "industry_type": "Policy".# Industry category
                "website_type": "Ministry of Public Security".# Website/wechat official account name
                "url_type": "Ministry of Public Security, PRC - Planning Program".# Website module
                "link": "https://www.mps.gov.cn/n2254314/n2254487/".# access link
                "article_rows_xpath": '//span/dl/dd'."title_xpath": "./a"."title_parse": "./text()"."title_link_xpath": "./a/@href"."date_re_switch": "True".# select date and time (2020-04-14)
                "date_re_expression": "".Date and time regular expressions
                "date_xpath": "./span"."date_parse": "./text()"."content": '//*[@class="arcContent center"]'.# Text HTML xpath
                "prefix": "https://www.mps.gov.cn/"."config": "{'use_selenium':'True'}"
            },

            {
                "industry_type": "Policy".# Industry category
                "website_type": "Ministry of Public Security".# Website/wechat official account name
                "url_type": "Ministry of Public Security of the People's Republic of China - Public Security Highlights".# Website module
                "link": "https://www.mps.gov.cn/n2253534/n2253535/index.html".# access link
                "article_rows_xpath": '//span/dl/dd'."title_xpath": "./a"."title_parse": "./text()"."title_link_xpath": "./a/@href"."date_re_switch": "True".# select date and time (2020-04-14)
                "date_re_expression": "".Date and time regular expressions
                "date_xpath": "./span"."date_parse": "./text()"."content": '//*[@class="arcContent center"]'.# Text HTML xpath
                "prefix": "https://www.mps.gov.cn/"."config": "{'use_selenium':'True'}"
            },

            {
                "industry_type": "Policy".# Industry category
                "website_type": "Ministry of Public Security".# Website/wechat official account name
                "url_type": Qitaihe Municipal People's Government - Information Reporting System - Municipal Units - Public Security Bureau.# Website module
                "link": "http://www.qth.gov.cn/xxsbxt/sxdw/gajxx/".# access link
                "article_rows_xpath": '//td[contains(text(), "Public security bureau")] / parent: : tr/parent: : tbody/parent: : table/parent: : td/parent: : tr/following: : tr [1] / td/table/tr/td/a/parent: : td/paren t::tr'."title_xpath": "./td/a"."title_parse": "./@title"."title_link_xpath": "./td/a/@href"."date_re_switch": "False".# select date and time (2020-04-14)
                "date_re_expression": "".Date and time regular expressions
                "date_xpath": "./td[3]"."date_parse": "./text()"."content": '//*[@class="TRS_Editor"]'.# Text HTML xpath
                "prefix": "http://www.qth.gov.cn/xxsbxt/sxdw/gajxx/"."config": "{'use_selenium':'False'}"},]for target in target_list:
            yield target

Copy the code

Configurable crawler based on Scrapy, greatly improving work efficiency

Principle that

Configuration file description:

Complete code Reference

Related Posts

InnoDB read InnoDB read InnoDB read InnoDB read InnoDB read InnoDB read InnoDB read InnoDB

Beginners will be Linux command – file view

<JVM Part 1: Memory and Garbage Collection >04- Virtual machine stack