1. Find the data source and get the URL
Ctrip travel website
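The listing URL used later in step 7 carries the destination as a `cityname` query parameter (the encoded value in that example is Beijing). Below is a minimal sketch of building such a URL for any destination; the base URL and parameter name are taken from step 7, the helper function itself is just an illustration:

```python
from urllib.parse import quote

# Base listing URL as it appears in step 7; startcity/salecity are left at 1
BASE = 'https://vacations.ctrip.com/?startcity=1&salecity=1&cityname='

def listing_url(destination: str) -> str:
    # URL-encode the destination name (e.g. a Chinese city name) and append it
    return BASE + quote(destination)

print(listing_url('北京'))  # reproduces the Beijing URL used in step 7
```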
2. Import required modules
from selenium import webdriver
import time
from pc import data_bool
import random
from selenium.webdriver.common.keys import Keys
3. Instantiate the object
```python
bool_ = data_bool()  # instantiate the helper class from pc.py
```
4. Write the list of regions and provinces to be typed into the search box
```python
# The set of regions and provinces to be typed into the search box
diqu = ['Beijing', 'Shanghai', 'Shenzhen', 'Chengdu', 'Guangzhou', 'Hangzhou', 'Sanya', 'Haikou',
        'Urumqi', 'Lhasa', 'Wuhan', 'Nanjing', "Xi'an", 'Southern Xinjiang', 'Dali', 'Shangri-La',
        'Harbin', 'Xishuangbanna', 'Gansu', 'Zhejiang', 'Tianjin', 'Jiangsu', 'Fujian', 'Chongqing',
        'Hunan', 'Liaoning', 'Shandong', 'Inner Mongolia', 'Anhui', 'Sichuan', 'Hubei', 'Hainan',
        'Hebei', 'Qinghai', 'Shaanxi', 'Jiangxi', 'Ningxia', 'Yunnan', 'Jilin', 'Henan', 'Guangxi',
        'Heilongjiang', 'Shanxi', 'Xinjiang', 'Guizhou', 'Tibet']
```
5. Set up the scraping function, with exception handling
Viewing Parameter Data
! [](https://p6-tt-ipv6.byteimg.com/origin/pgc-image/73d08fa140b24ca9ad6344db31161ecb)
```python
def result(driver):
    parent = driver.find_element_by_class_name('main_col')
    try:
        str_introduce = parent.find_elements_by_css_selector('p.list_product_title')
    except:
        str_introduce = []
        print('product titles came back empty')
    try:
        str_product = parent.find_elements_by_css_selector('p.list_product_tip')
    except:
        str_product = []
        print('product tips came back empty')
    try:
        list_box = parent.find_elements_by_css_selector('.list_label_box')
        list_box_item = [i.find_elements_by_css_selector('span') for i in list_box]
    except:
        list_box_item = []
        print('label list came back empty')
    try:
        list_retail = parent.find_elements_by_css_selector('p.list_product_retail')
    except:
        list_retail = []
        print('supplier info came back empty')
    try:
        str_grade = parent.find_elements_by_css_selector('p.list_change_grade')
    except:
        str_grade = []
        print('grades came back empty')
    try:
        str_number = parent.find_elements_by_css_selector('div.list_change_one')
    except:
        str_number = []
        print('review counts came back empty')
    try:
        str_remark = parent.find_elements_by_css_selector('div.list_change_two')
    except:
        str_remark = []
        print('remarks came back empty')
    try:
        str_price = parent.find_elements_by_css_selector('span.list_sr_price')
    except:
        str_price = []
        print('prices came back empty')

    rank = []
    for i in range(len(str_introduce)):
        try:
            introduce = str_introduce[i].text          # product title
        except:
            introduce = 'no'
        try:
            retail = list_retail[i].text               # supplier
        except:
            retail = 'no'
        try:
            grade = str_grade[i].text                  # rating information
        except:
            grade = '0'
        try:
            number = str_number[i].text                # number of reviews
        except:
            number = '0'
        try:
            remark = str_remark[i].text                # remark / comment info
        except:
            remark = '0'
        try:
            price = str_price[i].text                  # price
        except:
            price = '0'
        try:
            box = [span.text for span in list_box_item[i]]  # label tags
        except:
            box = []
        rank.append([introduce, retail, grade, number, remark, price, box])

    bool_.MemoryCsv(rank, 'electronic data.csv', 'a')  # append this page to the CSV
    try:
        parent.find_element_by_css_selector('a.down').click()  # click through to the next page
    except:
        print('data lost\n')
```
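Note that the `find_element_by_*` / `find_elements_by_*` helpers used throughout this post were removed in Selenium 4, so on a current Selenium release you need the `By` locators instead. A minimal sketch of the equivalent calls, reusing the same class names and CSS selectors as above:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://vacations.ctrip.com/?startcity=1&salecity=1&cityname=%E5%8C%97%E4%BA%AC')

# Selenium 4 equivalents of the deprecated find_element_by_* helpers
parent = driver.find_element(By.CLASS_NAME, 'main_col')
str_introduce = parent.find_elements(By.CSS_SELECTOR, 'p.list_product_title')
str_price = parent.find_elements(By.CSS_SELECTOR, 'span.list_sr_price')
print(len(str_introduce), 'products found on this page')
driver.quit()
```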
6. Define the function to grab the first few pages
```python
def range_(begin, end, driver):
    for i in range(begin, end):
        result(driver)                       # scrape the current page (result() also clicks to the next page)
        time.sleep(random.randint(20, 25))   # random pause between pages
        print(f'page {i} done')              # print the page number
```
7. Write the click events: automatically open the web page and search on Ctrip
```python
def t(x, x1, x2):
    for index, i in enumerate(x[x1:x2]):
        driver = webdriver.Chrome()
        time.sleep(3)
        driver.get('https://vacations.ctrip.com/?startcity=1&salecity=1&cityname=%E5%8C%97%E4%BA%AC')
        # type the destination into the search box and submit
        input_tag = driver.find_element_by_class_name('search_txt')
        input_tag.send_keys(i)
        input_tag.send_keys(Keys.ENTER)
        time.sleep(3)
        range_(0, 35, driver)    # crawl up to 35 result pages for this destination
        driver.close()
        print(f'{i} complete')
        time.sleep(3)
```
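Fixed `time.sleep()` calls are fragile when the page is slow to render. An alternative is Selenium's explicit waits; here is a minimal sketch, assuming the same `main_col` container class used above (`WebDriverWait` and `expected_conditions` are part of the standard Selenium API):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://vacations.ctrip.com/?startcity=1&salecity=1&cityname=%E5%8C%97%E4%BA%AC')
# block for up to 20 seconds until the product list container is present
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'main_col'))
)
driver.quit()
```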
8. Import the threading module and split the work across threads
```python
import threading

# each thread works on its own slice of the destination list
t1 = threading.Thread(target=t, args=(diqu, 11, 23))
t2 = threading.Thread(target=t, args=(diqu, 23, 31))
t3 = threading.Thread(target=t, args=(diqu, 31, len(diqu)))
t1.start()
t2.start()
t3.start()
```
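If anything needs to run after the scraping finishes (closing files, post-processing the CSV), the main thread should wait for the workers with `join()`. A minimal, self-contained sketch, with a stand-in worker in place of the real `t` function:

```python
import threading
import time

def worker(name, start, end):
    # stand-in for the scraping function t(x, x1, x2)
    time.sleep(1)
    print(f'{name} finished slice {start}:{end}')

threads = [
    threading.Thread(target=worker, args=('t1', 11, 23)),
    threading.Thread(target=worker, args=('t2', 23, 31)),
    threading.Thread(target=worker, args=('t3', 31, 47)),
]
for th in threads:
    th.start()
for th in threads:
    th.join()   # block until every thread is done
print('all scraping threads finished')
```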
pc.py is a separate Python file. It contains the helper code for building request headers and for storing the scraped data.
! [](https://p1.pstatp.com/origin/pgc-image/1e38f123ac854858acbe8ed3698bb294)
```python
import requests
import chardet
import random
import csv
from openpyxl import Workbook
from pandas import DataFrame, Series


class data_bool():
    def __init__(self):
        pass

    def get_html(self, url):
        # pick a random User-Agent so requests look less like a bot
        user_agent = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        ]
        headers = {'User-Agent': random.choice(user_agent)}
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        return response.text

    def MemoryCsv(self, data: list, fileName: str, mode='w'):
        # write (or append) each row of data to a CSV file
        with open(fileName, mode=mode, encoding='utf-8', newline='') as f:
            csvfile = csv.writer(f)
            for each in data:
                csvfile.writerow(each)
        print(fileName, 'saved successfully')

    def MemoryExcel(self, data, fileName):
        wb = Workbook()
        sheet = wb.active
        for each in data:
            sheet.append(each)
        wb.save(fileName)
        print(fileName, 'saved successfully')

    def Memonry_pandas_csv(self, data: list, fileName, mode='w'):
        DataFrame(data).to_csv(fileName, mode=mode, header=False, index=False)
```
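A quick usage sketch of these helpers; the file names and row values below are just placeholders, chosen to mirror how `result()` stores each page:

```python
from pc import data_bool

bool_ = data_bool()

# fetch a page of HTML with a randomized User-Agent
html = bool_.get_html('https://vacations.ctrip.com/')

# append a row to a CSV, in the same shape result() produces
rows = [['title', 'supplier', '4.9', '120', '98%', '1999', ['tag1', 'tag2']]]
bool_.MemoryCsv(rows, 'demo.csv', 'a')

# or write the same rows through pandas
bool_.Memonry_pandas_csv(rows, 'demo_pandas.csv', 'a')
```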
Did you learn it? If you run into any problems, feel free to leave a comment and discuss.