First of all, I have run all of this code myself and it works. If you hit an error or any other problem, feel free to leave a comment, just keep it civil, thanks. The code was found on GitHub and modified by me. If there are any shortcomings, you are welcome to point them out, thank you.
Tools used
Development environment: Windows 10, Python 3.7
Development tool: PyCharm
If you need the PyCharm installation package or an activation code, or are not sure how to install it, you can leave a message.
Three key points of a crawler:
- request
- parsing
- storage
Requests can be made with urllib, which ships with the Python standard library, or with requests, a more powerful third-party library (built on urllib3).
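As a quick illustration (this snippet is mine, not one of the scripts below; the URL is just a placeholder), the same GET request looks like this with the standard-library urllib versus the third-party requests:
import urllib.request

import requests  # third-party: pip install requests

url = 'https://example.com'  # placeholder URL

# Standard library: you get bytes back and decode them yourself
with urllib.request.urlopen(url) as r:
    html_urllib = r.read().decode('utf-8')

# requests: decoding, headers, sessions and so on are handled more conveniently
html_requests = requests.get(url).text

print(len(html_urllib), len(html_requests))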
For parsing I use either regular expressions or XPath. This time I went with regular expressions, mainly to get more practice: reading explanations of regex never quite clicks for me, so I have to try it out myself.
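For instance, here is a minimal sketch of my own (the HTML snippet is made up) that extracts the same titles once with a regular expression and once with XPath via lxml:
import re

from lxml import etree  # third-party: pip install lxml

html = '<ul><li><a title="Movie A">A</a></li><li><a title="Movie B">B</a></li></ul>'

# Regular expression: a lookbehind grabs whatever follows title="
titles_re = re.findall(r'(?<=title=")[^"]+', html)

# XPath: address the attribute through the document structure instead
titles_xpath = etree.HTML(html).xpath('//a/@title')

print(titles_re)     # ['Movie A', 'Movie B']
print(titles_xpath)  # ['Movie A', 'Movie B']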
For storage, results usually go to memory, a database, or the hard disk; here they are saved to the local hard disk.
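As a rough sketch of the storage options (my own example, not from the scripts below; it assumes a MongoDB instance is running on localhost, and the records and collection names are made up), the same parsed results can be written to a JSON file on disk or inserted into a MongoDB collection:
import json

import pymongo  # third-party: pip install pymongo

records = [{'title': 'Movie A'}, {'title': 'Movie B'}]  # made-up parsed results

# Hard disk: dump the records to a JSON file
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

# Database: insert the same records into a local MongoDB collection
col = pymongo.MongoClient('localhost')['demo_db']['demo_col']
col.insert_many(records)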
Python crawler: Crawl pictures of the Fast & Furious 8 cast from Douban Movie
Code
import urllib.request
import os
import re


def douban(url):
    r = urllib.request.urlopen(url)
    html = r.read().decode('utf-8')
    # all celebrity portrait links on the page (non-greedy so each match stops at its own .jpg)
    result = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    # names taken from the title="..." attributes
    result2 = re.findall(r'(?<=title=").\S+', html)
    # drop matches that are not actor names (position-specific to this page)
    result2.pop()
    # de-duplicate while keeping the original order
    result3 = sorted(set(result2), key=result2.index)
    result3.pop(-3)
    if not os.path.exists('douban'):
        os.makedirs('douban')
    i = 0
    for link in result:
        filename = 'douban\\' + str(result3[i]) + '.jpg'
        i += 1
        # urlretrieve downloads and writes the file itself, no need to open it beforehand
        urllib.request.urlretrieve(link, filename)


url = 'https://movie.douban.com/subject/26260853/celebrities'

if __name__ == '__main__':
    douban(url)
Python crawler: Save Douyu barrage (danmaku) information to MongoDB
# Crawl the Douyu bullet screen (danmaku) and store each user's UID, nickname, level and barrage text in MongoDB.
__author__ = '布咯咯_rieuse'
__time__ = '2017.6.2'
__github__ = 'https://github.com/rieuse'

import multiprocessing
import re
import socket
import time

import pymongo
import requests
from bs4 import BeautifulSoup

clients = pymongo.MongoClient('localhost')
db = clients["DouyuTV_danmu"]
col = db["info"]

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("openbarrage.douyutv.com")
port = 8601
client.connect((host, port))

danmu_path = re.compile(b'txt@=(.+?)/cid@')
uid_path = re.compile(b'uid@=(.+?)/nn@')
nickname_path = re.compile(b'nn@=(.+?)/txt@')
level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')


def sendmsg(msgstr):
    msg = msgstr.encode('utf-8')
    data_length = len(msg) + 8
    code = 689
    # header: packet length twice plus the message-type code, each as a 4-byte little-endian integer
    msgHead = int.to_bytes(data_length, 4, 'little') \
              + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
    client.send(msgHead)
    sent = 0
    while sent < len(msg):
        tn = client.send(msg[sent:])
        sent = sent + tn


def start(roomid):
    msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
    sendmsg(msg)
    msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
    sendmsg(msg_more)
    print('--------------- Welcome, connected to the room of {} ---------------'.format(get_name(roomid)))
    while True:
        data = client.recv(1024)
        uid_more = uid_path.findall(data)
        nickname_more = nickname_path.findall(data)
        level_more = level_path.findall(data)
        danmu_more = danmu_path.findall(data)
        if not level_more:
            level_more = [b'0']  # wrap in a list so level_more[0] below still works
        if not data:
            break
        else:
            for i in range(0, len(danmu_more)):
                try:
                    product = {
                        'uid': uid_more[0].decode(encoding='utf-8'),
                        'nickname': nickname_more[0].decode(encoding='utf-8'),
                        'level': level_more[0].decode(encoding='utf-8'),
                        'danmu': danmu_more[0].decode(encoding='utf-8')
                    }
                    print(product)
                    col.insert(product)
                    print('Inserted into MongoDB')
                except Exception as e:
                    print(e)


def keeplive():
    # the server drops the connection without a regular heartbeat
    while True:
        msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
        sendmsg(msg)
        time.sleep(15)


def get_name(roomid):
    r = requests.get("http://www.douyu.com/" + roomid)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.find('a', {'class': 'zb-name'}).string


if __name__ == '__main__':
    room_id = input('Please enter the room ID: ')
    p1 = multiprocessing.Process(target=start, args=(room_id,))
    p2 = multiprocessing.Process(target=keeplive)
    p1.start()
    p2.start()
Python crawler: Crawl Ximalaya FM audio
__author__ = '布咯咯_rieuse'
import json
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree
clients = pymongo.MongoClient('localhost')
db = clients["XiMaLaYa"]
col1 = db["album2"]
col2 = db["detaile2"]
UA_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}
headers2 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.ximalaya.com/dq/all/2',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(UA_LIST)
}
def get_url():
    start_urls = ['http://www.ximalaya.com/dq/all/{}'.format(num) for num in range(1, 85)]
    for start_url in start_urls:
        html = requests.get(start_url, headers=headers1).text
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find_all(class_="albumfaceOutter"):
            content = {
                'href': item.a['href'],
                'title': item.img['alt'],
                'img_url': item.img['src']
            }
            col1.insert(content)
            print('Saved one channel: ' + item.a['href'])
            print(content)
            another(item.a['href'])
        time.sleep(1)


def another(url):
    html = requests.get(url, headers=headers2).text
    ifanother = etree.HTML(html).xpath('//div[@class="pagingBar_wrapper"]/a[last()-1]/@data-page')
    if len(ifanother):
        num = ifanother[0]
        print('This channel has ' + num + ' pages of resources')
        for n in range(1, int(num)):
            print('Parsing page {} of {}'.format(n, num))
            url2 = url + '?page={}'.format(n)
            get_m4a(url2)
    get_m4a(url)


def get_m4a(url):
    time.sleep(1)
    html = requests.get(url, headers=headers2).text
    numlist = etree.HTML(html).xpath('//div[@class="personal_body"]/@sound_ids')[0].split(',')
    for i in numlist:
        murl = 'http://www.ximalaya.com/tracks/{}.json'.format(i)
        html = requests.get(murl, headers=headers1).text
        dic = json.loads(html)
        col2.insert(dic)
        print('Data from ' + murl + ' has been inserted into MongoDB')


if __name__ == '__main__':
    get_url()
Python crawler: Packet-capture analysis to crawl all internship listings from Shixiseng
import json
import time

import pymongo
import requests

clients = pymongo.MongoClient('localhost')
db = clients["Shixiseng"]
col = db["detail_info"]

urls = ['http://www.shixiseng.com/app/internsvt?c=%E5%85%A8%E5%9B%BD&p={}&t=hot'.format(n) for n in range(1, 3487)]

for url in urls:
    print(url)
    r = requests.get(url)
    html = r.content.decode('utf-8')
    # the listing data comes back as JSON; the records live under msg -> b
    content = json.loads(html)['msg']['b']
    for i in content:
        print('Inserting record:')
        print(i)
        col.insert(i)
        time.sleep(0.01)
Python crawler: Batch-download HD images from Huaban and save them
__author__ = '布咯咯_rieuse'

import os
import lxml.html
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 5)
browser.set_window_size(1400, 900)


def parser(url, param):
    # open the page and wait until the element selected by `param` is present
    browser.get(url)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
    html = browser.page_source
    doc = lxml.html.fromstring(html)
    return doc


def get_main_url():
    print('Fetching the list of boards...')
    try:
        doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall')
        name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for item, fileName in zip(u, name):
            main_url = 'http://huaban.com' + item
            print('Board: ' + main_url)
            if '*' in fileName:
                fileName = fileName.replace('*', '')
            download(main_url, fileName)
    except Exception as e:
        print(e)


def download(main_url, fileName):
    print('-------------- preparing to download --------------')
    try:
        doc = parser(main_url, '#waterfall')
        if not os.path.exists('image\\' + fileName):
            print('Creating folder...')
            os.makedirs('image\\' + fileName)
        link = doc.xpath('//*[@id="waterfall"]/div/a/@href')
        # print(link)
        i = 0
        for item in link:
            i += 1
            minor_url = 'http://huaban.com' + item
            doc = parser(minor_url, '#pin_view_page')
            img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src')
            img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src')
            img_url += img_url2
            try:
                url = 'http:' + str(img_url[0])
                print('Image ' + str(i) + ': ' + url)
                r = requests.get(url)
                filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg'
                with open(filename, 'wb') as fo:
                    fo.write(r.content)
            except Exception:
                print('Error downloading this image!')
    except Exception:
        print('Error downloading this board')


if __name__ == '__main__':
    get_main_url()
Python crawler: Crawl V2EX data and save it to CSV
import csv
import re

import requests
from bs4 import BeautifulSoup

url = 'https://www.v2ex.com/?tab=all'
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
articles = []

for article in soup.find_all(class_='cell item'):
    title = article.find(class_='item_title').get_text()
    category = article.find(class_='node').get_text()
    author = re.findall(r'(?<=<a href="/member/).+(?="><img)', str(article))[0]
    u = article.select('.item_title > a')
    link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
    articles.append([title, category, author, link])

with open(r'document\v2ex.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'category', 'author', 'link'])
    for row in articles:
        writer.writerow(row)
Python crawler: Wandoujia Design Awards, a speed comparison of three crawling methods
__author__ = '布咯咯_rieuse'
import asyncio
import random
import time
import aiohttp
import pymongo
import requests
import multiprocessing
from bs4 import BeautifulSoup
# Shared setup: MongoDB connection, URL list, headers and proxies
clients = pymongo.MongoClient('localhost')
db = clients["wandoujia"]
col = db["info"]
urls = ['http://www.wandoujia.com/award?page={}'.format(num) for num in range(1, 46)]
UA_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Connection': 'keep-alive',
    'Host': 'www.wandoujia.com',
    'User-Agent': random.choice(UA_LIST)
}
proxies = {
    'http': 'http://123.206.6.17:3128',
    'https': 'http://123.206.6.17:3128'
}
# Method 1: plain requests
def method_1():
    start = time.time()
    for url in urls:
        html = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert(content)
            print('Inserted one record: ' + str(content))
    print('Total time: ' + str(time.time() - start))

# if __name__ == '__main__':
#     method_1()

# Method 2: requests + multiprocessing Pool
def method_2(url):
    html = requests.get(url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(class_='title')
    app_title = soup.find_all(class_='app-title')
    item_cover = soup.find_all(class_='item-cover')
    icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
    for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
        content = {
            'title': title_i.get_text(),
            'app_title': app_title_i.get_text(),
            'item_cover': item_cover_i['data-original'],
            'icon_cover': icon_cover_i['data-original']
        }
        # time.sleep(1)
        col.insert(content)
        print('Inserted one record: ' + str(content))

# if __name__ == '__main__':
#     start = time.time()
#     pool = multiprocessing.Pool(4)
#     pool.map(method_2, urls)
#     pool.close()
#     pool.join()
#     print('Total time: ' + str(time.time() - start))

# Method 3: asyncio + aiohttp, the asynchronous IO approach available since Python 3.4
def method_3():
    async def get_url(url):
        async with aiohttp.ClientSession() as session:  # the async keyword declares a coroutine function; calling it returns a coroutine object
            async with session.get(url) as html:
                response = await html.text(encoding="utf-8")  # await suspends the coroutine until the async IO result is ready
                return response

    async def parser(url):
        html = await get_url(url)
        soup = BeautifulSoup(html, 'lxml')
        title = soup.find_all(class_='title')
        app_title = soup.find_all(class_='app-title')
        item_cover = soup.find_all(class_='item-cover')
        icon_cover = soup.select('div.list-wrap > ul > li > div.icon > img')
        for title_i, app_title_i, item_cover_i, icon_cover_i in zip(title, app_title, item_cover, icon_cover):
            content = {
                'title': title_i.get_text(),
                'app_title': app_title_i.get_text(),
                'item_cover': item_cover_i['data-original'],
                'icon_cover': icon_cover_i['data-original']
            }
            col.insert(content)
            print('Inserted one record: ' + str(content))

    start = time.time()
    loop = asyncio.get_event_loop()
    tasks = [parser(url) for url in urls]
    loop.run_until_complete(asyncio.gather(*tasks))
    print(time.time() - start)


if __name__ == '__main__':
    method_3()
Python crawler: Use lxml to parse HTML and output the corresponding values
import requests
import lxml.html

url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml'
html = requests.get(url).text
doc = lxml.html.fromstring(html)
titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()')
href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href')
i = 0
for content in titles:
    results = {
        'title': titles[i],
        'link': href[i]
    }
    i += 1
    print(results)
Python crawler: Use Selenium to crawl dynamic data from Yidian Zixun
import csv

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Firefox()
driver.implicitly_wait(3)
first_url = 'http://www.yidianzixun.com/channel/c6'
driver.get(first_url)
driver.find_element_by_class_name('icon-refresh').click()
# keep pressing DOWN so the page lazy-loads more articles
for i in range(1, 90):
    driver.find_element_by_class_name('icon-refresh').send_keys(Keys.DOWN)

soup = BeautifulSoup(driver.page_source, 'lxml')
articles = []
for article in soup.find_all(class_='item doc style-small-image style-content-middle'):
    title = article.find(class_='doc-title').get_text()
    source = article.find(class_='source').get_text()
    comment = article.find(class_='comment-count').get_text()
    link = 'http://www.yidianzixun.com' + article.get('href')
    articles.append([title, source, comment, link])
driver.quit()

with open(r'document\yidian.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'source', 'comments', 'link'])
    for row in articles:
        writer.writerow(row)
Python crawler: Selenium + XPath + BS4 to crawl Amazon data and save it to MongoDB
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from bs4 import BeautifulSoup
import lxml.html
import pymongo
import re

MONGO_URL = 'localhost'
MONGO_DB = 'amazon'
MONGO_TABLE = 'amazon-python'
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = 'python'

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search():
    print('Searching...')
    try:
        browser.get('https://www.amazon.cn/')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input')))
        input.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled')))
        get_products()
        print('There are ' + total.text + ' pages in total')
        return total.text
    except TimeoutException:
        return search()


def next_page(number):
    print('Turning to page', number)
    try:
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#pagnNextString'), '下一页'))
        submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString')))
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '.pagnCur'), str(number)))
        get_products()
    except TimeoutException:
        next_page(number)


def get_products():
    try:
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf')))
        html = browser.page_source
        soup = BeautifulSoup(html, 'lxml')
        doc = lxml.html.fromstring(html)
        date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()')
        content = soup.find_all(attrs={"id": re.compile(r'result_\d+')})
        for item, time in zip(content, date):
            product = {
                'title': item.find(class_='s-access-title').get_text(),
                'image': item.find(class_='s-access-image cfMarker').get('src'),
                'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(),
                'date': time
            }
            # save_to_mongo(product)
            print(product)
    except Exception as e:
        print(e)


def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert(result):
            print('Saved to MongoDB:', result)
    except Exception:
        print('Failed to save to MongoDB:', result)


def main():
    try:
        total = int(search())
        for i in range(2, total + 1):
            next_page(i)
    except Exception as e:
        print('Error:', e)
    finally:
        browser.close()


if __name__ == '__main__':
    main()
Python crawler: Fetch the Heilongjiang University (HLJU) portal captcha and log in
import requests
from PIL import Image
from bs4 import BeautifulSoup

url1 = 'http://my.hlju.edu.cn/captchaGenerate.portal?'
url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
url3 = 'http://my.hlju.edu.cn/index.portal'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}

s = requests.Session()
# fetch the captcha image inside the session so the cookie matches the login request
response = s.get(url1, headers=headers)
html = response.text
soup = BeautifulSoup(html, 'html.parser')
with open('img\\code.jpg', 'wb') as f:  # assumes an img folder exists
    f.write(response.content)
img = Image.open('img\\code.jpg')
img.show()

data = {}
data['Login.Token1'] = '20154433'
data['Login.Token2'] = ''  # fill in the password here
data['captcha'] = input('Enter the captcha: ')
data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'
response2 = s.post(url=url2, data=data, headers=headers)
response3 = s.get(url3, headers=headers)
print(response3.text)
A few final words
I am **White and White i**, a programmer who loves sharing knowledge ❤️
If you are new to programming and find this blog hard to follow, or if you want to learn, you can leave a message or check out my homepage. [Thank you very much for your likes, favorites, follows and comments!]