• Baidu Library has some download coupons to download the information
  • But actually the library allows us to preview, but not copy the content
  • We just need the text content inside, there is no requirement for the style of content, right

Running on Windows

Functions achieved by the downloader:

1. According to the entered URL, the system automatically determines the document type and saves the downloaded resources in the corresponding folder

2. Automatically convert PPT documents into pictures and save them in the original order

3. Extract the plain text from PDF, Word and TXT documents, discarding all formatting, and save it in TXT format

Implementation effect

The data source of the downloader

Analyze the source code of the page where the resource is located, obtain the interface to request the resource, request the resource with the Requests library, then manually implement the text splicing rules, and finally output the text content to the folder in the same directory as the script

Word documents

PPT type documents

TXT documents

Implementation source code:

import os
import re
import json
import requests
from lxml import etree

# Create the library base class
class BaiduWK(object):
	"""Base class for a Baidu Wenku page.

	Fetches the document page and extracts the document type
	(ppt/doc/pdf/txt) and title from the inline JS config embedded
	in the page source.
	"""

	def __init__(self, url):
		self.title = None
		self.url = url
		self.docType = None
		self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'}
		# The original fetched the page here and threw the result away,
		# then fetched it again inside get_doc_type_and_title(); one
		# fetch is enough.
		self.get_doc_type_and_title()

	def get_response_content(self, url):
		"""GET *url* with browser-like headers and return the raw body
		bytes, or None if the request failed (the error is printed)."""
		try:
			response = requests.get(url, headers=self.headers)
			return response.content
		except Exception as e:
			print(e)
			return None

	@staticmethod
	def _find_js_value(content, key):
		"""Return the first single-quoted value that follows *key* in the
		page's inline JS config, e.g. "'docType': 'txt'," -> 'txt'.

		NOTE(review): the exact regex was garbled in the original
		listing; this reconstruction matches "key ... : ... 'value'" --
		confirm against a live page.
		"""
		return re.findall(r"%s.*?:.*?'(.*?)'" % key, content)[0]

	def get_doc_type_and_title(self):
		"""Fetch the page and populate self.docType and self.title."""
		# Wenku pages are GBK-encoded.
		source_html = self.get_response_content(self.url)
		content = source_html.decode('gbk')
		self.docType = self._find_js_value(content, 'docType')
		self.title = self._find_js_value(content, 'title')

# Create a class that gets TXT
class BDWKTXT(BaiduWK):
	"""Downloader for plain-text (txt) Wenku documents."""

	def __init__(self, url):
		super().__init__(url)
		# 24-character hex document id, filled by get_txt().
		self.docId = None

	@staticmethod
	def _splice_text(inner_json):
		"""Concatenate the stripped paragraph texts out of *inner_json*,
		the comma-separated run of page objects taken from inside the
		JSONP array, e.g. '{"parags":[{"c":" a "}]},{"parags":[{"c":"b"}]}'.
		"""
		# Splitting on '},' eats each closing brace; the trailing ','
		# plus [:-1] drops the empty final piece, and '}' is re-added
		# before parsing each page object.
		pieces = (inner_json + ",").split('},')[:-1]
		result_txt = ""
		for piece in pieces:
			page = json.loads(piece + "}")
			result_txt += page["parags"][0]["c"].strip()
		return result_txt

	def get_txt(self, url):
		"""Download the text document behind *url* and save it to
		./<docType>/<title>.txt."""
		source_html = self.get_response_content(url)
		content = source_html.decode("gbk")
		# Extract the 24-character document id from the page source.
		self.docId = re.findall(r"docId.*?(\w{24})", content)[0]
		token_url = "https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=" + self.docId
		first_json = self.get_response_content(token_url).decode()
		# Strip the JSONP wrapper: cb({...}) -> {...}
		str_first_json = re.match(r".*?\((\{.*\})\).*", first_json).group(1)
		the_first_json = json.loads(str_first_json)
		md5sum = the_first_json["md5sum"]
		rn = the_first_json["docInfo"]["totalPageNum"]
		rsign = the_first_json["rsign"]
		# NOTE(review): md5sum appears to already carry the leading
		# "?md5sum=...&sign=..." query (see the sample URL in the
		# article) -- confirm against the live API.  rn is an int and
		# must be stringified before concatenation (TypeError in the
		# original).
		target_url = ("https://wkretype.bdimg.com/retype/text/" + self.docId
			+ md5sum + "&callback=cb" + "&pn=1&rn=" + str(rn)
			+ "&type=txt" + "&rsign=" + rsign)
		sec_json = self.get_response_content(target_url).decode()
		# Strip the JSONP wrapper around the page array: cb([...])
		str_sec_json = re.match(r".*?\(\[(.*)\]\)$", sec_json).group(1)
		result_txt = self._splice_text(str_sec_json)
		# Create the per-type output directory (ok if it already exists).
		path = "." + os.sep + self.docType
		os.makedirs(path, exist_ok=True)
		# Save the text; utf-8 so Chinese content survives on Windows.
		try:
			file_name = path + os.sep + self.title + ".txt"
			with open(file_name, 'w', encoding='utf-8') as f:
				f.write(result_txt)
				print("Saved as :", self.title + '.txt')
		except OSError as e:
			print(e)



# Create a class that gets Word
class BDWKDOC(BaiduWK):
	"""Downloader for Word and PDF Wenku documents (text content only)."""

	def __init__(self, url):
		super().__init__(url)
		# URLs of the JSON fragments that hold the document text.
		self.pure_addr_list = list()

	def get_pure_addr_list(self):
		"""Scrape the page source for the wkbos.bdimg.com JSON fragment
		URLs, store the cleaned list on the instance and return it."""
		source_html = self.get_response_content(self.url).decode('gbk')
		# Batch-extract the data URLs from the page source.
		all_addr = re.findall(r'wkbos\.bdimg\.com.*?json.*?expire.*?\}', source_html)
		pure_addr_list = list()
		# The document title comes from the page's <title> element here.
		self.title = etree.HTML(source_html).xpath("//title/text()")[0].strip()
		for addr in all_addr:
			# The URLs are JS-escaped in the page source; unescape the
			# slashes and drop the trailing 5 characters after ".json".
			# NOTE(review): the exact escape sequence was garbled in the
			# original listing -- confirm "\\\/" against a live page.
			addr = "https://" + addr.replace("\\\\\\/", "/")
			addr = addr[:-5]
			pure_addr_list.append(addr)
		self.pure_addr_list = pure_addr_list
		return pure_addr_list

	def get_json_content(self, url_list):
		"""Download every fragment in *url_list*, splice the "c" text
		pieces together and save the result to ./<docType>/<title>.txt."""
		# Start from "" (the original's ' ' left a stray leading space).
		result = ""
		total = len(url_list)
		for i, pure_addr in enumerate(url_list, start=1):
			print("Downloading %d data, remaining %d" % (i, total - i))
			try:
				content = self.get_response_content(pure_addr).decode()
				# Strip the JSONP wrapper: wrapper(...) -> ...
				content = re.match(r".*?\((.*)\)$", content).group(1)
				all_body_info = json.loads(content)["body"]
				# Splice all text pieces together in order.
				for body_info in all_body_info:
					try:
						result += body_info["c"].strip()
					except (KeyError, AttributeError):
						# Fragment entry without text content -- skip it.
						pass
			except Exception as e:
				# Best-effort: a failed fragment should not abort the rest.
				print(e)
		# Create the per-type output directory (ok if it already exists).
		path = "." + os.sep + self.docType
		os.makedirs(path, exist_ok=True)
		# Save the text; utf-8 so Chinese content survives on Windows.
		try:
			file_name = path + os.sep + self.title + ".txt"
			with open(file_name, 'w', encoding='utf-8') as f:
				f.write(result)
				print("Saved as :", self.title + '.txt')
		except OSError as e:
			print(e)


# Create a class to get PPT
class BDWKPPT(BaiduWK):
	"""Downloader for PPT documents: saves every slide as a .jpg under
	./ppt/<title>/ in the original slide order."""

	def __init__(self, url):
		# [zoom_url, page_number] pairs, filled by get_ppt_json_info().
		# Must exist before super().__init__ runs the page fetch.
		self.all_img_url = list()
		super().__init__(url)

	def get_ppt_json_info(self):
		"""Resolve the slide-image URLs from the getbcsurl JSON API and
		download each slide image."""
		ppt_source_html = self.get_response_content(self.url)
		# Wenku pages are GBK-encoded.
		content = ppt_source_html.decode('gbk')
		# Debug dump of the fetched page (kept from the original script).
		with open("test.html", "w", encoding='utf-8') as f:
			f.write(content)
		# Extract the 24-character document id.
		self.docId = re.findall(r"docId.*?(\w{24})", content)[0]
		source_json_url = 'https://wenku.baidu.com/browse/getbcsurl?doc_id=%s&type=ppt&callback=zhaozhao' % self.docId
		str_source_json = self.get_response_content(source_json_url).decode()
		# Strip the JSONP wrapper: zhaozhao(...) -> ...
		pure_str_source_json = re.match(r".*?\((.*?)\)", str_source_json).group(1)
		source_json = json.loads(pure_str_source_json)
		# Collect [image-url, page-number] for every slide.
		for entry in source_json['list']:
			self.all_img_url.append([entry["zoom"], entry["page"]])
		# Output directory for this presentation (ok if it exists).
		os.makedirs("./ppt/%s" % self.title, exist_ok=True)
		for img_url, page in self.all_img_url:
			print("Obtaining page %d resources (remaining page %d)" % (page, len(self.all_img_url) - page))
			data = self.get_response_content(img_url)
			path = "./ppt/%s/%s" % (self.title, str(page) + '.jpg')
			with open(path, 'wb') as f:
				f.write(data)
		print("Write complete")



# Run the main program
def main():
	"""Entry point: ask for a Wenku URL, detect the document type and
	dispatch to the matching downloader class."""
	try:
		url = input("Please enter the url of the resource :")
		docType = BaiduWK(url).docType
	except Exception:
		# Fetching/parsing failed -- most likely a bad URL.  The
		# original called the nonexistent os.exit(); just bail out.
		print("The URL you entered is invalid, please re-enter!")
		return
	print("Type as", "-->", docType)

	if docType == "ppt":
		ppt = BDWKPPT(url)
		print("The name of the powerpoint presentation you are about to get is :", ppt.title)
		ppt.get_ppt_json_info()

	elif docType == "doc":
		word = BDWKDOC(url)
		print("The document you are about to obtain (Word) is named", word.title)
		word.get_json_content(word.get_pure_addr_list())

	elif docType == "pdf":
		# PDFs go through the same text-fragment pipeline as Word docs.
		pdf = BDWKDOC(url)
		print("The PDF name you will be getting is :", pdf.title)
		pdf.get_json_content(pdf.get_pure_addr_list())

	elif docType == "txt":
		txt = BDWKTXT(url)
		print("The text document (TXT) you are about to download is named :", txt.title)
		txt.get_txt(url)

	else:
		other = BDWKPPT(url)
		print("Downloading type %s is not currently supported" % (other.docType))


if __name__ == '__main__':
	main()

Copy the code

I have compiled the script into an .exe file; Windows users can download it from the resource link below. Please credit the original article: http://www.jianshu.com/p/4f28e1ae08b1