This article is participating in Python Theme Month. See the link to the event for more details
Preface
Recently I have been working on a project that processes novel files and lets users read novels online. Storing roughly 10 GB of novel files in MySQL is impractical, and I also want to use Redis as a cache, so it is more convenient to convert the novel files to JSON before working with the data. The novels obtained from the resource website are in TXT format, so Python is used to do the conversion. My approach is described below.
The original file
- File name format
- Format of novel content
- Encoding format: the novel TXT files are encoded in UTF-8
Note: if an original TXT file is not encoded in UTF-8, it must be converted first. Conversion tool:
download
The target file
- Novel summary file: books.json
- Contents of each novel file
- The file name format is bookid.json
- The file format
Python code
import os,sys
import json,re
class fileScanner(object):
    """Recursively collect the ``.txt`` files found under a directory.

    Attributes:
        dir: Root directory handed to ``os.walk``.
        files: One ``{'fileName': ..., 'filePath': ...}`` dict per TXT file
            found so far, in scan order.
        size: Number of entries appended to ``files``.
    """

    def __init__(self, dir):
        # ``dir`` shadows the builtin, but it is part of the public signature
        # and is therefore left unchanged.
        self.dir = dir
        self.files = []
        self.size = 0

    def scanFile(self):
        """Walk ``self.dir`` and record every file whose name ends in ``.txt``.

        Each hit is appended to ``self.files``, echoed to stdout and counted
        in ``self.size``.  Returns ``None``.
        """
        filetype = '.txt'
        for parent, dirnames, filenames in os.walk(self.dir):
            # Positions in ``self.files`` are later used as book IDs, so the
            # traversal order is made reproducible: sorting ``dirnames`` in
            # place fixes the order in which os.walk descends.
            dirnames.sort()
            for filename in sorted(filenames):
                # Suffix test rather than a substring search, so names such
                # as "x.txt.bak" or files inside a "foo.txt/" directory are
                # not mistaken for TXT files.
                if not filename.endswith(filetype):
                    continue
                file = {
                    'fileName': filename,
                    'filePath': os.path.join(parent, filename),
                }
                self.files.append(file)
                print("扫描到: "+file['filePath'])
                self.size += 1
class bookTojson(object) :
def __init__(self,files) :
self.files = files
self.books = []
def readBookDetail(self,n) :
detail = {
'bookID':n,
'bookName':'no'.'autor':'no'.'bookType':'no'
}
fileName = self.files[n]['fileName']
fileName = fileName.replace('.txt'.' ') # File name cleanup
fileName= re.sub(R '[(.*)]'."", fileName) # File name cleanup
detail['bookName'] = fileName # Initial assignment novel name, prevent null
if re.search(R '" (.*) "',fileName):
# print (' title: '+ re search (r' "(. *)" 'and fileName). The group (). The strip (' ""'))
detail['bookName'] = re.search(R '" (.*) "',fileName).group().strip('" "')
fileName= re.sub(R '" (.*) "'."", fileName)
if re.search(r'\[(.*)\]',fileName):
# print (' category: '+ re search (r' \ [(. *) \], the fileName). The group (). The strip (' [] '))
detail['bookType'] = re.search(r'\[(.*)\]',fileName).group().strip('[]')
if len(fileName.split('Author:')) >1:
Print (' author :'+ filenames. Split (' author :')[1]
detail['autor'] = fileName.split('Author:') [1]
self.books.append(detail)
Output the novel content as JSON
def writeJson(self,book,bookID) :
jsonArr = json.dumps(book, ensure_ascii=False, sort_keys=False, indent=4, separators=(', '.':'))
filePath ='D:\\BaiduNetdiskDownload\\books\\bookJson\\'+str(bookID)+'.json'
with open(filePath, 'w+', encoding='utf-8') as f:
f.writelines(jsonArr)
#
Handle the output chapters and content of the novel
def readBookContent(self,n) :
book = {
'bookID':n,
'bookName':self.books[n]['bookName'].'autor':self.books[n]['autor'].'bookType':self.books[n]['bookType'].'chapters': [].## item {}
'contents': {}# #
}
chapterName = ' '
content = ' '
fopen=open(self.files[n]['filePath'],encoding = 'utf-8',mode = 'r')
lines=[]
lines=fopen.readlines()
index = 1 # section number
flag = 0 # Flag that reads chapter content
for line in lines:
if len(line)>0:
if re.match(r'^(\\s*)',line)==None:
if ord(line[0])! =12288:
# Read the chapter title
#print(line)
chapterName+=(line+' ')
flag = 0
else:
# Read chapter content
if flag == 0:
flag =1
# Can write chapter content
else:
flag+=1
content+=line
else:
# Read chapter content
if flag == 0:
flag =1
# Can write chapter content
else:
flag+=1
content+=line
if flag == 1 :
# Write the contents of the previous chapter
if index > 1:
book['contents'] [str(n)+str(index-1)] = content
content = line # New chapter begins
ifchapterName ! =None:
chapter = {
'chapterID':' '.'chapterName':'no',}# Notice the assignment problem in the dictionary for
chapter['chapterID'] = str(n)+str(index)
chapter['chapterName'] = chapterName
chapterName = ' '
book['chapters'].append(chapter)
# End of chapter title reading
index += 1
# End chapter
book['contents'] [str(n)+str(index-1)] = content
self.writeJson(book,n)
#print(json.dumps(book, ensure_ascii=False, sort_keys=False, indent=4, separators=(',', ': ')))
def runFun(self,n) :
try:
self.readBookDetail(n)
self.readBookContent(n)
except Exception:
print('file'+str(n)+'wrong')
if __name__ == "__main__":
    # Fill in the novels folder name.  The backslashes are doubled so that
    # sequences such as \n or \b in the Windows path are not read as escapes.
    fs = fileScanner("D:\\BaiduNetdiskDownload\\books\\books")
    print('Get the name of the novel:')
    fs.scanFile()
    bTj = bookTojson(fs.files)
    print('Start working on a novel:')
    total = len(fs.files)
    for i in range(total):
        bTj.runFun(i)
        # i + 1 books are finished at this point, so the last pass shows 100%.
        sys.stdout.write("Processed :%.3f%%" % ((i + 1) / total * 100) + '\r')
        sys.stdout.flush()
    print('Novel processing completed')
    # Summary of every book's metadata, written next to the script.
    data = bTj.books
    jsonArr = json.dumps(data, ensure_ascii=False, sort_keys=False, indent=4, separators=(',', ': '))
    with open('books.json', 'w', encoding='utf-8') as f:
        f.write(jsonArr)
    # Reported only once the file has really been written.
    print('The file has been output in JSON format')
    print('Number of novels %d' % fs.size)
Copy the code
The results
Afterword
In the end, converting the 2,000 novels (about 10 GB) to JSON took roughly an hour (ー_ー). If you spot problems with the algorithm, you are welcome to point them out in the comments below. Note: if you need the resources, you can also leave a comment below.