This article is participating in Python Theme Month. See the link to the event for more details

Preface

Recently I have been working on a project that lets users read novels online, which involves processing a large number of novel files. Storing roughly 10 GB of novel text in MySQL is impractical, and since Redis will be used as a cache, it is more convenient to convert the novel files into JSON for data operations. The novel files obtained from the resource website are in TXT format, so Python is used to do the conversion. My approach is described below.

The original file

  • File name format

  • Format of novel content

  • Encoding format: the novel TXT files are encoded in UTF-8. Note: if an original TXT file is not UTF-8, it must first be converted with a conversion tool: download

The target file

  • Novel summary file books.json

  • Contents of each novel file
    • The file name format is bookid.json
    • The file format

Python code


import os,sys
import json,re

class fileScanner(object):
    """Recursively collect the ``.txt`` files found under a directory.

    Attributes:
        dir: Root directory handed to ``os.walk``.
        files: One ``{'fileName': ..., 'filePath': ...}`` dict per file found,
            in the order they were discovered.
        size: Number of entries appended to ``files``.
    """

    def __init__(self, dir):
        self.dir = dir
        self.files = []
        self.size = 0

    def scanFile(self):
        """Walk ``self.dir`` and record every file whose name ends in ``.txt``.

        Each hit is appended to ``self.files``, echoed to stdout and counted
        in ``self.size``.  Calling this twice appends the hits twice.
        """
        filetype = '.txt'

        for parent, dirnames, filenames in os.walk(self.dir):
            # Sorting in place fixes the traversal order, so the position of
            # a file in self.files does not change between runs.
            dirnames.sort()
            for filename in sorted(filenames):
                # Check the extension of the file name itself: a substring
                # search over the whole path would also accept "x.txt.bak"
                # or any file stored below a directory named "foo.txt".
                if not filename.endswith(filetype):
                    continue
                file = {
                    'fileName': filename,
                    'filePath': os.path.join(parent, filename),
                }
                self.files.append(file)
                print("扫描到: " + file['filePath'])
                self.size += 1


class bookTojson(object):
    """Convert scanned novel ``.txt`` files into one JSON file per book.

    Attributes:
        files: List of ``{'fileName': ..., 'filePath': ...}`` dicts; the list
            index of a file is used as its book ID.
        books: Metadata dicts appended by ``readBookDetail``.
        outDir: Directory that receives the ``<bookID>.json`` files.
    """

    DEFAULT_OUT_DIR = 'D:\\BaiduNetdiskDownload\\books\\bookJson\\'

    # Markers recognised in a file name.
    # NOTE(review): the available text of this script shows ASCII delimiters
    # (`"title"`, `Author:`) and a bare character class for the clean-up
    # step; these look like artifacts of a Chinese-to-English conversion.
    # Both the ASCII and the presumed CJK forms are accepted here - confirm
    # against the real file names before relying on them.
    _JUNK_RE = re.compile(r'【.*?】')
    _TITLE_RE = re.compile(r'[《"](.*)[》"]')
    _TYPE_RE = re.compile(r'\[(.*)\]')
    _AUTHOR_RE = re.compile(r'(?:作者|[Aa]uthor)\s*[:：]')

    def __init__(self, files, outDir=None):
        self.files = files
        self.books = []
        self.outDir = self.DEFAULT_OUT_DIR if outDir is None else outDir

    def readBookDetail(self, n):
        """Derive title, category and author of book ``n`` from its file name.

        The result is appended to ``self.books`` and also returned.  Fields
        that cannot be found keep the placeholder ``'no'``; the title falls
        back to the cleaned-up file name so that it is never empty.
        """
        detail = {
            'bookID': n,
            'bookName': 'no',
            'autor': 'no',
            'bookType': 'no',
        }
        fileName = self.files[n]['fileName']
        fileName = fileName.replace('.txt', '')      # file name cleanup
        fileName = self._JUNK_RE.sub('', fileName)   # file name cleanup

        # Initial assignment so the name is never left empty.
        detail['bookName'] = fileName.strip()

        title = self._TITLE_RE.search(fileName)
        if title:
            if title.group(1).strip():
                detail['bookName'] = title.group(1).strip()
            # Drop the title so its text cannot be mistaken for other fields.
            fileName = self._TITLE_RE.sub('', fileName)

        bookType = self._TYPE_RE.search(fileName)
        if bookType:
            detail['bookType'] = bookType.group(1).strip()

        parts = self._AUTHOR_RE.split(fileName)
        if len(parts) > 1:
            detail['autor'] = parts[1].strip()

        self.books.append(detail)
        return detail

    def writeJson(self, book, bookID):
        """Write ``book`` as UTF-8 JSON to ``<outDir>/<bookID>.json``."""
        jsonArr = json.dumps(book, ensure_ascii=False, sort_keys=False,
                             indent=4, separators=(',', ':'))
        if self.outDir:
            os.makedirs(self.outDir, exist_ok=True)
        filePath = os.path.join(self.outDir, str(bookID) + '.json')
        with open(filePath, 'w', encoding='utf-8') as f:
            f.write(jsonArr)

    @staticmethod
    def _parseChapters(lines, n):
        """Split the lines of book ``n`` into chapter headers and bodies.

        A non-blank line that starts with whitespace (including the
        ideographic space U+3000) is chapter content; any other non-blank
        line is part of a chapter title.  A chapter is opened by the first
        content line that follows a title.  Chapter IDs are ``str(n)``
        followed by the 1-based chapter number.

        Returns ``(chapters, contents)``: a list of
        ``{'chapterID', 'chapterName'}`` dicts and a dict mapping each
        chapter ID to its text.
        """
        chapters = []
        contents = {}
        titleLines = []   # title lines seen since the last chapter was opened
        bodyLines = []    # content lines of the chapter currently open
        inBody = False    # True while consecutive content lines are read

        for line in lines:
            if not line.strip():
                # Blank lines never open a chapter and never count as a
                # title; inside a chapter they are kept as paragraph spacing.
                if inBody:
                    bodyLines.append(line)
                continue

            if not line[0].isspace():
                titleLines.append(line.strip())
                inBody = False
                continue

            if not inBody:
                # First content line after a title: store the previous
                # chapter, then open a new one.
                if chapters:
                    contents[chapters[-1]['chapterID']] = ''.join(bodyLines)
                chapters.append({
                    'chapterID': str(n) + str(len(chapters) + 1),
                    'chapterName': ' '.join(titleLines),
                })
                titleLines = []
                bodyLines = []
                inBody = True
            # The opening line is content too, so it is kept for every
            # chapter, the first one included.
            bodyLines.append(line)

        if chapters:
            contents[chapters[-1]['chapterID']] = ''.join(bodyLines)
        return chapters, contents

    def readBookContent(self, n):
        """Read book ``n``, split it into chapters and write its JSON file.

        Requires ``readBookDetail(n)`` to have filled ``self.books[n]``.
        The written (and returned) dict holds the book metadata, the chapter
        list under ``'chapters'`` and the chapter texts under ``'contents'``.
        """
        detail = self.books[n]
        # 'utf-8-sig' reads plain UTF-8 as well and drops a leading BOM, which
        # would otherwise end up inside the first chapter title.
        with open(self.files[n]['filePath'], mode='r',
                  encoding='utf-8-sig') as fopen:
            chapters, contents = self._parseChapters(fopen, n)

        book = {
            'bookID': n,
            'bookName': detail['bookName'],
            'autor': detail['autor'],
            'bookType': detail['bookType'],
            'chapters': chapters,
            'contents': contents,
        }
        self.writeJson(book, n)
        return book

    def runFun(self, n):
        """Convert book ``n``; report a failure instead of raising.

        Returns True on success and False when the conversion failed, so one
        bad file does not abort a batch run.
        """
        try:
            self.readBookDetail(n)
            self.readBookContent(n)
        except Exception as err:
            # Best-effort batch step: keep going, but say why it failed.
            print('file' + str(n) + 'wrong', repr(err))
            return False
        return True
            



def main(bookDir="D:\\BaiduNetdiskDownload\\books\\books",
         summaryPath='books.json'):
    """Convert every novel below ``bookDir`` and write the summary file.

    Args:
        bookDir: Folder that holds the novel ``.txt`` files.  In a normal
            string literal, write Windows paths with doubled backslashes so
            that sequences such as ``\\n`` are not read as escape characters.
        summaryPath: Destination of the JSON list with one metadata entry
            per processed novel.
    """
    fs = fileScanner(bookDir)
    print('Get the name of the novel:')
    fs.scanFile()

    bTj = bookTojson(fs.files)
    print('Start working on a novel:')
    total = len(fs.files)
    for i in range(total):
        bTj.runFun(i)
        # i + 1 books are finished at this point, so the last step shows 100%.
        sys.stdout.write("Processed :%.3f%%" % ((i + 1) / total * 100) + '\r')
        sys.stdout.flush()
    print('Novel processing completed')

    jsonArr = json.dumps(bTj.books, ensure_ascii=False, sort_keys=False,
                         indent=4, separators=(',', ':'))
    with open(summaryPath, 'w', encoding='utf-8') as f:
        f.write(jsonArr)
    print('The file has been output in JSON format')

    print('Number of novels: %d' % fs.size)


if __name__ == "__main__":
    main()
Copy the code

The results

Afterword

With this script, about 2,000 novels (roughly 10 GB) were all converted to JSON in about an hour (ー_ー). If you spot a problem with the algorithm, you are welcome to point it out in the comments below. Note: if you need the resources, you can also leave a message below.