This article is participating in Python Theme Month. See the link for details

introduce

Following the simple homophone correction above, this paper introduces a slightly complex Chinese error correction, and the improved content is as follows:

  • Use a fault-tolerant prefix tree to store all participles
  • Use a fault-tolerant prefix tree to store all pinyin
  • Use bi-gram to calculate the probability value of the word
  • The evaluation score of Chinese word segmentation not only has bi-gram value, but also has pinyin editing distance. The evaluation score of Pinyin is only the editing distance of Pinyin
  • The output results are ranked in descending order, showing a maximum of five results

token.txt

token.txt stores the word-segmentation data or the pinyin of the segmented words; you can swap in your own file. Each line has the format:

(word, word frequency) or (word pinyin, pinyin frequency)

As follows:

Zhejiang Province,1
Hangzhou,5
Xihu District,3
zhejiangsheng,1
hangzhoushi,5
xihuqu,3

main.py

Basically start the program, build the prefix tree, and then test it

import csv, datetime, os from DictionaryTrie import Trie from corrector import Corrector from util import load_variavle, save_variable def buildTrieFromFile(): trie = Trie() start = datetime.datetime.now() savefile = 'save_tree' if os.path.exists(savefile): return load_variavle(savefile) recommendFile = open('token.txt', 'r') try: recommendReader = csv.reader(recommendFile, delimiter=',') for row in recommendReader: trie.insert(row[0], int(row[1])) save_variable(trie, savefile) finally: recommendFile.close() end = datetime.datetime.now() print("Time taken to build Dictionary Tree: " + str(end - start)) return trie def suggestorIO(trie, s, d): suggestions = trie.findAll(s, d) return [str(x.word) for x in suggestions] if __name__ == "__main__": Suggete () trie = buildTrieFromFile() c = Corrector() r = suggestorIO(trie, 'trie ', SuggestorIO (trie, 'zhejiangshen', 1) print(suggestorIO(trie, 'zhejiangshen', r)) print("Time taken to get token_freq and words : " + str(datetime.datetime.now() - start))Copy the code

corrector.py

It is mainly the realization of Bi-Gram probability matrix, mapping dictionary and prediction scoring method

import datetime
import math
import os

import pinyin
import Levenshtein

from util import load_variavle, save_variable

# Sentinel characters marking the start and end of a word for the bi-gram
# counts, so that "which character starts/ends a word" is also modelled.
START = 'S'
END = 'E'


class Corrector():
    """Bi-gram scorer over token.txt plus pinyin edit-distance re-ranking."""

    def __init__(self):
        # wordcount: per-character frequency; words: START/END-wrapped tokens.
        self.wordcount, self.words = self.init(self.getTexts())
        # c2n / n2c: character <-> matrix-index mappings.
        self.c2n, self.n2c = self.createDict(self.wordcount)
        # matrix[i][j]: P(char j follows char i), normalised by count of i.
        self.matrix = self.calMatrix()

    def getTexts(self):
        """Read the first comma-separated field (the word) of every token.txt line."""
        result = []
        with open('token.txt', 'r') as f:
            for line in f:
                result.append(line.strip().split(',')[0])
        return result

    def init(self, texts):
        """Count character frequencies and collect wrapped words, with pickle caching."""
        start = datetime.datetime.now()
        savefile1 = 'save_wordcount'
        savefile2 = 'save_words'
        if os.path.exists(savefile1) and os.path.exists(savefile2):
            return load_variavle(savefile1), load_variavle(savefile2)
        wordcount = {}
        words = []
        for word in texts:
            # Only words longer than 2 characters contribute to the model.
            if word and len(word) > 2:
                word = START + word + END
                words.append(word)
                for c in word:
                    if c not in wordcount:
                        wordcount[c] = 1
                    else:
                        wordcount[c] += 1
        save_variable(wordcount, savefile1)
        save_variable(words, savefile2)
        print("Time taken to get token_freq and words : " + str(datetime.datetime.now() - start))
        return wordcount, words

    def createDict(self, CHAR_FREQ):
        """Build char->index and index->char dicts, with pickle caching."""
        start = datetime.datetime.now()
        savefile1 = 'save_c2n'
        savefile2 = 'save_n2c'
        if os.path.exists(savefile1) and os.path.exists(savefile2):
            return load_variavle(savefile1), load_variavle(savefile2)
        c2n = {}
        n2c = {}
        count = 0
        for c in CHAR_FREQ:
            c2n[c] = count
            n2c[count] = c
            count += 1
        save_variable(c2n, savefile1)
        save_variable(n2c, savefile2)
        print("Time taken to get token_freq and words : " + str(datetime.datetime.now() - start))
        return c2n, n2c

    def calMatrix(self):
        """Compute the bi-gram probability matrix, with pickle caching."""
        start = datetime.datetime.now()
        savefile = 'save_matrix'
        if os.path.exists(savefile):
            return load_variavle(savefile)
        size = len(self.wordcount)
        matrix = [[0] * size for _ in range(size)]
        # Count bi-gram occurrences over all wrapped words.
        for key in self.words:
            for i, c in enumerate(key):
                if i == 0:
                    continue
                matrix[self.c2n[key[i - 1]]][self.c2n[c]] += 1
        # Normalise row i by the frequency of character i -> conditional probability.
        for i, line in enumerate(matrix):
            for j, freq in enumerate(line):
                matrix[i][j] = round(matrix[i][j] / self.wordcount[self.n2c[i]], 10)
        save_variable(matrix, savefile)
        print("Time taken to get token_freq and words : " + str(datetime.datetime.now() - start))
        return matrix

    def predict(self, s, strings):
        """Score each candidate in strings against the query s.

        Score = 0.1 * sum(log(bigram_prob + 1)) + 0.9 * pinyin similarity,
        where similarity is len(base) minus the Levenshtein distance between
        the pinyin of s and of the candidate. Returns the top five
        (candidate, score) pairs, best first.
        """
        base = pinyin.get(s, format='strip', delimiter=' ')
        result = {}
        for cand in strings:  # renamed from 's' to avoid shadowing the query
            r = 0
            wrapped = START + cand + END
            for i, c in enumerate(wrapped):
                if i == 0:
                    continue
                if c in self.c2n:
                    r += math.log(self.matrix[self.c2n[wrapped[i - 1]]][self.c2n[c]] + 1)
                else:
                    # Unknown character: the bi-gram score contributes nothing.
                    r = 0
                    break
            # BUG FIX: lstrip/rstrip strip *all* leading 'S'/trailing 'E'
            # characters, corrupting words that start with S or end with E;
            # slicing removes exactly the sentinels we added.
            t = wrapped[1:-1]
            cmp = pinyin.get(t, format='strip', delimiter=' ')
            if t not in result:
                result[t] = r * 0.1 + (len(base) - Levenshtein.distance(base, cmp)) * 0.9
        return sorted(result.items(), key=lambda x: x[1], reverse=True)[:5]

DictionaryTrie.py

This is mainly the implementation of a fault tolerant prefix tree

class Trie:
    """Fault-tolerant prefix tree: lookups tolerate up to maxDistance edits."""

    def __init__(self):
        self.root = LetterNode('')
        self.START = 3  # sentinel "no previous edit" action for the first step

    def insert(self, word, freq):
        """Insert word (lower-cased per character) with its frequency."""
        self.root.insert(word, freq, 0)

    def findAll(self, query, maxDistance):
        """Return deduplicated Word matches, sorted by ascending frequency."""
        suggestions = self.root.recommend(query, maxDistance, self.START)
        return sorted(set(suggestions), key=lambda x: x.freq)


class LetterNode:
    """One character of the trie; leaves (and interior word-ends) carry a Word."""

    def __init__(self, char):
        # Edit-action tags; used to forbid some edit sequences (e.g. an ADD
        # directly after a CHANGE) so the same correction isn't counted twice.
        self.REMOVE = -1
        self.ADD = 1
        self.SAME = 0
        self.CHANGE = 2
        self.START = 3
        self.pointers = []  # child LetterNodes
        self.char = char
        self.word = None  # set to a Word when a word ends at this node

    def charIs(self, c):
        return self.char == c

    def insert(self, word, freq, depth):
        """Walk/extend the path for word; record Word at the final node."""
        if depth < len(word):
            c = word[depth].lower()
            for next in self.pointers:
                if next.charIs(c):
                    return next.insert(word, freq, depth + 1)
            nextNode = LetterNode(c)
            self.pointers.append(nextNode)
            return nextNode.insert(word, freq, depth + 1)
        else:
            self.word = Word(word, freq)

    def recommend(self, query, movesLeft, lastAction):
        """Collect Words reachable from here using at most movesLeft edits."""
        suggestions = []
        length = len(query)
        # A word ending here matches if the leftover query fits in the budget.
        if length >= 0 and movesLeft - length >= 0 and self.word:
            suggestions.append(self.word)
        if movesLeft == 0 and length > 0:
            # No budget left: only an exact character match may continue.
            for next in self.pointers:
                if next.charIs(query[0]):
                    suggestions += next.recommend(query[1:], movesLeft, self.SAME)
                    break
        elif movesLeft > 0:
            for next in self.pointers:
                if length > 0:
                    if next.charIs(query[0]):
                        # Exact match: consume the character for free.
                        suggestions += next.recommend(query[1:], movesLeft, self.SAME)
                    else:
                        # Substitution: consume the character, spend one edit.
                        suggestions += next.recommend(query[1:], movesLeft - 1, self.CHANGE)
                    # NOTE(review): the flattened listing makes the exact nesting
                    # of the ADD/REMOVE branches ambiguous; they are placed at the
                    # match level here, guarded by lastAction to avoid duplicates.
                    if lastAction != self.CHANGE and lastAction != self.REMOVE:
                        # Insertion: take this child without consuming the query.
                        suggestions += next.recommend(query, movesLeft - 1, self.ADD)
                    if lastAction != self.ADD and lastAction != self.CHANGE:
                        # Deletion: skip one (or, with budget 2, two) query chars.
                        if length > 1 and next.charIs(query[1]):
                            suggestions += next.recommend(query[2:], movesLeft - 1, self.REMOVE)
                        elif length > 2 and next.charIs(query[2]) and movesLeft == 2:
                            suggestions += next.recommend(query[3:], movesLeft - 2, self.REMOVE)
                else:
                    # Query exhausted: only insertions can extend to longer words.
                    if lastAction != self.CHANGE and lastAction != self.REMOVE:
                        suggestions += next.recommend(query, movesLeft - 1, self.ADD)
        return suggestions


class Word:
    """A stored word with its frequency (identity-hashed, so set() dedups nodes)."""

    def __init__(self, word, freq):
        self.word = word
        self.freq = freq

util.py

Methods to save and load variables

Def save_variable(v, filename) def save_variable(v, filename) F = open(filename, 'wb') pickle.dump(v, f) f.lose () return filename # load_variavle(filename): f = open(filename, 'rb') r = pickle.load(f) f.close() return rCopy the code

test

Running main.py prints the results and generates cache files such as save_c2n, save_n2c, save_matrix, save_tree, save_wordcount, and save_words that persist the intermediate variables. Subsequent runs therefore spend less time constructing the tree and computing the probability matrix. The printed output is as follows:

Time taken to get token_freq and words: 0:00:00.046511 Time taken to get token_freq and words: 0:00:00.000983 Time taken to get token_freq and words: 0:00:01.869512 [('zhejiang ', 13.59619683451056)] [(' Zhejiangsheng ', 19.375014751170497)] With words: 0:00:04.142090Copy the code

If you use your own dictionary, it will print something different, but similar.

Outlook

Although the implementation of this time is slightly more complex than the last time, there is still room for improvement, and it will continue to be optimized in the future