Summary
- Listing 1: the abbreviated version of my notes.
- Listing 2: word-frequency statistics combined with pandas, and part-of-speech extraction combined with word-frequency statistics.
Listing 1
`import FontCN_NLPtools as fts` imports a class I wrote myself; it encapsulates some methods I use frequently. Its code has been uploaded.
# -*- coding: utf-8 -*-
#
# Created: '2017/7/3'
# email: [email protected]
# CSDN: http://blog.csdn.net/fontthrone
#
# Listing 1: segment a Chinese text with the author's FontCN_NLPtools helper,
# print basic vocabulary / word-frequency statistics with NLTK, and plot the
# cumulative frequency of the ten most common words.
#
# The listing was written for Python 2.  Every print below passes exactly one
# argument, so the same source also parses and behaves the same on Python 3.
import os
import sys

# Path of the text to analyse (the same file the second listing uses).
text_path = 'txt/lztest.txt'
# Stop-word list.  os.path.join replaces the Windows-only 'stopwords\...'
# literal.  NOTE(review): file-name case is taken from the listing as shown;
# confirm it against the real file on case-sensitive file systems.
stopwords_path = os.path.join('stopwords', 'cnenstopwords.txt')

# NOTE(review): the Chinese literals were lost in the listing; they are
# restored here from the English glosses that survived ("Lu Mingfei" and
# "Ricardo").  Written as escapes so the source stays pure ASCII.
NAME_LU_MINGFEI = u'\u8def\u660e\u975e'
NAME_RICARDO = u'\u674e\u5609\u56fe'


def rank_by_frequency(freq, limit=None):
    """Return ``(word, count)`` pairs ordered from most to least frequent.

    Pairs are sorted on ``(count, word)`` in descending order, so words with
    equal counts come out in reverse lexicographic order.

    Args:
        freq: mapping of word -> count (e.g. an ``nltk.FreqDist`` or a dict).
        limit: maximum number of pairs to return; ``None`` returns them all.

    Returns:
        A list of ``(word, count)`` tuples.
    """
    ranked = sorted(freq.items(), key=lambda pair: (pair[1], pair[0]),
                    reverse=True)
    return ranked if limit is None else ranked[:limit]


def average_uses(tokens):
    """Return the mean number of occurrences per distinct token.

    Returns 0.0 for an empty token list instead of dividing by zero.
    """
    distinct = len(set(tokens))
    if not distinct:
        return 0.0
    return float(len(tokens)) / distinct


def main():
    """Run the analysis, print the statistics and show the plot."""
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf-8')
    # FontCN_NLPtools is looked up in the parent directory, so the path must
    # be extended before the imports below run.
    sys.path.append("../")

    import matplotlib as mpl
    import nltk
    import FontCN_NLPtools as fts

    # Use a font with CJK glyphs so Chinese labels are not rendered as boxes.
    # If the minus sign shows up as a square in saved figures, additionally
    # set mpl.rcParams['axes.unicode_minus'] = False.
    mpl.rcParams[u'font.sans-serif'] = [u'KaiTi']
    mpl.rcParams[u'font.serif'] = [u'KaiTi']

    # NOTE(review): the keyword ``stopwordsPath`` and method ``getText`` are
    # normalised from the garbled listing; confirm against FontCN_NLPtools.
    fontsTools = fts.FontCN_NLPtools(textPath=text_path,
                                     stopwordsPath=stopwords_path)
    fontsTools.addUserWords([NAME_LU_MINGFEI])
    tokenstr = nltk.word_tokenize(fontsTools.getText(isAddWord=True))

    # Every token of the text, then the total token count.
    print(u"Tokens: " + u" ".join(tokenstr))
    print(u"Total tokens: %d" % len(tokenstr))
    # Distinct words in sorted order.
    print(u"Sorted vocabulary: " + u" ".join(sorted(set(tokenstr))))
    print(u"Average uses per word: %s" % average_uses(tokenstr))

    # Word-frequency table, in word order.
    fdist1 = nltk.FreqDist(tokenstr)
    print(u" ".join(u"%s %s" % pair for pair in sorted(fdist1.items())))
    print(u"")
    print(u".........Lu Mingfei number of occurrences...............")
    print(fdist1[NAME_LU_MINGFEI])
    print(u"")
    print(u".........Number of occurrences of Ricardo...............")
    print(fdist1[NAME_RICARDO])
    print(u"")
    print(u".........Top 10 most frequently used words...............")
    for pair in rank_by_frequency(fdist1, 10):
        print(u"%s %s" % pair)

    print(u"...............Cumulative frequency distribution of the top 10 "
          u"high-frequency words...............")
    fdist1.plot(10, cumulative=True)


if __name__ == "__main__":
    main()
Listing 2 combines word-frequency statistics with pandas to generate a statistical bar chart, and combines part-of-speech tagging with word-frequency statistics.
pandas is a very useful tool for manipulating data, and it can conveniently use matplotlib to do the drawing.
# -*- coding: utf-8 -*-
#
# Created: '2017/7/12'
# email: [email protected]
# CSDN: http://blog.csdn.net/fontthrone
#
# Listing 2: plot the 30 most frequent words of a Chinese text as a pandas
# bar chart, then do the same for verbs only, using the part-of-speech tags
# produced by the author's FontCN_NLPtools helper.
#
# The listing was written for Python 2.  Every print below passes exactly one
# argument, so the same source also parses and behaves the same on Python 3.
import os
import sys

# Path of the text to analyse.
text_path = u'txt/lztest.txt'
# Stop-word list.  os.path.join replaces the Windows-only 'stopwords\...'
# literal.  NOTE(review): file-name case is taken from the listing as shown;
# confirm it against the real file on case-sensitive file systems.
stopwords_path = os.path.join(u'stopwords', u'cnenstopwords.txt')


def rank_by_frequency(freq, limit=None):
    """Return ``(word, count)`` pairs ordered from most to least frequent.

    Pairs are sorted on ``(count, word)`` in descending order, so words with
    equal counts come out in reverse lexicographic order.

    Args:
        freq: mapping of word -> count (e.g. an ``nltk.FreqDist`` or a dict).
        limit: maximum number of pairs to return; ``None`` returns them all.

    Returns:
        A list of ``(word, count)`` tuples.
    """
    ranked = sorted(freq.items(), key=lambda pair: (pair[1], pair[0]),
                    reverse=True)
    return ranked if limit is None else ranked[:limit]


def select_verbs(tagged_words):
    """Return the words whose part-of-speech tag starts with ``"V"``.

    Args:
        tagged_words: iterable of ``(word, tag)`` pairs as produced by
            ``nltk.tag.str2tuple``, which upper-cases the tag and yields
            ``None`` when a token carries no tag.

    Returns:
        A list of words, in input order.  Pairs with a missing or empty tag
        are skipped rather than raising.
    """
    return [word for word, tag in tagged_words if tag and tag[0] == "V"]


def plot_top_words(fdist, title, column=u'count', limit=30):
    """Print the ``limit`` most frequent words and show them as a bar chart.

    Args:
        fdist: mapping of word -> count.
        title: chart title.
        column: name of the DataFrame column holding the counts.
        limit: number of words to include.
    """
    # Imported lazily so the pure helpers above stay importable without the
    # plotting stack installed.
    import matplotlib.pyplot as plt
    import pandas as pd

    ranked = rank_by_frequency(fdist, limit)
    print(u"   ".join(u"%s %s" % pair for pair in ranked))
    if not ranked:
        # An empty frame has nothing numeric to plot.
        return
    df = pd.DataFrame([count for _, count in ranked], columns=[column])
    df.index = [word for word, _ in ranked]
    df.plot(kind='bar')
    plt.title(title)
    plt.show()


def main():
    """Run both frequency analyses and show their charts."""
    if sys.version_info[0] == 2:
        # Python 2 only: make implicit str/unicode conversions use UTF-8.
        reload(sys)  # noqa: F821 - builtin on Python 2
        sys.setdefaultencoding('utf-8')
    # FontCN_NLPtools is looked up in the parent directory, so the path must
    # be extended before the imports below run.
    sys.path.append("../")

    import matplotlib as mpl
    import nltk
    import seaborn as sns
    import FontCN_NLPtools as fts

    # Garbled-glyph fix for matplotlib and pandas plots, see
    # http://blog.csdn.net/fontthrone/article/details/75042659
    # If the minus sign shows up as a square in saved figures, additionally
    # set mpl.rcParams['axes.unicode_minus'] = False.
    mpl.rcParams['font.sans-serif'] = ['KaiTi']
    mpl.rcParams['font.serif'] = ['KaiTi']
    sns.set_style("darkgrid", {"font.sans-serif": ['KaiTi', 'Arial']})

    # NOTE(review): the keyword ``stopwordsPath`` and method ``getText`` are
    # normalised from the garbled listing; confirm against FontCN_NLPtools.
    fontsTools = fts.FontCN_NLPtools(textPath=text_path,
                                     stopwordsPath=stopwords_path)
    # Fetch the segmented text once and reuse it for both analyses.
    lztext = fontsTools.getText(isAddWord=True)
    print(lztext)

    fdist1 = nltk.FreqDist(nltk.word_tokenize(lztext))
    print(u".........Top 30 words...............")
    # NOTE(review): the original chart titles were lost in the listing;
    # these English titles are replacements.
    plot_top_words(fdist1, u'Word frequency: top 30 words')

    # The helper returns "word/tag" items separated by whitespace.
    posstr = fontsTools.jiebaCutStrpos(NewText=lztext)
    strtag = [nltk.tag.str2tuple(word) for word in posstr.split()]
    verbs = select_verbs(strtag)
    fdist_verbs = nltk.FreqDist(nltk.word_tokenize(" ".join(verbs)))
    print(u"")
    print(u".........Top 30 verbs...............")
    plot_top_words(fdist_verbs, u'Word frequency: top 30 verbs')


if __name__ == "__main__":
    main()