从网上下载了一本《穷爸爸富爸爸》电子书,想分析一下哪些词汇出现的次数较多,于是做了以下尝试:
from collections import Counter

import jieba
import pandas as pd
# Read the whole e-book into memory. The context manager closes the handle,
# which the bare open() call previously left open for the rest of the run.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm that it matches the file's actual encoding.
with open('D:/z/穷爸爸富爸爸.txt') as file:
    raw = file.read()

# Print where the preface marker first appears and where the afterword
# marker last appears.
print(raw.find('序言'))
print(raw.rfind('后记'))

# Keep only the main body of the book. The offsets are hard-coded; presumably
# they were derived from the two positions printed above -- TODO confirm.
raw1 = raw[1971:115607].strip('\n')
lines = raw1.split('\n')

# Segment every line with jieba and collect all tokens in document order.
wd_list = []
for line in lines:
    words = jieba.cut(line)
    for w in words:
        print(w)
        wd_list.append(w)

# Occurrences of '成绩' (the author observed 15).
a = wd_list.count('成绩')
# Share of all tokens, as a percentage (the author observed 2.05%).
b = 100 * a / len(wd_list)
def word_count(word, list):
    """Return how many times `word` occurs in `list`.

    Args:
        word: The token to look for.
        list: The token sequence to search. The name shadows the builtin but
            is kept so existing keyword callers keep working.

    Returns:
        The number of entries in `list` equal to `word`.
    """
    return list.count(word)
def word_percent(word, list):
    """Return the share of `list` entries equal to `word`, as a percentage.

    Args:
        word: The token to look for.
        list: The token sequence to search. The name shadows the builtin but
            is kept so existing keyword callers keep working.

    Returns:
        The percentage rounded to 5 decimal places, or 0.0 when `list` is
        empty (the unguarded division would otherwise raise
        ZeroDivisionError).
    """
    total = len(list)
    if not total:
        return 0.0
    return round(100 * list.count(word) / total, 5)
# Spot-check a few words. The bare calls discarded their return values, so
# nothing was shown when this ran as a script; print each word with its count
# and percentage instead.
for probe in ('我们', '理财', '成绩'):
    print(probe, word_count(probe, wd_list), word_percent(probe, wd_list))
# Distinct tokens found in the text.
word_s = set(wd_list)
print(word_s)
print(len(word_s))
for word in word_s:
    print(word)

# Tally every token in a single pass. Calling list.count() once per distinct
# word rescans the whole token list each time, which is quadratic on a
# book-sized input.
word_freq = Counter(wd_list)
total_words = len(wd_list)

# Parallel columns, in the set's iteration order: the word, its number of
# occurrences, and its share of all tokens as a percentage (5 decimals).
w_ = list(word_s)
c_ = [word_freq[token] for token in w_]
p_ = [round(100 * word_freq[token] / total_words, 5) for token in w_]

data = pd.DataFrame({'word': w_, 'count': c_, 'percent': p_})