# Build a bag-of-words vectorizer. min_df=1 keeps every term that appears
# in at least one document (i.e. nothing is filtered out).
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
print(vectorizer)
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
tokenizer=None, vocabulary=None)
# The toy corpus: five short posts.
posts = ["How to format my disk", "hard disk format", "I like you!", "Like is What", "you like who?"]
# Bag-of-words transform: learn the vocabulary and count term occurrences.
x = vectorizer.fit_transform(posts)
# x is a sparse (n_posts, n_terms) matrix of term frequencies.
# Fixed: py2 `print x` statement -> print() call, consistent with the
# print(...) calls used elsewhere in this file and valid on Python 3.
print(x)
(0, 0) 1
(0, 6) 1
(0, 1) 1
(0, 7) 1
(0, 3) 1
(1, 2) 1
(1, 0) 1
(1, 1) 1
(2, 10) 1
(2, 5) 1
(3, 8) 1
(3, 4) 1
(3, 5) 1
(4, 9) 1
(4, 10) 1
(4, 5) 1
# Vocabulary learned from the corpus, in column order of x.
# Fixed: py2 `print ...` statement -> print() call (Python 3 compatible).
# NOTE(review): get_feature_names() was removed in scikit-learn >= 1.2;
# on recent versions use get_feature_names_out() instead.
print(vectorizer.get_feature_names())
[u'disk', u'format', u'hard', u'how', u'is', u'like', u'my', u'to', u'what', u'who', u'you']
# Dense view of the counts: one row per post, one column per vocabulary term.
print(x.toarray())
[[1 1 0 1 0 0 1 1 0 0 0]
[1 1 1 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 1]
[0 0 0 0 1 1 0 0 1 0 0]
[0 0 0 0 0 1 0 0 0 1 1]]
# Report corpus dimensions: number of documents and vocabulary size.
num_samples, num_features = x.shape
print("#samples: %d ,#features: %d" % (num_samples, num_features))
#samples: 5 ,#features: 11
# Vectorize a previously unseen post against the fitted vocabulary.
newpost = "how to format my computer's disk"
new_post_vec = vectorizer.transform([newpost])
# Print the new post's count vector; words not in the learned
# vocabulary (e.g. "computer's") are simply dropped.
print(new_post_vec.toarray())
[[1 1 0 1 0 0 1 1 0 0 0]]
# Confirms the fixed vocabulary: the new post maps onto the same
# feature space; its unseen words are not learned.
print(new_post_vec.toarray().shape)
(1L, 11L)
##################################################################################################
# Post-similarity measure: Euclidean distance between raw term-count vectors.
# Fixed: `sp.array` / `sp.sqrt` are numpy aliases that were removed from
# modern SciPy, and `sp.linalg` is not guaranteed to be loaded by a bare
# `import scipy` — use numpy directly. Also py2 `print sp.sqrt(5)` -> print().
import numpy as np
import scipy as sp


def dist_raw(v1, v2):
    """Return the Euclidean (L2) distance between vectors v1 and v2."""
    delta = v1 - v2
    return np.linalg.norm(delta)


# Sanity check: distance from the origin to (1, 2) must be sqrt(5).
test_v1 = np.array([0, 0])
test_v2 = np.array([1, 2])
print(dist_raw(test_v1, test_v2))
# Same value computed directly: sqrt(1^2 + 2^2).
print(np.sqrt(5))
2.2360679775
2.2360679775
# Find the existing post closest to newpost under dist_raw, skipping any
# post identical to it.
# Fixed: sys.maxint does not exist on Python 3 -> sys.maxsize; py2 print
# statements -> print() calls; removed the unused `best_doc` local.
import sys

best_dist = sys.maxsize  # sentinel larger than any real distance
best_i = None
for i in range(num_samples):
    post = posts[i]
    if post == newpost:
        continue  # never compare a post with itself
    post_vec = x.getrow(i)
    print("post_vec's shape:%s, new_post_vec's shape:%s" % (post_vec.shape, new_post_vec.shape))
    dist = dist_raw(post_vec.toarray(), new_post_vec.toarray())
    print("=== Post %i with dist=%.2f: %s" % (i, dist, post))
    if dist < best_dist:
        best_dist = dist
        best_i = i
print("newpost :%s" % newpost)
print("Best post is %i with dist=%.2f. Post Content:%s" % (best_i, best_dist, posts[best_i]))
post_vec's shape:(1, 11), new_post_vec's shape:(1, 11)
=== Post 0 with dist=0.00: How to format my disk
post_vec's shape:(1, 11), new_post_vec's shape:(1, 11)
=== Post 1 with dist=2.00: hard disk format
post_vec's shape:(1, 11), new_post_vec's shape:(1, 11)
=== Post 2 with dist=2.65: I like you!
post_vec's shape:(1, 11), new_post_vec's shape:(1, 11)
=== Post 3 with dist=2.83: Like is What
post_vec's shape:(1, 11), new_post_vec's shape:(1, 11)
=== Post 4 with dist=2.83: you like who?
newpost :how to format my computer's disk
Best post is 0 with dist=0.00. Post Content:How to format my disk
#本Notebook,通过最简单情况下进行贴子相似性分析
#下一Notebook,说明通过目录读取;对于词频向量的归一等内容
jupyter_文本特征抽取_1最基础例子
最后编辑于 :
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 这8种学生永远拿不到高分!早看早受益! 下面是一位资深班主任总结了8种成绩提不上去的原因,分别对应8类孩子,如果你...