如何计算两个文档的相似度(三)
本文代码全部实现,并附上注释:
# -*- coding: utf-8 -*-
"""Recommend similar Coursera courses with an LSI similarity index.

Pipeline: read corpus -> tokenize -> lowercase -> drop English stopwords
and punctuation -> Lancaster-stem -> drop hapax legomena (stems occurring
only once in the whole corpus) -> bag-of-words -> TF-IDF -> 10-topic LSI
-> cosine-similarity index.  Finally ranks every course by similarity to
course #210 and prints the top matches.
"""
import logging
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from gensim import corpora, models, similarities

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# One line per document; the first tab-separated field is the course name.
with open('/home/liuxianga/coursera/coursera_corpus', encoding='utf-8') as f:
    courses = [line.strip() for line in f]
courses_name = [course.split('\t')[0] for course in courses]

# Tokenize and lowercase every document.
texts_tokenized = [[word.lower() for word in word_tokenize(document)]
                   for document in courses]

# Filter out English stopwords and bare punctuation tokens.
# Sets give O(1) membership tests (the original used lists: O(k) per token).
english_stopwords = set(stopwords.words('english'))
english_punctuations = {',', '.', ':', ';', '?', '(', ')', '[', ']',
                        '&', '!', '*', '@', '#', '$', '%'}
texts_filtered = [[word for word in document
                   if word not in english_stopwords
                   and word not in english_punctuations]
                  for document in texts_tokenized]

# Stem each surviving token.
st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in document]
                 for document in texts_filtered]

# Drop stems that occur exactly once in the whole corpus.  Counter makes
# this one O(n) pass; the original called list.count() per stem (O(n^2)).
stem_counts = Counter(stem for text in texts_stemmed for stem in text)
texts = [[stem for stem in text if stem_counts[stem] > 1]
         for text in texts_stemmed]

# Bag-of-words corpus -> TF-IDF weighting -> 10-topic LSI space -> index.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
index = similarities.MatrixSimilarity(lsi[corpus])

print("####################################################################################################################")

# Query: project course #210 into the LSI space and rank all courses by
# cosine similarity to it.
ml_course = texts[210]
ml_bow = dictionary.doc2bow(ml_course)
ml_lsi = lsi[ml_bow]  # weight of course 210 on each of the 10 topics
print("ml_lsi")
print(ml_lsi)

sims = index[ml_lsi]  # similarity of course 210 to every course in the index
print("sims[0:10]")
print(sims[0:10])

# Sort (index, similarity) pairs by descending similarity.
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
print("sort_sims[0:10]")
print(sort_sims[0:10])
三、课程图谱相关实验
1、数据准备
为了方便大家一起来做验证,这里准备了一份Coursera的课程数据,可以在这里下载:coursera_corpus(百度网盘链接见原文)。
更多推荐
如何计算两个文档的相似度(三)
发布评论