LDA主题模型练习1
LDA主题模型练习1
1.本文针对LDA主题模型进行学习和练习,核心摘要如下:
2.NLP中的共现对应条件概率(独立时最特殊),最大似然估计计算字符的共现例子:
3.LDA主题模型代码实例
# -*- coding: utf8 -*-
# Preprocessing stage: segment the raw Chinese corpus with jieba, strip
# stopwords, and write one space-separated document per line.
import jieba

dir1 = 'E:/ssssszzz/lda/'


def stopwordslist(filepath):
    """Load stopwords (one per line) and return them as a set.

    A set gives O(1) membership tests in seg_sentence; the original built a
    list, making each lookup O(n). Reading with encoding="UTF-8" avoids the
    platform-default gbk codec choking on Chinese text.
    """
    with open(filepath, 'r', encoding="UTF-8") as f:
        return {line.strip() for line in f}


def seg_sentence(sentence):
    """Segment one line of text with jieba and drop stopwords.

    Returns the surviving lower-cased tokens joined by single spaces
    (with a trailing space, matching the original output format).

    NOTE(review): the stopword file is re-read on every call; callers that
    process many lines may want to hoist the load — kept here to preserve
    the original interface.
    """
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist(dir1 + 'stopWords/stopwords.txt')
    outstr = ''
    for word in sentence_seged:
        # Lower-case so English tokens compare equal to lower-case stopwords.
        word = word.lower()
        # Keep the token only if it is neither a stopword nor a tab.
        if word not in stopwords and word != '\t':
            outstr += word
            outstr += " "
    return outstr


# Read the raw corpus. encoding='utf-8' is mandatory for Chinese text,
# otherwise: UnicodeDecodeError: 'gbk' codec can't decode byte 0xbe ...
# `with` guarantees the handles are closed even on error (the original
# leaked the output handle until interpreter exit).
with open(dir1 + 'input/copurs.txt', 'r', encoding="UTF-8") as inputs:
    content1 = inputs.readlines()

# Write the jieba segmentation result, one document per line.
with open(dir1 + 'input/copurs_out.txt', 'w', encoding="UTF-8") as outputs:
    for line in content1:
        line_seg = seg_sentence(line)
        outputs.write(line_seg + '\n')

import codecs  # used by the LDA modeling section below
from gensim import corpora
from gensim.models import LdaModel
from gensim import models
from gensim.corpora import Dictionary

# Tokenized documents: one token list per line of the segmented corpus.
te = []
with codecs.open(dir1 + 'input/copurs_out.txt', 'r', encoding="UTF-8") as fp:
    for line in fp:
        te.append(line.split())
print(len(te))

# Map tokens to integer ids and build the bag-of-words corpus.
dictionary = corpora.Dictionary(te)
corpus = [dictionary.doc2bow(text) for text in te]

# TF-IDF weighting of the corpus. NOTE(review): the LDA below is trained on
# the raw counts, not on corpus_tfidf — kept as in the original, but LDA
# expects integer counts, so this is arguably correct.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

######### Run the LDA model for 50 topics ###############################
# Fix: the original trained this model TWICE (a second identical
# LdaModel(..., passes=2000) ran after the results were written, discarding
# the duplicate work). Train once.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=2000)
doc_topic = [a for a in lda[corpus]]

####### write the topics to file topics_result.txt ####################
topics_r = lda.print_topics(num_topics=50, num_words=10)
with codecs.open(dir1 + 'output/topics_result.txt', 'w', encoding="UTF-8") as topic_name:
    for v in topics_r:
        topic_name.write(str(v) + '\n')

####### write, per document, its highest-probability topic ############
with codecs.open(dir1 + 'output/documents_result.txt', 'w', encoding="UTF-8") as fp2:
    for t in doc_topic:
        if not t:
            continue  # no topic assignment for this document
        # Fix: the original searched for the max probability via `m in t[i]`
        # (tuple membership), which could falsely match a topic id equal to
        # the probability value. Select the best pair directly instead.
        best_topic, best_prob = max(t, key=lambda pair: pair[1])
        fp2.write(str(best_topic) + ' ' + str(best_prob) + '\n')
################################ OVER ####################################
运行效果:然而不知道是单篇文章数据量不够,还是哪里没弄对,不知道LDA为什么并没有给出具体的主题,也没有实现对多文档的归类功能?
1>.documents_result.txt
41 0.8911111111111114
22 0.988604651162792
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
41 0.6289422620757548
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
43 0.9711764705882348
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
28 0.9772093023255799
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
41 0.7800000000000002
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
8 0.9591666666666674
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
13 0.8775000000000017
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
13 0.9608000000000005
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
14 0.7695384615384631
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
30 0.9387500000000005
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
9 0.9683870967741941
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
38 0.8366666666666672
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
49 0.981153846153846
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
14 0.9777272727272702
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
31 0.9920325203247484
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
26 0.7549999999999999
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
35 0.9821818181818174
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
21 0.9833898305084731
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
27 0.9711764705882333
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
21 0.984444444444443
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
41 0.7525000000000003
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
8 0.986756756756759
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
19 0.9920967741935494
0 0.01999999999999999
0 0.01999999999999999
0 0.01999999999999999
45 0.9836666666666665
0 0.01999999999999999
15 0.5099999999999999
2>.topics_result.txt
(0, '0.002*"sc9" + 0.002*"此项工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成员" + 0.002*"各项" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(1, '0.002*"sc9" + 0.002*"此项工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成员" + 0.002*"各项" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(2, '0.002*"sc9" + 0.002*"此项工作" + 0.002*"semantic" + 0.002*"模型" + 0.002*"fix" + 0.002*"核心成员" + 0.002*"各项" + 0.002*"提升" + 0.002*"大部分" + 0.002*"web"')
(3, '0.002*"sc9" + 0.002*"此项工作" + 0.
更多推荐
LDA主题模型练习1
发布评论