Python-Based Text Analysis and Information Visualization


Table of Contents

Preface

Key Techniques

Code

1. TF-IDF Keyword Extraction

2. News Summarization

3. Automatic Summarization (TextRank)

4. Sentiment Analysis

5. Word-Frequency Word Cloud

6. Word-Frequency Bar Chart

Summary


Preface

This program performs text analysis, including word-cloud statistics and display, automatic news summarization, word-frequency counting, and bar-chart visualization of the word frequencies. The code is fairly basic. (The data files referenced in the code are not included with this article; message me if you need them, or you can download them online.)

Key Techniques

  1. matplotlib for drawing bar charts
  2. wordcloud for drawing word clouds
  3. jieba for word segmentation and counting
  4. collections.Counter for counting how often elements occur (a minimal end-to-end sketch follows this list)
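
Before the full project code, here is a minimal sketch of how these four pieces fit together. It is my own illustration rather than part of the project, and the sample sentence is made up.

import jieba
from collections import Counter
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # needed so Chinese labels render (Windows font)

text = "文本分析离不开分词和词频统计,词频统计又支撑了词云和柱状图。"  # made-up sample sentence
words = [w for w in jieba.lcut(text) if len(w) > 1]  # segment and drop single-character tokens
freq = Counter(words)                                # count occurrences of each word

top = freq.most_common(5)                            # five most frequent words
plt.bar([w for w, _ in top], [c for _, c in top])
plt.show()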

Code

1. TF-IDF Keyword Extraction

import jieba
import jieba.analyse

file = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r+', encoding='utf-8')
txt = file.read()
file.close()

keywords = jieba.analyse.extract_tags(txt, topK=20, withWeight=True, allowPOS=('n', 'nr', 'ns', 'v', 'vn'))
for item in keywords:
    print(item[0], item[1])
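
jieba.analyse.extract_tags ranks the keywords of a single document by TF-IDF weight. If the goal is to compare two texts for similarity with TF-IDF, one common approach is to vectorize both texts and take the cosine similarity. The sketch below is my own illustration using scikit-learn's TfidfVectorizer (not part of the original code), and the two sample strings are made up.

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

doc1 = "经济发展带动了就业增长"  # made-up sample text
doc2 = "就业增长反映了经济发展"  # made-up sample text

# TfidfVectorizer expects space-separated tokens, so segment with jieba first
corpus = [' '.join(jieba.lcut(d)) for d in (doc1, doc2)]
tfidf = TfidfVectorizer().fit_transform(corpus)
print(cosine_similarity(tfidf[0], tfidf[1])[0][0])  # value in [0, 1]; larger means more similar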

2. News Summarization

import re  # split the document into sentences
import os  # get file paths
import jieba  # word segmentation
import numpy
from sklearn.metrics import pairwise_distances  # compute text similarity
from sklearn.feature_extraction.text import CountVectorizer  # convert text into count vectors


def summary(path, num_summary=2):
    '''
    Purpose: generate a text summary
    Parameters:
        path: path to the document
        num_summary: length of the summary (number of sentences)
    Returns:
        result: the summary
    '''
    # Load the text
    # cwd = os.getcwd()
    contents = ''
    with open(path, 'r', encoding='utf-8') as file:
        contents = file.read().strip()
    # Split into sentences
    subCorpus = [contents] + re.split('[。?!\n]', contents)
    # Load the stop words
    stop_words_path = r'C:\Users\lenovo\Desktop\4文本分析\stop.txt'
    stop_words = set()
    with open(stop_words_path, 'r', encoding='utf-8') as sw:
        [stop_words.add(line.strip()) for line in sw.readlines()]
    # Word segmentation
    segments = []
    clean_subCorpus = []
    for content in subCorpus:
        segs = jieba.cut(content)  # tokenize; returns an iterator
        segment = ' '.join(segs)   # join the tokens into one string
        if len(segment.strip()) >= 5:  # drop sentences shorter than 5 characters
            segments.append(segment.strip())
            clean_subCorpus.append(content.strip())
    # Text vectors
    countVectorizer = CountVectorizer(stop_words=stop_words)  # pass the stop-word set
    textVector = countVectorizer.fit_transform(segments)  # e.g. shape=(10, 89)
    # Text similarity
    distance_matrix = pairwise_distances(textVector, metric='cosine')  # smaller value means more similar
    # Build the summary
    sort_index = numpy.argsort(distance_matrix[0])  # ascending by distance to the whole document
    num_summary = min(len(clean_subCorpus), num_summary + 1)
    summarys = []  # summary sentences
    sorts = []     # selected indices
    for i in range(1, num_summary):
        sorts.append(sort_index[i])
    sorts_ix = numpy.argsort(sorts)
    for ix in sorts_ix:
        summarys.append(clean_subCorpus[sorts[ix]])
    result = '。'.join(summarys)
    return result


path = r'C:\Users\lenovo\Desktop\4文本分析\报告.txt'
summary(path, num_summary=3)
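
The core idea in summary() is to treat the whole document as the first "sentence", vectorize every sentence with CountVectorizer, and pick the sentences whose cosine distance to the whole document is smallest. Below is a stripped-down sketch of that idea on toy English sentences; it is my own illustration and does not depend on the report file or stop-word list used above.

import numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances

sentences = [
    "the cat sat on the mat",
    "dogs and cats are common pets",
    "stock prices rose sharply today",
]
docs = [' '.join(sentences)] + sentences          # index 0 is the full document
vectors = CountVectorizer().fit_transform(docs)
dist = pairwise_distances(vectors, metric='cosine')

order = numpy.argsort(dist[0])                    # ascending distance to the full document
best = order[1]                                   # skip index 0 (the document itself)
print(sentences[best - 1])                        # the most representative sentence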

3. Automatic Summarization (TextRank)

# TextRank automatic summarization
import re
import jieba
import numpy as np
import jieba.analyse
from numpy import *
from collections import Counter


def load_stop_words():
    global stopwords
    with open(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt', "r", encoding="utf-8") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].replace("\n", "")


def cosine_similarity(sentence1, sentence2):
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    vocab_list = list(set(sen1_vocab_list + sen2_vocab_list))
    sen1_vec = np.zeros(len(vocab_list))
    sen2_vec = np.zeros(len(vocab_list))
    for i in range(len(vocab_list)):
        sen1_vec[i] += Counter(sen1_vocab_list)[vocab_list[i]]
        sen2_vec[i] += Counter(sen2_vocab_list)[vocab_list[i]]
    cos_sim = float(np.sum(sen1_vec * sen2_vec)) / (np.linalg.norm(sen1_vec) * np.linalg.norm(sen2_vec))
    return cos_sim


def log_similarity(sentence1, sentence2):
    sen1_vocab_list = jieba.lcut(sentence1, cut_all=False)
    sen2_vocab_list = jieba.lcut(sentence2, cut_all=False)
    if len(sen1_vocab_list) == 1 and len(sen2_vocab_list) == 1:
        return 0.0
    count = 0
    for word in sen1_vocab_list:
        if word in sen2_vocab_list:
            count += 1
    log_sim = count / (log(len(sen1_vocab_list)) + log(len(sen2_vocab_list)))
    return log_sim


class GenerateAbstract():
    @classmethod
    def get_corpus_sentence_list(cls, corpus_list):
        punch = r',|/|;|\'|`|<|>|\?|:|\{|\}|\~|!|@|#|\$|%|\^|&|=|\_|\+|,|。|;|【|】|!| |…'
        sentence_list = []
        for i in range(len(corpus_list)):
            sentence_list.append([])
            sentence_list[i] = re.split(punch, corpus_list[i])
            if "" in sentence_list[i]:
                sentence_list[i].remove("")
        return sentence_list

    @classmethod
    def get_abstract(cls, corpus_sentence_list, **const):
        cossim_range = const["sim_range"]
        iters = const["iters"]
        method = const["sim_method"]
        page = 1
        for sentence_list in corpus_sentence_list:
            abstract_num = const["abstract_num"]
            l = len(sentence_list)
            if l < abstract_num:
                abstract_num = l
            sen_mat = np.zeros(l * l).reshape(l, l)
            for i in range(len(sentence_list)):
                for j in range(len(sentence_list)):
                    if i != j:
                        if method == "log":
                            cos_sim = log_similarity(sentence_list[i], sentence_list[j])
                        elif method == "cos":
                            cos_sim = cosine_similarity(sentence_list[i], sentence_list[j])
                        if cos_sim > cossim_range:  # connect two sentences when their similarity exceeds the threshold
                            sen_mat[i][j] += cos_sim
            PR_mat = np.array(ones(l)).reshape(l, 1)
            for i in range(iters):
                PR_mat = 0.15 + 0.85 * sen_mat.dot(PR_mat)  # iterate the PageRank-style update
            res_dic = {}
            for i in range(len(PR_mat)):
                res_dic.update({sentence_list[i]: float(PR_mat[i][0])})
            res_dic = sorted(res_dic.items(), key=lambda x: x[1], reverse=True)  # higher PR value means more important
            abstract_list = []
            abstract_str = ""
            news_str = ""
            for i in range(abstract_num):
                abstract_list.append(res_dic[i][0])
            for sentence in sentence_list:
                if sentence in abstract_list:
                    abstract_list.remove(sentence)
                    abstract_str += sentence + "。"
            for i in range(l):
                if i < l - 1:
                    news_str += sentence_list[i] + ","
                else:
                    news_str += sentence_list[i] + "。"
            print("新闻{num}(本身新闻长度{len_sen},摘要长度{abs_num}):\n原文:\n{news}\n摘要:\n{abstract}\n".format(
                num=page, abstract=abstract_str, abs_num=abstract_num, len_sen=len(sentence_list), news=news_str))
            page += 1


if __name__ == "__main__":
    with open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", "r", encoding="utf-8") as f:
        news_list = f.readlines()
    for i in range(len(news_list)):
        news_list[i] = news_list[i].replace("\n", "")
    corpus_sentence_list = GenerateAbstract.get_corpus_sentence_list(news_list)
    GenerateAbstract.get_abstract(corpus_sentence_list, sim_range=0.2, iters=700, abstract_num=8, sim_method="cos")
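
If you only need keywords rather than whole summary sentences, jieba also ships a built-in TextRank keyword extractor, which is much shorter than the class above. A minimal sketch (the input string is made up):

import jieba.analyse

text = "人工智能推动了文本分析技术的发展,文本分析又促进了信息可视化。"  # made-up sample text
for word, weight in jieba.analyse.textrank(text, topK=5, withWeight=True, allowPOS=('n', 'vn', 'v')):
    print(word, weight)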

4. Sentiment Analysis

# -*- coding: utf-8 -*-
"""
Created on Wed May  3 16:25:05 2017
Sentiment analysis (adapted from a reference implementation)
@author: chuc
"""
from collections import defaultdict

import jieba

"""
1. Text segmentation
"""


def sent2word(sentence):
    """
    Segment a sentence into words.
    (Stop-word removal is left commented out below.)
    """
    jieba.load_userdict(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt")
    segList = jieba.cut(sentence)
    segResult = []
    for w in segList:
        segResult.append(w)
    '''
    f = open('motion/stopword.txt')
    stopwords = f.readlines()
    f.close()
    newSent = []
    for word in segResult:
        if word in stopwords:
            # print("stopword: %s" % word)
            continue
        else:
            newSent.append(word)
    '''
    return segResult


"""
2. Sentiment localization
"""


def classifyWords(wordDict):
    # (1) Sentiment words
    f = open(r'motion/BosonNLP_sentiment_score.txt', encoding='utf-8')
    senList = f.readline()
    senDict = defaultdict()
    while senList:
        # senDict.append(senList.split())
        senDict[senList.split(' ')[0]] = senList.split(' ')[1]
        senList = f.readline()
    f.close()
    # (2) Negation words
    g = open('motion/notDict.txt', encoding='utf-8')
    notList = g.readline()
    notDic = []
    while notList:
        notDic.append(notList)
        notList = g.readline()
    g.close()
    # (3) Degree adverbs
    f = open('motion/degree.txt')
    degreeList = f.readline()
    degreeDict = defaultdict()
    while degreeList:
        degreeDict[degreeList.split()[0]] = degreeList.split()[1]
        degreeList = f.readline()
    f.close()
    senWord = defaultdict()
    notWord = defaultdict()
    degreeWord = defaultdict()
    t = 0
    for word in wordDict:
        print(word)
        if word in senDict.keys() and word not in notDic and word not in degreeDict.keys():
            senWord[t] = senDict[word]
        elif word in notDic[0] and word not in degreeDict.keys():
            notWord[t] = -1
        elif word in degreeDict.keys():
            degreeWord[t] = degreeDict[word]
        t = t + 1
    # print(senWord, notWord, degreeWord)
    return senWord, notWord, degreeWord


'''
Calculate the sentence score
'''


def score(sen, no, degree, word):
    score = 0
    for i in range(len(word)):
        if i in no.keys() and i + 1 in sen.keys():
            sen[i + 1] = float(no[i]) * float(sen[i + 1])
        elif i in degree.keys() and i + 1 in no.keys() and i + 2 in sen.keys():
            sen[i + 2] = float(no[i]) * float(sen[i + 2]) * float(degree[i])
        elif i in degree.keys() and i + 1 in sen.keys():
            sen[i + 1] = float(degree[i]) * float(sen[i + 1])
        elif i in degree.keys() and i + 1 in degree.keys():
            sen[i] = float(degree[i]) * float(degree[i + 1])
    # combine the scores of the different phrase patterns
    for j in sen.keys():
        score = score + float(sen[j])
    return score


def culate(sentences):
    sp = sent2word(sentences)
    d, dd, ddd = classifyWords(sp)
    score1 = score(d, dd, ddd, sp)
    return score1
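
The functions above depend on three lexicon files (BosonNLP_sentiment_score.txt, notDict.txt, degree.txt) under a motion/ directory, so they will not run without those resources. For a quick sanity check that needs no lexicons, one alternative is the third-party snownlp package; the sketch below is my own, is not part of the original code, and requires installing snownlp separately.

# pip install snownlp
from snownlp import SnowNLP

s = SnowNLP("这个项目做得非常好")  # made-up sample sentence
print(s.sentiments)                # probability of positive sentiment, between 0 and 1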

5. Word-Frequency Word Cloud

To run this part you first need to install the wordcloud library; it cannot be downloaded directly from within PyCharm, but installation instructions (for example, pip install wordcloud) are easy to find on CSDN.

import jieba
import matplotlib.pyplot as plt
import wordcloud
import numpy
from PIL import Image
import matplotlib
import jieba.posseg as psg
# import matplotlib.colors as colors  # for image-related handling

matplotlib.rcParams['font.sans-serif'] = ['SimHei']


# Read the text
def read_txt():
    file = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r+', encoding='utf-8')
    txt = file.read()
    file.close()
    return txt


# Part-of-speech statistics (written to a file)
def sda():
    import jieba.posseg as psg
    text = open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", encoding='utf-8', errors='ignore').read()
    seg = psg.cut(text)
    file = open(r"C:\Users\lenovo\Desktop\4文本分析\词性.txt", 'a+')
    for ele in seg:
        file.writelines(ele)


# Stop-word list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords


# Segment words and write the word-frequency statistics to a file
def write_txt():
    words = jieba.lcut(read_txt())  # segment the text in precise mode
    counts = {}  # store each word and its count as key-value pairs
    stopwords = stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt')
    for word in words:
        if len(word) == 1:  # ignore single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1  # increment the count each time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending
    f = open("词频统计.txt", "w")  # write to file
    for i in range(len(items)):
        word, count = items[i]
        f.writelines("{0:<5}{1:>5}\n".format(word, count))
    f.close()


# Generate the word cloud
def creat_wordcloud():
    f_0 = open("词频统计.txt", 'r')
    # bg_pic = plt.imread(r'C:\Users\lenovo\Desktop\4文本分析\中国地图.png')
    # Open the background image
    color_mask = numpy.array(Image.open(r'C:\Users\lenovo\Desktop\4文本分析\中国地图.png'))
    # Custom text colors
    # colormaps = colors.ListedColormap(['#FF0000', '#FF7F50', '#FFE4C4'])
    text = f_0.read()
    f_0.close()
    wcloud = wordcloud.WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        background_color="white",  # background color, black by default
        max_words=500,
        mask=color_mask,  # mask that defines the shape
        # colormap=colormaps,  # custom colors
        width=1000,   # width
        height=860,   # height
        margin=2,
    ).generate(text)
    # Show the word cloud
    wcloud.to_file("词云.jpg")  # save the word cloud image
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    sda()
    # Generate a word cloud with the default style
    # mywc1 = WordCloud().generate(tokenstr)


def main():
    write_txt()
    creat_wordcloud()


if __name__ == '__main__':
    main()

First, import a series of libraries. One of Python's strengths is its wealth of third-party libraries, which greatly simplifies writing the program. The same libraries are needed for other word-cloud visualization work as well.

import jieba
import matplotlib.pyplot as plt
import wordcloud
import numpy
from PIL import Image
import matplotlib
import jieba.posseg as psg
# import matplotlib.colors as colors  # for image-related handling

Here, jieba is used for word segmentation, matplotlib draws the bar chart (it is required for the bar chart), wordcloud is the core library for drawing the word cloud, and PIL (Python Imaging Library) is the standard image-processing library for the Python platform.

For the word cloud, the first step is to load our txt file.

# Read the text
def read_txt():
    file = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r+', encoding='utf-8')
    txt = file.read()
    file.close()
    return txt

The path here is the absolute path of the file. The program may report errors at this point; solutions for this kind of error can also be found on CSDN.
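
If the open() call fails (typically a FileNotFoundError for a wrong path or a UnicodeDecodeError for a mismatched encoding), a slightly more defensive variant is possible. The sketch below is my own and not the original function; it uses a with block and ignores undecodable bytes.

def read_txt(path=r'C:\Users\lenovo\Desktop\4文本分析\报告.txt'):
    # The with statement closes the file even if an exception is raised
    with open(path, 'r', encoding='utf-8', errors='ignore') as file:
        return file.read()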

Next come the part-of-speech statistics and the word-frequency statistics generated after segmentation: the part-of-speech results are written to a file, and a stop-word list is loaded so it can be used to filter the counts.

# Part-of-speech statistics (written to a file)
def sda():
    import jieba.posseg as psg
    text = open(r"C:\Users\lenovo\Desktop\4文本分析\报告.txt", encoding='utf-8', errors='ignore').read()
    seg = psg.cut(text)
    file = open(r"C:\Users\lenovo\Desktop\4文本分析\词性.txt", 'a+')
    for ele in seg:
        file.writelines(ele)


# Stop-word list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords


# Segment words and write the word-frequency statistics to a file
def write_txt():
    words = jieba.lcut(read_txt())  # segment the text in precise mode
    counts = {}  # store each word and its count as key-value pairs
    stopwords = stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt')
    for word in words:
        if len(word) == 1:  # ignore single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1  # increment the count each time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending
    f = open("词频统计.txt", "w")  # write to file
    for i in range(len(items)):
        word, count = items[i]
        f.writelines("{0:<5}{1:>5}\n".format(word, count))
    f.close()
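
write_txt() counts words with a plain dict. Since collections.Counter is already listed among the key techniques, the counting step could also be written more compactly with it; the sketch below is my own and assumes the same read_txt and stopwordslist helpers defined above.

from collections import Counter
import jieba

def count_words():
    stopwords = set(stopwordslist(r'C:\Users\lenovo\Desktop\4文本分析\stop.txt'))
    words = [w for w in jieba.lcut(read_txt()) if len(w) > 1 and w not in stopwords]
    return Counter(words)  # Counter.most_common() already returns items sorted by frequency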

Once that is done, the word cloud can be generated. When generating it you can choose the shape of the cloud, which requires importing an image from outside.

Generally we do not want a plain rectangular word cloud but one with a shape, which requires another package, numpy. NumPy is an open-source numerical-computing extension for Python that can store and process large matrices. Here the shape image is represented as a large matrix, and words are filled in wherever the image has color (import it with import numpy as np). After importing it, add a mask layer; the mask restricts the shape of the generated image.
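
To make the "large matrix" point concrete: the mask is just a NumPy array of pixel values, and wordcloud treats pure-white regions of the mask as masked out, drawing words only in the remaining area. A small sketch of my own (the image path is the one used in this article; substitute your own shape image):

import numpy as np
from PIL import Image

mask = np.array(Image.open(r'C:\Users\lenovo\Desktop\4文本分析\地图.png'))  # any black-on-white shape image
print(mask.shape)  # e.g. (height, width, channels): one value per pixel
# wordcloud.WordCloud(mask=mask, ...) only places words where the mask is not pure white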

# Generate the word cloud
def creat_wordcloud():
    f_0 = open("词频统计.txt", 'r')
    # bg_pic = plt.imread(r'C:\Users\lenovo\Desktop\4文本分析\地图.png')
    # Open the background image
    color_mask = numpy.array(Image.open(r'C:\Users\lenovo\Desktop\4文本分析\地图.png'))
    # Custom text colors
    # colormaps = colors.ListedColormap(['#FF0000', '#FF7F50', '#FFE4C4'])
    text = f_0.read()
    f_0.close()
    wcloud = wordcloud.WordCloud(
        font_path=r"C:\Windows\Fonts\simhei.ttf",
        background_color="white",  # background color, black by default
        max_words=500,
        mask=color_mask,  # mask that defines the shape
        # colormap=colormaps,  # custom colors
        width=1000,   # width
        height=860,   # height
        margin=2,
    ).generate(text)
    # Show the word cloud
    wcloud.to_file("词云.jpg")  # save the word cloud image
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    sda()
    # Generate a word cloud with the default style
    # mywc1 = WordCloud().generate(tokenstr)
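
If the mask shape is hard to recognize in the output, recent wordcloud releases support contour_width and contour_color parameters that outline the mask. A sketch of my own, reusing the color_mask and text variables from the function above:

wcloud = wordcloud.WordCloud(
    font_path=r"C:\Windows\Fonts\simhei.ttf",
    background_color="white",
    mask=color_mask,
    contour_width=2,           # draw an outline around the mask shape
    contour_color='steelblue',
).generate(text)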

Finally, the main function ties everything together; no further description is needed.

def main():
    write_txt()
    creat_wordcloud()


if __name__ == '__main__':
    main()

6. Word-Frequency Bar Chart

This part is closely tied to the word cloud above: the size of a word in the cloud reflects how frequently it appears in the report, and this part makes those frequencies more direct and quantitative.

from matplotlib.font_manager import FontProperties
from collections import Counter
from pylab import *
import jieba.posseg as psg

mpl.rcParams['font.sans-serif'] = ['SimHei']  # allow Chinese characters on the axes
mpl.rcParams['axes.unicode_minus'] = False    # display the minus sign correctly

font = FontProperties(size=14)
f3 = open(r'C:\Users\lenovo\Desktop\4文本分析\报告.txt', 'r', encoding='utf-8').read()
nowords = ['x', 'uj', 'a', 'ul', 'p', 'd', 'v', 'zg', 'm', 'ug', 'i', 'f', 'ad', 'nz', 'r', 'ns', 'q', 't', 'c']

wods = [x.word for x in psg.cut(f3) if len(x.word) >= 2 and (x.flag) not in nowords]
word_count = Counter(wods)
# print(word_count)

x = [x[0] for x in word_count.most_common(20)]  # the top-20 keywords
y = [x[1] for x in word_count.most_common(20)]  # how often each of the top-20 keywords appears
fig = plt.figure()
plt.grid(False)
# c = np.random.randint(0,1,len(y))
plt.bar(x, y, color='lightskyblue')
plt.xlabel('关键词', fontproperties=font)
plt.ylabel('词频', fontproperties=font)
plt.title('词频分析柱状图', fontproperties=font)
plt.show()
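
With 20 Chinese keywords on the x axis the labels can overlap. An optional small tweak of my own, placed right before plt.show() in the script above, rotates the tick labels and tightens the layout:

plt.xticks(rotation=45, fontproperties=font)  # rotate the keyword labels so they do not overlap
plt.tight_layout()                            # avoid clipping the rotated labels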

Summary

This project is quite basic. I am a beginner myself and consider it a fairly simple project; feel free to use the code as a reference for your own practice. The project is intended only for technical exchange and learning, and suggestions for improvement are welcome so that we can make progress together.

This is my first blog post; if anything here is wrong, I welcome corrections from more experienced readers.

