Useful Python code for natural language processing, with a few errors (marked)


from __future__ import division  # only needed on Python 2
from nltk.book import *  # requires the book data: nltk.download('book')
# Find occurrences of a word in text1 and display them in context
text1.concordance("monstrous")
# Find words used in similar contexts
text1.similar("monstrous")
# Examine contexts shared by two or more words
text1.common_contexts(["monstrous", "very"])
# Draw a dispersion plot of the given words
text4.dispersion_plot(["citizens","democracy","freedom"])
# generate() raises an error in some NLTK 3 releases (a version issue);
# later versions restore it
text3.generate()
# Average number of occurrences of each distinct word
print(len(text3)/len(set(text3)))
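# A reusable helper for this ratio, as defined in the NLTK book:
def lexical_diversity(text):
    # average number of uses of each distinct token
    return len(text) / len(set(text))

print(lexical_diversity(text3))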
# Count how many times a word occurs in the text
print(text3.count("smote"))
# Find the index of a word in the text
print(text3.index("smote"))
print(text3[9073])
# Slicing the text
print(text3[123:178])
# Strings
name="python"
a=" ".join(['hello','python'])
print(a)
a.split(" ")
print(a)# 频率统计
fdist1=FreqDist(text1)
print(fdist1)
vocabulary1 = list(fdist1.keys())
print(vocabulary1[:50])
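# A few other useful FreqDist accessors (assuming NLTK 3):
print(fdist1.most_common(10))  # the 10 most frequent tokens with their counts
print(fdist1['whale'])         # count of a single word
print(fdist1.hapaxes()[:10])   # words that occur only once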
fdist1.plot(50, cumulative=True)
# Fine-grained selection of words
V=set(text1)
fdist1=FreqDist(text1)
long_words=[w for w in V if len(w)>15 and fdist1[w]>2]
print(long_words)
# Collocations and bigrams
print(list(bigrams(['more','is','than','done'])))
text1.collocations()  # prints the collocations itself and returns None
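# If you need the collocations as data rather than printed output, newer
# NLTK releases provide collocation_list() (check against your installed
# version):
# print(text1.collocation_list())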
# Human-computer dialogue
import nltk
nltk.chat.chatbots()
# Capture user input
# (raw_input() is gone in Python 3; use input() instead)
s = input("enter some text")
# word_tokenize needs the punkt models: nltk.download('punkt')
print("you typed", len(nltk.word_tokenize(s)), "words")
# Regular expressions
import re
import nltk
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
# print(wordlist)
# Find words ending in 'ed'
print([w for w in wordlist if re.search('ed$',w)])
# ^ anchors the start of the string, $ anchors the end
print([w for w in wordlist if re.search('^..j..t..$',w)])
# Ranges and closures: these character classes mirror the letters on
# phone keypad keys (the textonyms example)
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)])
# Extracting character blocks with regular expressions
word="asdnsjndnsudndjkdn"
print(re.findall(r'[aeiou]',word))
print(len(re.findall(r'[aeiou]', word)))
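# A related idiom from the NLTK book: use a FreqDist to count sequences of
# two or more vowels across a corpus (assumes nltk.download('treebank')):
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
                   for vs in re.findall(r'[aeiou]{2,}', word))
print(fd.most_common(12))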
# Text wrapping
from textwrap import fill
saying=['After','all','is','said','and','done']
fmt = '%s(%d)'  # avoid shadowing the built-in format()
pieces = [fmt % (word, len(word)) for word in saying]
output=' '.join(pieces)
wrapped=fill(output)
print(wrapped)
# Visualizing relationships between words
import networkx as nx
import matplotlib.pyplot
from nltk.corpus import wordnet as wn

def traverse(graph, start, node):
    # record each synset's depth and recursively add hyponym edges
    # (in NLTK 3, Synset.name is a method, hence node.name())
    graph.depth[node.name()] = node.shortest_path_distance(start)
    for child in node.hyponyms():
        graph.add_edge(node.name(), child.name())
        traverse(graph, start, child)

def hyponym_graph(start):
    G = nx.Graph()
    G.depth = {}
    traverse(G, start, start)
    return G

def graph_draw(graph):
    nx.draw(graph,
            node_size=[16 * graph.degree(n) for n in graph],
            node_color=[graph.depth[n] for n in graph],
            with_labels=False)
    matplotlib.pyplot.show()

dog = wn.synset('dog.n.01')
graph=hyponym_graph(dog)
graph_draw(graph)
# Unigram tagging
# Training
import nltk
from nltk.corpus import brown
brown_tagged_sents=brown.tagged_sents(categories='news')
brown_sents=brown.sents(categories='news')
unigram_tagger=nltk.UnigramTagger(brown_tagged_sents)
print(unigram_tagger.tag(brown_sents[2007]))
print(unigram_tagger.evaluate(brown_tagged_sents))
# Separate training and test data (evaluating on the training data
# inflates the score)
size=int(len(brown_tagged_sents)*0.9)
train_sents=brown_tagged_sents[:size]
test_sents=brown_tagged_sents[size:]
unigram_tagger=nltk.UnigramTagger(train_sents)
print(unigram_tagger.evaluate(test_sents))
# Combining taggers
t0=nltk.DefaultTagger('NN')
t1=nltk.UnigramTagger(train_sents,backoff=t0)
t2=nltk.BigramTagger(train_sents,backoff=t1)
print(t2.evaluate(test_sents))
# Storing taggers (use the pickle module in Python 3)
from pickle import dump
output=open('t2.pkl','wb')
dump(t2,output,-1)
output.close()

from pickle import load
# don't shadow the built-in input()
infile = open('t2.pkl', 'rb')
tagger = load(infile)
infile.close()
text = "I'm a small girl in a big world"
tokens=text.split()
print(tagger.tag(tokens))
# Gender identification
import nltk
nltk.download('names')
# A feature extractor: the dictionary it returns is the feature set
def gender_features(word):
    return {'last_letter': word[-1]}  # 'last_letrer' was a typo
print(gender_features('Shrek'))
# Build a list of examples paired with their class labels
from nltk.corpus import names
import random
# use a new name so the names corpus module is not shadowed
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)
# Run the feature extractor over the name data, split it into training and
# test sets, and train a naive Bayes classifier
featuresets = [(gender_features(n), g) for (n, g) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('lucy')))
# Show the most informative features
classifier.show_most_informative_features(5)
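# For large corpora, nltk.classify.apply_features builds feature sets
# lazily instead of holding them all in memory (as in the NLTK book):
from nltk.classify import apply_features
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])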
# Document classification
# Use the movie reviews corpus
import random
import nltk
from nltk.corpus import movie_reviews  # requires nltk.download('movie_reviews')
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
# Build a list of the most frequent words
all_words=nltk.FreqDist(w.lower() for w in movie_reviews.words())
# keys() is not ordered by frequency; most_common() gives the 2000 most
# frequent words
word_features = [w for (w, _) in all_words.most_common(2000)]
# Define the feature extractor
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
# print(document_features(movie_reviews.words('pos/cv957_8737.txt')))
# Train and test a classifier for document classification
featuresets=[(document_features(d),c) for (d,c) in documents]
train_set,test_set=featuresets[100:],featuresets[:100]
classifier=nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier,test_set))
classifier.show_most_informative_features(5)
# Part-of-speech tagging based on the surrounding context
def pos_features(sentence, i):
    # 'shuffix' was a typo for suffix
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
    return features

# the sentence must be tokenized first; indexing a raw string yields characters
print(pos_features("I'm a small girl in a big world".split(), 4))
# Part-of-speech tagging with a consecutive (sequence) classifier
import nltk

def pos_features(sentence, i, history):
    # history holds the tags predicted so far for this sentence
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
        features["prev-tag"] = history[i - 1]  # was missing in the original
    return features

class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):  # colon was missing
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)
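# Training and scoring on the Brown news split from above, as in the NLTK
# book (which reports accuracy around 0.79):
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))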
# Noun phrase chunking with a consecutive classifier
import nltk
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        # this line raised the error marked in the original: 'rain_set' was a
        # typo for train_set, and 'megam' needs the external megam binary
        # (algorithm='IIS' is a pure-Python, slower alternative)
        self.classifier = nltk.MaxentClassifier.train(
            train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

def npchunk_features(sentence, i, history):
    word, pos = sentence[i]  # 'word.pos' was a typo for this unpacking
    return {"pos": pos}
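# The two lines below assume chunked training data; in the NLTK book,
# train_sents and test_sents for the chunker come from the CoNLL-2000
# corpus (requires nltk.download('conll2000')):
from nltk.corpus import conll2000
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])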
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))  # print was Python 2 syntax in the original
# Named entity recognition
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')     # ne_chunk also needs the words corpus
nltk.download('treebank')  # for the tagged sentence below
sent=nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=True))
# Relation extraction
import nltk
import re
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
# requires nltk.download('ieer')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        # nltk.sem.relextract is a module, not a function; rtuple() renders
        # the relation (this was the marked error)
        print(nltk.sem.rtuple(rel))

 
