随笔(七)贝叶斯,决策树分类
python.nlp随笔(七)贝叶斯,决策树分类
回答下列问题:
(1)怎样才能识别出语言数据中明显用于分类的特征?
(2)怎样才能构建用于自动执行语言处理任务的语言模型?
(3)从这些模型中我们可以学到哪些关于语言的知识?
决策树、朴素贝叶斯分类器和最大熵(shāng)分类器
一 监督式分类
#性别鉴定
创建分类
def gender_features(word):
    """Build the feature dict used for name-gender classification.

    Args:
        word: The name to describe.

    Returns:
        A dict with a single entry, ``'last_letter'``, holding the final
        character of ``word`` (the empty string when ``word`` is empty).
    """
    # Slice rather than index so an empty name yields '' instead of raising
    # IndexError.
    return {'last_letter': word[-1:]}
- gender_features('Shrek')
- {'last_letter': 'k'}
- from nltk.corpus import names
- import random
- names = ([(name, 'male') for name in names.words('male.txt')] +
- [(name, 'female') for name in names.words('female.txt')])
- random.shuffle(names)
- import nltk
- featuresets = [ (gender_features(n), g) for (n,g) in names ]
- train_set, test_set = featuresets[500:], featuresets[:500] #训练集和测试集
- classifier = nltk.NaiveBayesClassifier.train(train_set)
- classifier.classify(gender_features('Neo'))
- 'male'
- classifier.classify(gender_features('Trinity'))
- 'female'
- print nltk.classify.accuracy(classifier, test_set) #评估
- 0.75
- classifier.show_most_informative_features(5) #哪些特征对于区分名字的性别是最有效的
- Most Informative Features
- last_letter = u'a' female : male = 33.4 : 1.0
- last_letter = u'k' male : female = 30.8 : 1.0
- last_letter = u'f' male : female = 17.3 : 1.0
- last_letter = u'p' male : female = 10.5 : 1.0
- last_letter = u'd' male : female = 10.0 : 1.0
#选择正确的特征
def gender_features2(name):
    """Build a richer (deliberately over-specified) feature dict for a name.

    Args:
        name: The name to describe.

    Returns:
        A dict containing ``"firstletter"`` and ``"lastletter"`` (lower-cased,
        empty string for an empty name) plus, for each letter a-z,
        ``"count(<letter>)"`` (number of occurrences in the lower-cased name)
        and ``"has(<letter>)"`` (whether the letter occurs at all).
    """
    # Lower-case once instead of once per letter inside the loop.
    lowered = name.lower()
    features = {
        # Slicing keeps an empty name from raising IndexError.
        "firstletter": name[:1].lower(),
        "lastletter": name[-1:].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = (letter in lowered)
    return features
- gender_features2('JJohn')
- featuresets = [(gender_features2(n), g) for (n,g) in names]
- train_set, test_set = featuresets[500:], featuresets[:500]
- classifier = nltk.NaiveBayesClassifier.train(train_set) #使用朴素贝叶斯分类器
- print nltk.classify.accuracy(classifier, test_set)
- 0.776
- #一种能有效完善特征集的方法称为错误分析。首先,选择开发集,其中包含用于创建模型的语料数据。然后将这种开发集分为训练集和开发测试集
- train_names = names[1500:]
- devtest_names = names[500:1500]
- test_names = names[:500]
- train_set = [(gender_features(n), g) for (n,g) in train_names]
- devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
- test_set = [(gender_features(n),g) for (n,g) in test_names]
- classifier = nltk.NaiveBayesClassifier.train(train_set)
- print nltk.classify.accuracy(classifier, devtest_set)
- 0.766
- errors = []
- for (name, tag) in devtest_names:
- guess = classifier.classify(gender_features(name))
- if guess != tag:
- errors.append( (tag, guess, name) )
- for (tag, guess, name) in sorted(errors):
- print 'correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name)
- correct=female guess=male name=Abagael
- correct=female guess=male name=Adel
- correct=female guess=male name=Alys
- correct=female guess=male name=Amargo
- correct=female guess=male name=Ambur
- ...
[python] view plain copy
- #调整特征提取器使其包含两个字母后缀的特征
def gender_features(word):
    """Describe a name by its one- and two-character suffixes.

    Args:
        word: The name to describe.

    Returns:
        A dict with ``'suffix1'`` (last character) and ``'suffix2'`` (last
        two characters). Names shorter than the requested suffix contribute
        whatever characters they have.
    """
    return {
        'suffix1': word[-1:],
        'suffix2': word[-2:],
    }
- train_set = [(gender_features(n), g) for (n,g) in train_names]
- devtest_set = [(gender_features(n),g) for (n,g) in devtest_names]
- classifier = nltk.NaiveBayesClassifier.train(train_set)
- print nltk.classify.accuracy(classifier, devtest_set)
将电影评论语料库归类为正面或负面
[python] view plain copy- from nltk.corpus import movie_reviews
- documents = [(list(movie_reviews.words(fileid)), category)
- for category in movie_reviews.categories()
- for fileid in movie_reviews.fileids(category)]
- random.shuffle(documents)
- #文档分类的特征提取器,其特征表示每个词是否在一个给定的文档中
# Count every corpus word case-insensitively.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the 2000 most frequent words as the feature vocabulary. Slicing
# FreqDist.keys() fails on Python 3 (dict views are not subscriptable) and,
# in NLTK 3, keys are not ordered by frequency, so request the top entries
# explicitly.
word_features = [word for word, _count in all_words.most_common(2000)]
def document_features(document, vocabulary=None):
    """Mark which vocabulary words occur in a document.

    Args:
        document: Iterable of word tokens making up the document.
        vocabulary: Iterable of words to test for. Defaults to the
            module-level ``word_features`` list.

    Returns:
        A dict mapping ``'contains(<word>)'`` to ``True``/``False`` for every
        word in the vocabulary.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set makes each membership test O(1) instead of scanning the document.
    document_words = set(document)
    return {
        'contains(%s)' % word: (word in document_words)
        for word in vocabulary
    }
- print document_features(movie_reviews.words('pos/cv957_8737.txt'))
- {u'contains(corporate)': False, u'contains(barred)': False, u'contains(batmans)': False, u'contains(menacing)': False,
- u'contains(rags)': False, u'contains(inquires)': False,
[python] view plain copy
- #训练和测试分类器以进行文档分类
- featuresets = [(document_features(d),c) for (d,c) in documents]
- train_set, test_set = featuresets[100:], featuresets[:100]
- classifier = nltk.NaiveBayesClassifier.train(train_set)
- print nltk.classify.accuracy(classifier, test_set)
- 0.73
- classifier.show_most_informative_features(5) #找出哪些特征是分类器发现的并且是最有信息量的
- Most Informative Features
- contains(sans) = True neg : pos = 9.1 : 1.0
- contains(mediocrity) = True neg : pos = 7.8 : 1.0
- contains(dismissed) = True pos : neg = 6.9 : 1.0
- contains(testament) = True pos : neg = 6.5 : 1.0
- contains(bruckheimer) = True neg : pos = 6.4 : 1.0
#词性标注
[python] view plain copy- from nltk.corpus import brown
- suffix_fdist = nltk.FreqDist()
- for word in brown.words():
- word = word.lower()
- suffix_fdist[word[-1:]] += 1
- suffix_fdist[word[-2:]] += 1
- suffix_fdist[word[-3:]] += 1
- from operator import itemgetter
- common_suffixes = sorted(suffix_fdist.items(), key=itemgetter(1), reverse=True)
- common_suffixes[:100]
- [(u'e', 202946),
- (u',', 175002),
- (u'.', 152999),
- (u's', 128722),
- (u'd', 105687),
- (u't', 94459),
- common_suf = [ suffix[0] for suffix in common_suffixes][:100]
- common_suf
def pos_features(word, suffixes=None):
    """Describe a word by which common suffixes it ends with.

    Args:
        word: The word to describe; it is compared case-insensitively.
        suffixes: Iterable of suffix strings to test. Defaults to the
            module-level ``common_suf`` list.

    Returns:
        A dict mapping ``'endswith(<suffix>)'`` to ``True``/``False`` for
        every suffix tested.
    """
    if suffixes is None:
        suffixes = common_suf
    # Lower-case once rather than once per suffix.
    lowered = word.lower()
    return {
        'endswith(%s)' % suffix: lowered.endswith(suffix)
        for suffix in suffixes
    }
- tagged_words = brown.tagged_words(categories='news')
- tagged_words[0]
- (u'The', u'AT')
- len(tagged_words)
- 100554
- len(pos_features(tagged_words[0][0]))
- 100
- pos_features(tagged_words[0][0])
- {u"endswith('')": False,
- u"endswith(')": False,
- u"endswith('s)": False,
- u'endswith(()': False,
- u'endswith())': False,
- u'endswith(,)': False,
- featuresets = [(pos_features(n),g) for (n,g) in tagged_words]
- size = int(len(featuresets) * 0.1)
- size
- Out[52]: 10055
- train_set,test_set = featuresets[size:], featuresets[:size]
- classifier = nltk.DecisionTreeClassifier.train(train_set) #决策树
- nltk.classify.accuracy(classifier, test_set)
- 0.6270512182993535
- classifier.classify(pos_features('cats'))
- Out[54]: u'NNS'
- #决策树的优点是容易解释,甚至可以将它们以伪代码形式输出
- print classifier.pseudocode(depth=4)
- if endswith(the) == False:
- if endswith(,) == False:
- if endswith(s) == False:
- if endswith(.) == False: return u'.'
- if endswith(.) == True: return u'.'
- if endswith(s) == True:
- if endswith(is) == False: return u'PP$'
- if endswith(is) == True: return u'BEZ'
- if endswith(,) == True: return u','
- if endswith(the) == True: return u'AT'
#探索上下文语境
不是只传递已标注的词,而是传递整个(未标注的)句子,以及目标词的索引
#特征检测器
def pos_features(sentence, i):
    """Describe the word at position ``i`` together with its left context.

    Args:
        sentence: Sequence of (untagged) word strings.
        i: Index of the target word within ``sentence``.

    Returns:
        A dict with the target word's 1-, 2- and 3-character suffixes and a
        ``"prev-word"`` entry: the preceding word, or ``"<START>"`` when the
        target is the first word of the sentence.
    """
    word = sentence[i]
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
    }
    if i == 0:
        # No left neighbour: use a sentinel so sentence-initial position is
        # itself a feature value.
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
    return features
- brown.sents()[0][7]
- Out[62]: u'an'
- brown.sents()[0][8]
- Out[63]: u'investigation'
- pos_features(brown.sents()[0], 8) ###### 四个特征
- {'prev-word': u'an',
- 'suffix(1)': u'n',
- 'suffix(2)': u'on',
- 'suffix(3)': u'ion'}
- tagged_sents = brown.tagged_sents(categories='news')
- featuresets = []
- for tagged_sent in tagged_sents:
- untagged_sent = nltk.tag.untag(tagged_sent)
- for i, (word, tag) in enumerate(tagged_sent):
- featuresets.append( (pos_features(untagged_sent, i), tag) )
- size = int(len(featuresets) * 0.1)
- 10055
- train_set, test_set = featuresets[size:], featuresets[:size]
- classifier = nltk.NaiveBayesClassifier.train(train_set)
- nltk.classify.accuracy(classifier, test_set)
- 0.7891596220785678
#序列分类
在词性标注的例子中,可以使用各种不同的序列分类器模型为给定的句子中的所有词选择词性标注
一种称为连续分类或贪婪序列分类的序列分类器策略,为第一个输入找到最有可能的类标签,然后在此基础上找到下一个输入的最佳的标签。这个过程可以不断重复直到所有的输入都被贴上标签。
特征提取器
def pos_features(sentence, i, history):
    """Describe the word at position ``i`` using word and tag context.

    Args:
        sentence: Sequence of (untagged) word strings.
        i: Index of the target word within ``sentence``.
        history: Tags already assigned to ``sentence[0:i]``, in order.

    Returns:
        A dict with the target word's 1-, 2- and 3-character suffixes plus
        ``"prev-word"`` and ``"prev-tag"``. Both context entries are
        ``"<START>"`` for the first word of the sentence.
    """
    word = sentence[i]
    features = {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
    }
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
        # Must be the same key as in the sentence-initial branch; a different
        # spelling would split one feature into two unrelated ones.
        features["prev-tag"] = history[i - 1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    """Greedy left-to-right tagger that feeds earlier tags back as features.

    Each word is classified with a naive Bayes model whose features come
    from ``pos_features(sentence, i, history)``, where ``history`` holds the
    tags chosen so far for the current sentence.
    """

    def __init__(self, train_sents):
        """Train the underlying classifier.

        Args:
            train_sents: Iterable of tagged sentences, each a sequence of
                ``(word, tag)`` pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (_word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                # During training the history is built from the gold tags.
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag a sentence one word at a time.

        Args:
            sentence: Sequence of word strings.

        Returns:
            A list of ``(word, tag)`` pairs, one per input word.
        """
        history = []
        for i, _word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            # At tagging time the history holds the tagger's own predictions.
            history.append(self.classifier.classify(featureset))
        # Materialise the pairs: on Python 3 a bare zip() is a one-shot
        # iterator, which callers cannot index or traverse twice.
        return list(zip(sentence, history))
- tagged_sents = brown.tagged_sents(categories='news')
- size = int(len(tagged_sents) * 0.1)
- train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
- tagger = ConsecutivePosTagger(train_sents)
- print tagger.evaluate(test_sents)
#其他序列分类方法
这种方法的缺点是一旦做出决定便无法更改。例如:如果决定将一个词标注为名词,但后来发现应该是动词,那也没有办法修复我们的错误了。解决这个问题的方法是采取转型策略。转型联合分类的工作原理是为输入的标签创建一个初始值,然后反复提炼该值,尝试修复相关输入之间的不一致
另一种方案是为词性标记所有可能的序列打分,选择总得分最高的序列。隐马尔科夫模型就采取了这种方法。隐马尔科夫模型类似于连续分类器,不光考虑输入也考虑已预测标记的历史。然而,不是简单地找出一个给定词的单个最好标签,而是为标记产生一个概率分布。然后将这些概率结合起来计算标记序列的概率得分,最后选择最高概率的标记序列。不过,可能的标签序列数量相当大。给定拥有30个标签的标记集,大约有600万亿(30^10)种方式来标记一个10个词的句子。为了避免单独考虑所有这些可能的序列,隐马尔科夫模型要求特征提取器只考虑最近的标记(或最近的n个标记,其中n是相当小的)。由于这种限制,它可以使用动态规划来有效地找出最有可能的标记序列。特别是,对每个连续的词索引i,当前的及以前的每个可能的标记都将计算得分。这种基础的方法被两个更先进的模型所采用,它们被称为最大熵马尔科夫模型和线性链条件随机场模型;但为标记序列打分用的是不同的算法。
二 监督式分类的举例
#句子分割
第一步是获得一些已被分割成句子的数据,将它转换成一种适合提取特征的形式
# Flatten the pre-segmented corpus into a single token list and record, for
# every sentence, the index of its final token.
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in sents:  # reuse the sentences loaded above instead of reloading
    tokens.extend(sent)
    offset += len(sent)
    # offset now points one past the sentence, so its last token is offset - 1.
    boundaries.add(offset - 1)
def punct_features(tokens, i):
    """Describe the punctuation token at position ``i`` and its neighbours.

    Args:
        tokens: Sequence of token strings.
        i: Index of the punctuation token. The function reads
            ``tokens[i-1]`` and ``tokens[i+1]``, so ``i`` should be an
            interior index (``0 < i < len(tokens) - 1``).

    Returns:
        A dict with four features: whether the next token starts with an
        upper-case letter, the lower-cased previous token, the punctuation
        token itself, and whether the previous token is one character long.
    """
    prev_token = tokens[i - 1]
    return {
        # NOTE(review): the key is misspelled ("capitailized"); it is kept
        # unchanged because it is a runtime feature name.
        'next-word-capitailized': tokens[i + 1][0].isupper(),
        'prevword': prev_token.lower(),
        'punct': tokens[i],
        'prev-word-is-one-char': len(prev_token) == 1,
    }
- featuresets = [(punct_features(tokens, i), (i in boundaries)) for i in range(1, len(tokens)-1) if tokens[i] in '.?!']
- size = int(len(featuresets) * 0.1)
- train_set, test_set = featuresets[size:], featuresets[:size]
- classifier = nltk.NaiveBayesClassifier.train(train_set)
- nltk.classify.accuracy(classifier, test_set)
- 0.936026936026936
def segment_sentences(words):
    """Split a flat token list into sentences with the trained classifier.

    Args:
        words: Sequence of token strings.

    Returns:
        A list of sentences, each a slice of ``words`` ending at a token the
        module-level ``classifier`` judged to be a sentence boundary. Any
        tokens left after the last detected boundary form a final sentence.
    """
    start = 0
    sents = []
    last = len(words) - 1
    # enumerate() supplies the index; iterating ``words`` directly would try
    # to unpack each token string into (i, word).
    for i, word in enumerate(words):
        # punct_features reads words[i-1] and words[i+1], so only interior
        # positions can be classified: at i == 0 the lookup would wrap to the
        # last token and at i == last it would raise IndexError. A trailing
        # punctuation token is still emitted by the flush after the loop.
        if not 0 < i < last:
            continue
        if word in '.?!' and classifier.classify(punct_features(words, i)):
            sents.append(words[start:i + 1])
            start = i + 1
    if start < len(words):
        sents.append(words[start:])
    return sents
#识别对话行为类型
表述行为的陈述句,问候,问题,回答,断言和说明都可以被认为是基于语言的行为类型。识别对话中隐含言语下的对话行为是理解谈话的重要步骤。
利用NPS聊天语料库建立一个分类器,用来识别新的即时消息帖子的对话行为类型。
[python] view plain copy- posts = nltk.corpus.nps_chat.xml_posts()[:10000] #每个帖子的XML注释
def dialogue_act_features(post):
    """Build bag-of-words features for a chat post.

    Args:
        post: The text of the post.

    Returns:
        A dict with one ``'contains(<word>)': True`` entry for each distinct
        lower-cased token produced by ``nltk.word_tokenize``.
    """
    return {
        'contains(%s)' % word.lower(): True
        for word in nltk.word_tokenize(post)
    }
- featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
- ({'contains(gay)': True,
- 'contains(im)': True,
- 'contains(left)': True,
- 'contains(name)': True,
- 'contains(now)': True,
- 'contains(this)': True,
- 'contains(with)': True},
- 'Statement') #陈述句
# Hold out the first 10% of the feature sets for testing, train a naive
# Bayes classifier on the rest, and report its accuracy on the held-out part.
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# The parenthesised form prints the same value on Python 2 and Python 3.
print(nltk.classify.accuracy(classifier, test_set))
- 0.668
#识别文字蕴涵
识别文字蕴涵(Recognizing Textual Entailment, RTE)是判断文本T内的一个给定片段是否蕴涵另一个叫做“假设”的文本。迄今为止,已经有4个RTE挑战赛,在那里共享的开发和测试数据会提供给参赛队伍。
def rte_features(rtepair):
    """Summarise a text/hypothesis pair as four overlap counts.

    Args:
        rtepair: An RTE pair object accepted by ``nltk.RTEFeatureExtractor``.

    Returns:
        A dict with the sizes of the sets returned by the extractor's
        ``overlap`` and ``hyp_extra`` methods, each queried for ``'word'``
        and for ``'ne'``.
    """
    extractor = nltk.RTEFeatureExtractor(rtepair)
    return {
        'word_overlap': len(extractor.overlap('word')),
        'word_hyp_extra': len(extractor.hyp_extra('word')),
        'ne_overlap': len(extractor.overlap('ne')),
        'ne_hyp_extra': len(extractor.hyp_extra('ne')),
    }
- rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
- extractor = nltk.RTEFeatureExtractor(rtepair)
- print extractor.text_words
- set(['Organisation', 'Shanghai', 'Asia', 'four', 'at', 'operation', 'SCO', 'Iran', 'Soviet', 'Davudi', 'fight', 'China', 'association', 'fledgling', 'was', 'that', 'republics', 'former', 'Co', 'representing', 'Russia', 'Parviz', 'central', 'meeting', 'together', 'binds', 'terrorism.'])
- print extractor.hyp_words
- set(['member', 'SCO.', 'China'])
- print extractor.overlap('word')
- set([])
- print extractor.overlap('ne')
- set(['China'])
- print extractor.hyp_extra('word')
- set(['member'])
#扩展到大型数据集
纯Python的分类不是很快,建议探索NLTK与外部机器学习包的接口技术。
三 评估
测试集
准确度
精确度和召回率
混淆矩阵
交叉验证
四 决策树
熵和信息增益
五 朴素贝叶斯分类器
潜在概率模型
零计数和平滑
非二元特征
独立的朴素性
双重计数的原因
六 最大熵分类器
最大熵模型
熵的最大化
生成式分类器对比条件分类器
七 为语言模式建模
模型告诉我们什么?
八 深入阅读
使用Weka, Mallet, TADM 和 MegaM
更多推荐
python.nlp随笔(七)贝叶斯,决策树分类
发布评论