```python
import pdb
import nltk
from nltk.corpus import twitter_samples
import numpy as np
import pandas as pd
import string
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re


def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the # sign, keep the word itself
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    tweets_clean = []
    for word in tweet_tokens:
        # keep only words that are neither stopwords nor punctuation
        if word not in stopwords_english \
                and word not in string.punctuation:
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)
    return tweets_clean
```
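As a quick sanity check, here is a minimal usage sketch (the sample tweet is made up, and the NLTK stopwords corpus must be downloaded once before calling process_tweet):

```python
# One-time setup for the stopword list used by process_tweet
nltk.download('stopwords')

# Hypothetical tweet that exercises each cleanup rule above
print(process_tweet("RT @user: I am SO happy!! #sunny https://example.com $TSLA"))
# Expected output: ['happi', 'sunni']
# (handles, stopwords, punctuation, URL, and ticker dropped; words stemmed)
```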
2 Building a dictionary whose key-value pairs are {(word, class): count}
Classifying with a Naive Bayes model requires knowing how many times each word occurs in each class. As a simple example, suppose the training set is ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired'] with corresponding labels [1, 0, 0, 0, 0]. The per-class counts of the words that appear are then: {('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}. (Note that "i" and "am" are filtered out as stopwords, and the remaining words are stemmed.)
```python
def count_tweets(result, tweets, ys):
    # result is usually passed in as an empty dict: {}
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1
    return result
```
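Running count_tweets on the toy training set above reproduces the dictionary shown earlier (a minimal sketch, assuming process_tweet is defined as in the previous block):

```python
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]

freqs = count_tweets({}, tweets, ys)
print(freqs)
# {('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}
```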