NLP (IV): Sentiment Analysis with VADER
Installing VADER
Just install it with pip.
pip install vaderSentiment
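Before wiring VADER into anything, it's worth seeing what it actually returns. A minimal sketch (the sample sentence is just an illustration): polarity_scores gives back a dict with neg, neu, pos, and compound fields.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
# returns a dict of the form {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...};
# 'compound' is a normalized overall score in [-1, 1]
print(analyzer.polarity_scores("I love fishing on a sunny day!"))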
Computing the ratio of positive to negative sentiment in text
The code below returns the list of positive sentences and the list of negative sentences, plus the average compound score over all input sentences. The thresholds follow VADER's usual convention: a compound score of at least 0.05 counts as positive, at most -0.05 as negative.
import nltk
# 'stopwords' (and 'punkt', used by the tokenizers) must be downloaded before first use
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import string
def computeSentimentOfSentences(sentenceData):
    # Input: a list of sentences from tweets
    # Output: the positive sentences, the negative sentences, and the
    # average compound score over all the input sentences
    sid_obj = SentimentIntensityAnalyzer()
    pos_sents = []
    neg_sents = []
    compound_sum = 0.0
    for sentence in sentenceData:
        sentiment_dict = sid_obj.polarity_scores(sentence)
        compound_sum += sentiment_dict['compound']
        if sentiment_dict['compound'] >= 0.05:
            pos_sents.append(sentence)
        elif sentiment_dict['compound'] <= -0.05:
            neg_sents.append(sentence)
    return pos_sents, neg_sents, compound_sum / len(sentenceData)
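A quick sanity check on a few made-up sentences (the exact scores depend on VADER's lexicon, but the split should look like this):

sample = ["I love this game!",
          "That was a terrible match.",
          "The stadium is in London."]
pos, neg, avg = computeSentimentOfSentences(sample)
print(pos)  # expected: the "love" sentence (compound >= 0.05)
print(neg)  # expected: the "terrible" sentence (compound <= -0.05)
print(avg)  # mean compound over all three sentences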
Removing stop words and punctuation
Stop words are, roughly, the very common words that carry no real meaning on their own: I, is, a, the, and the like.
The code below returns the word list of a tweet sentence with the stop words and punctuation removed.
def removeStopWords(sentence):
    # Input: a sentence from a tweet
    # Output: the lowercased words of the sentence, with stop words
    # and punctuation removed
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence)
    filtered_sentence = [w.lower() for w in tokens if w.lower() not in stop_words]
    removed_punct_sentence = [w for w in filtered_sentence if w not in string.punctuation]
    return removed_punct_sentence
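For example, on a made-up sentence:

print(removeStopWords("The fish are not biting at the lake today!"))
# roughly ['fish', 'biting', 'lake', 'today']: the stop words and the '!' are gone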
Putting it all together
With the pieces above in place, we can run the analysis. The code below gives us the positive and negative sentences across all tweets, an overall sentiment reading for each of the two topics (football and fishing), and the most frequent non-stop words.
fishSentences = []
footballSentences = []
# rawTweetDictFish / rawTweetDictFootball map tweet IDs to raw tweet text
# (collected in the earlier parts of this series)
for v in rawTweetDictFish.values():
    fishSentences = fishSentences + nltk.tokenize.sent_tokenize(v)
posFish, negFish, fishCompound = computeSentimentOfSentences(fishSentences)

for v in rawTweetDictFootball.values():
    footballSentences = footballSentences + nltk.tokenize.sent_tokenize(v)
posFootball, negFootball, footballCompound = computeSentimentOfSentences(footballSentences)

print("The positive ratio of fishing is " + str(len(posFish) / len(fishSentences)))
print("The negative ratio of fishing is " + str(len(negFish) / len(fishSentences)))
print("The average compound of fishing is " + str(fishCompound))
print("The positive ratio of football is " + str(len(posFootball) / len(footballSentences)))
print("The negative ratio of football is " + str(len(negFootball) / len(footballSentences)))
print("The average compound of football is " + str(footballCompound))

# count how often each non-stop word appears across both topics
all_sentences = fishSentences + footballSentences
frequency_dict = {}
for sent in all_sentences:
    filtered_words = removeStopWords(sent)
    for w in filtered_words:
        if w in frequency_dict:
            frequency_dict[w] += 1
        else:
            frequency_dict[w] = 1
sorted_items = sorted(frequency_dict.items(), key=lambda x: x[1], reverse=True)

top10_non_stop_words = []
idx = 0
while len(top10_non_stop_words) < 10:
    w = sorted_items[idx][0]
    # skip tokenizer fragments that survive the single-character punctuation filter
    if w not in ["’", "'s", "n't"]:
        top10_non_stop_words.append(w)
    idx += 1
print("Top 10 non-stop words are: ")
print(top10_non_stop_words)
The output:
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
The positive ratio of fishing is 0.34902014904775047
The negative ratio of fishing is 0.19748826939000827
The average compound of fishing is 0.09029978608887657
The positive ratio of football is 0.3855461445385546
The negative ratio of football is 0.21664283357166428
The average compound of football is 0.09671761632383463
Top 10 non-stop words are:
['football', 'fishing', 'like', 'game', 'go', 'team', 'good', 'get', 'night', 'time']
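As an aside, the hand-rolled frequency dictionary above could be replaced with collections.Counter from the standard library, which handles both the counting and the sorting; a sketch under the same assumptions:

from collections import Counter

counter = Counter()
for sent in all_sentences:
    counter.update(removeStopWords(sent))
# most_common() yields (word, count) pairs in descending order of count;
# over-fetch a little so ten words survive the fragment filter
top10 = [w for w, _ in counter.most_common(20)
         if w not in ["’", "'s", "n't"]][:10]
print(top10)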