This commit is contained in:
lufo 2014-12-27 20:24:54 +08:00
parent 21a7aa3264
commit 266ed03049
2 changed files with 54 additions and 62 deletions

55
LDA.py
View File

@ -159,7 +159,8 @@ def get_sim(t, i, j, row_normalized_dt):
'''
获得sim(i,j)
'''
return 1.0 - abs(row_normalized_dt[i][t] - row_normalized_dt[j][t])
sim = 1.0 - abs(row_normalized_dt[i][t] - row_normalized_dt[j][t])
return sim
def get_Pt(t, samples, tweets_list, friends_tweets_list, row_normalized_dt, relationship):
@ -175,13 +176,27 @@ def get_Pt(t, samples, tweets_list, friends_tweets_list, row_normalized_dt, rela
if friends_tweets != 0:
temp.append(float(tweets_list[j]) / float(friends_tweets) * get_sim(t, i, j, row_normalized_dt))
else:
temp.append(get_sim(t, i, j, row_normalized_dt))
temp.append(0.0)
else:
temp.append(0)
temp.append(0.0)
Pt.append(temp)
return Pt
def get_TRt(gamma, Pt, Et):
    '''
    Compute TRt, the per-user influence vector for topic t.

    Starting from the topic's normalized user-topic distribution Et,
    iterates the update

        TR <- gamma * (Pt . TR) + (1 - gamma) * TR

    for a fixed 100 rounds.

    NOTE(review): the TwitterRank paper's teleport term is
    (1 - gamma) * Et, while this code re-uses the previous iterate;
    kept as-is to preserve behavior -- confirm intended.

    :param gamma: damping factor weighting the transition term
    :param Pt: samples x samples transition matrix for topic t
    :param Et: length-samples initial distribution for topic t
    :return: (samples, 1) numpy array of influence scores
    '''
    # Plain ndarrays replace the deprecated np.mat/np.matrix API;
    # downstream callers already re-wrap the result with np.array().
    transition = np.asarray(Pt, dtype=float)
    tr = np.asarray(Et, dtype=float).reshape(-1, 1)  # column vector
    # Fixed iteration count; the norm-based convergence test was left
    # commented out in the original, so the 100 rounds are kept.
    for _ in range(100):
        tr = gamma * transition.dot(tr) + (1 - gamma) * tr
    return tr
def twitter_rank():
doc_list = []
samples = 100
@ -197,8 +212,8 @@ def twitter_rank():
for vocab in vocab_list:
temp.append(doc.count(vocab))
x.append(temp)
topics = 5
model = lda.LDA(n_topics=topics, n_iter=500, random_state=1)
topics = 10
model = lda.LDA(n_topics=topics, n_iter=1000, random_state=1)
model.fit(np.array(x))
# topic为i行j列arrayi为主题数j为特征数Xij表示第i个主题中特征j出现的次数
topic_word = model.topic_word_
@ -206,20 +221,17 @@ def twitter_rank():
for i, topic_dist in enumerate(topic_word):
topic_words = np.array(vocab_list)[np.argsort(topic_dist)][:-n_top_words:-1]
print('Topic {}: {}'.format(i, ' '.join(topic_words)))
dt = np.mat(model.ndz_) * np.mat(model.nzw_)
dt = np.mat(model.ndz_)
print dt.shape
row_normalized_dt = normalize(dt)
col_normalized_dt_array = np.array(np.mat(normalize(dt.transpose())).transpose())
# col_normalized_dt为dt每列归一化的转置之所以取转置是为了取dt的归一化矩阵的每一行更方便
col_normalized_dt_array = np.array(normalize(dt.transpose()))
col_normalized_dt = col_normalized_dt_array.reshape(col_normalized_dt_array.shape).tolist()
tweets_list = []
fr = open('number_of_tweets.txt')
for line in fr.readlines():
tweets_list.append(int(line))
fr.close()
friends_tweets_list = []
fr = open('number_of_friends_tweets.txt')
for line in fr.readlines():
friends_tweets_list.append(int(line))
fr.close()
# relationship i行j列,relationship[i][j]=1表示j关注i
relationship = []
for i in range(1, samples + 1):
@ -229,8 +241,23 @@ def twitter_rank():
temp.append(int(line))
fr.close()
relationship.append(temp)
Pt = get_Pt(0, samples, tweets_list, friends_tweets_list, row_normalized_dt, relationship)
print Pt
friends_tweets_list = [0 for i in range(samples)]
for j in range(samples):
for i in range(samples):
if relationship[i][j] == 1:
friends_tweets_list[j] += tweets_list[i]
print friends_tweets_list
user = []
fr = open('result.txt')
for line in fr.readlines():
user.append(line)
TR = []
for i in range(topics):
Pt = get_Pt(i, samples, tweets_list, friends_tweets_list, row_normalized_dt, relationship)
Et = col_normalized_dt[i]
TR.append(np.array(get_TRt(0.02, Pt, Et)).reshape(-1, ).tolist())
print user[TR[i].index(max(TR[i]))]
print TR
def main():

View File

@ -16,19 +16,18 @@ def get_tweet(api, user_list):
:param user_list: 前100用户列表
'''
i = 0
#for user in user_list:
user=user_list[1]
i += 1
fw = open('tweet_cont/tweet_cont_%d.txt' % i, 'w+')
public_tweets = api.user_timeline(id=user, count=200)
for tweet in public_tweets:
for char in tweet.text:
if char.isalnum() and char.encode("utf-8") in string.printable:
print char
fw.write(char)
elif char.isspace():
fw.write('\n')
fw.close()
for user in user_list:
i += 1
fw = open('tweet_cont/tweet_cont_%d.txt' % i, 'w+')
public_tweets = api.user_timeline(id=user, count=200)
for tweet in public_tweets:
for char in tweet.text:
if char.isalnum() and char.encode("utf-8") in string.printable:
print char
fw.write(char)
elif char.isspace():
fw.write('\n')
fw.close()
def get_tweets_count(api, user_list):
@ -66,39 +65,6 @@ def get_friends(api, user_list):
return friend_list
def get_friends_tweets_count(api, user_list):
    '''
    Estimate the number of tweets posted by the people each user follows.

    For every user's friend list, samples at most 50 friends, averages
    their tweet counts, scales by the full friend-list size, and writes
    one estimate per line to number_of_friends_tweets.txt.  API errors
    are appended to error.txt instead of aborting the crawl.

    :param api: tweepy API object used to fetch twitter info
    :param user_list: top-100 user list (unused here; friend lists come
        from the precomputed FriendList.friend_list -- TODO confirm)
    '''
    friend_list = FriendList.friend_list
    # with-blocks guarantee both files are closed even if the crawl dies.
    with open('number_of_friends_tweets.txt', 'w+') as fw, \
            open('error.txt', 'a') as fw_error:
        for friend in friend_list:
            sum_of_tweets = 0
            # Sample to keep crawl time bounded when a user has many friends.
            # BUG FIX: random_list was previously undefined on the first
            # iteration (NameError) or stale on later ones if sampling failed.
            random_list = []
            try:
                random_list = random.sample(friend, min(50, len(friend)))
            except Exception as e:
                fw_error.write(str(e))
            num_of_data = 0
            for user_id in random_list:
                try:
                    sum_of_tweets += api.get_user(id=user_id)._json['statuses_count']
                    num_of_data += 1
                except Exception as e:
                    fw_error.write(str(e))
                # Stay under the twitter API rate limit.
                time.sleep(7)
            if num_of_data > 0:
                # Floor division keeps the original Python 2 truncating
                # integer division semantics under Python 3 as well.
                fw.write(str(sum_of_tweets // num_of_data * len(friend)) + '\n')
            else:
                fw.write('0\n')
def get_relationship(api, user_list):
'''
获取这些用户之间的关注关系如果这个user被第i个人关注它对应的文件里第i行为1
@ -138,10 +104,9 @@ def main():
user_list = []
for name in fr.readlines():
user_list.append(name)
# get_tweet(api, user_list)
get_tweet(api, user_list)
# get_tweets_count(api, user_list)
# get_friends(api, user_list)
get_friends_tweets_count(api, user_list)
# get_relationship(api, user_list)