diff --git a/.idea/workspace.xml b/.idea/workspace.xml index a65e6d2..cb3ccec 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,7 +2,6 @@ - @@ -82,9 +81,9 @@ true - @@ -374,6 +373,9 @@ - + @@ -524,13 +523,13 @@ - + - - + + @@ -550,9 +549,9 @@ - - + + @@ -868,7 +867,6 @@ - @@ -912,7 +910,7 @@ - + @@ -921,7 +919,6 @@ - @@ -936,7 +933,6 @@ - diff --git a/TwitterRank.py b/TwitterRank.py index 0d0591f..9d87604 100644 --- a/TwitterRank.py +++ b/TwitterRank.py @@ -60,7 +60,7 @@ def get_sim(t, i, j, row_normalized_dt): # 下列三行代码为使用 KL 散度衡量相似度 # pk = [row_normalized_dt[i][t]] # qk = [row_normalized_dt[j][t]] - # sim = (scipy.stats.entropy(pk, qk) + scipy.stats.entropy(qk, pk)) / 2 + # sim = 1 - (scipy.stats.entropy(pk, qk) + scipy.stats.entropy(qk, pk)) / 2 return sim @@ -333,7 +333,7 @@ def using_lda_model_test_other_data(topics=5, n_iter=100, num_of_train_data=10, print user[i], user[list(doc).index(max(doc))] -def twitter_rank(topics=5, n_iter=100, samples=10, gamma=0.2, tolerance=1e-16): +def twitter_rank(topics=5, n_iter=100, samples=30, gamma=0.2, tolerance=1e-16): """ 对文档做twitter rank :param topics: 主题数