Reconstruct code and add comments

lufo 2015-05-05 21:44:22 +08:00
parent 339ef7da9d
commit 0e1a417bb7
6 changed files with 6752 additions and 110 deletions

.idea/LDA.iml

@@ -6,5 +6,6 @@
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="module" module-name="Test" />
     <orderEntry type="module" module-name="zhihu" />
+    <orderEntry type="module" module-name="SpiderForStackoverflow" />
   </component>
 </module>

.idea/modules.xml

@@ -5,6 +5,7 @@
       <module fileurl="file://$PROJECT_DIR$/.idea/LDA.iml" filepath="$PROJECT_DIR$/.idea/LDA.iml" />
       <module fileurl="file://$USER_HOME$/PycharmProjects/QR/.idea/QR.iml" filepath="$USER_HOME$/PycharmProjects/QR/.idea/QR.iml" />
       <module fileurl="file://$USER_HOME$/PycharmProjects/SR/.idea/SR.iml" filepath="$USER_HOME$/PycharmProjects/SR/.idea/SR.iml" />
+      <module fileurl="file://$USER_HOME$/PycharmProjects/SpiderForStackoverflow/.idea/SpiderForStackoverflow.iml" filepath="$USER_HOME$/PycharmProjects/SpiderForStackoverflow/.idea/SpiderForStackoverflow.iml" />
       <module fileurl="file://$USER_HOME$/PycharmProjects/Test/.idea/Test.iml" filepath="$USER_HOME$/PycharmProjects/Test/.idea/Test.iml" />
       <module fileurl="file://$USER_HOME$/PycharmProjects/zhihu/.idea/zhihu.iml" filepath="$USER_HOME$/PycharmProjects/zhihu/.idea/zhihu.iml" />
     </modules>

.idea/workspace.xml

@@ -2,7 +2,6 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="3ee525ec-9108-4091-a4ed-b41bdb4611f4" name="Default" comment="">
-      <change type="DELETED" beforePath="$PROJECT_DIR$/result.txt" afterPath="" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/LDA.iml" afterPath="$PROJECT_DIR$/.idea/LDA.iml" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/TwitterRank.py" afterPath="$PROJECT_DIR$/TwitterRank.py" />
       <change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/modules.xml" afterPath="$PROJECT_DIR$/.idea/modules.xml" />
@@ -23,7 +22,7 @@
     <SUITE FILE_PATH="coverage/TwitterRank$SR.coverage" NAME="SR Coverage Results" MODIFIED="1430381652286" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$USER_HOME$/PycharmProjects/SR" />
     <SUITE FILE_PATH="coverage/TwitterRank$test.coverage" NAME="test Coverage Results" MODIFIED="1429942231578" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$USER_HOME$/PycharmProjects/Test" />
     <SUITE FILE_PATH="coverage/TwitterRank$test__1_.coverage" NAME="test (1) Coverage Results" MODIFIED="1429753237224" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$USER_HOME$/PycharmProjects/zhihu" />
-    <SUITE FILE_PATH="coverage/TwitterRank$TwitterRank.coverage" NAME="TwitterRank Coverage Results" MODIFIED="1430810766618" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
+    <SUITE FILE_PATH="coverage/TwitterRank$TwitterRank.coverage" NAME="TwitterRank Coverage Results" MODIFIED="1430833413036" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$" />
   </component>
   <component name="CreatePatchCommitExecutor">
     <option name="PATCH_PATH" value="" />
@@ -43,43 +42,23 @@
   </component>
   <component name="FileEditorManager">
     <leaf>
-      <file leaf-file-name="SR.py" pinned="false" current-in-tab="false">
-        <entry file="file://$USER_HOME$/PycharmProjects/SR/SR.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state vertical-scroll-proportion="0.0" vertical-offset="615" max-vertical-offset="5786">
-              <caret line="251" column="35" selection-start-line="251" selection-start-column="35" selection-end-line="251" selection-end-column="35" />
-              <folding>
-                <element signature="e#45#58#0" expanded="true" />
-              </folding>
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="DTW.py" pinned="false" current-in-tab="false">
-        <entry file="file://$USER_HOME$/PycharmProjects/SR/DTW.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state vertical-scroll-proportion="0.0" vertical-offset="132" max-vertical-offset="4202">
-              <caret line="7" column="8" selection-start-line="7" selection-start-column="8" selection-end-line="7" selection-end-column="8" />
-              <folding />
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="levenshtein_distance.py" pinned="false" current-in-tab="false">
-        <entry file="file://$USER_HOME$/PycharmProjects/SR/levenshtein_distance.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state vertical-scroll-proportion="0.0" vertical-offset="3234" max-vertical-offset="4026">
-              <caret line="159" column="1" selection-start-line="159" selection-start-column="1" selection-end-line="159" selection-end-column="1" />
-              <folding />
-            </state>
-          </provider>
-        </entry>
-      </file>
-      <file leaf-file-name="TwitterRank.py" pinned="false" current-in-tab="true">
-        <entry file="file://$PROJECT_DIR$/TwitterRank.py">
-          <provider selected="true" editor-type-id="text-editor">
-            <state vertical-scroll-proportion="0.41411763" vertical-offset="6688" max-vertical-offset="7128">
-              <caret line="315" column="4" selection-start-line="315" selection-start-column="4" selection-end-line="315" selection-end-column="4" />
+      <file leaf-file-name="TwitterRank.py" pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/TwitterRank.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state vertical-scroll-proportion="0.8053097" vertical-offset="7515" max-vertical-offset="8030">
+              <caret line="354" column="6" selection-start-line="354" selection-start-column="6" selection-end-line="354" selection-end-column="6" />
+              <folding>
+                <element signature="e#87#97#0" expanded="true" />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file leaf-file-name="StopWords.py" pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/StopWords.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state vertical-scroll-proportion="0.0" vertical-offset="1606" max-vertical-offset="2574">
+              <caret line="5" column="0" selection-start-line="5" selection-start-column="0" selection-end-line="5" selection-end-column="0" />
               <folding />
             </state>
           </provider>
@@ -565,9 +544,9 @@
     <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
     <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.253367" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
     <window_info id="Application Servers" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
-    <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.24384028" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
+    <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.24328859" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
     <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.329927" sideWeight="0.5" order="8" side_tool="false" content_ui="tabs" />
-    <window_info id="Run" active="true" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" weight="0.34061137" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
+    <window_info id="Run" active="true" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" weight="0.42481753" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
     <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
     <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
     <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
@@ -704,13 +683,6 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$PROJECT_DIR$/StopWords.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.0" vertical-offset="0" max-vertical-offset="2691">
-          <caret line="6" column="15" selection-start-line="6" selection-start-column="15" selection-end-line="6" selection-end-column="15" />
-        </state>
-      </provider>
-    </entry>
     <entry file="file://$PROJECT_DIR$/GetTopTwitters.py">
       <provider selected="true" editor-type-id="text-editor">
         <state vertical-scroll-proportion="0.3882353" vertical-offset="0" max-vertical-offset="680">
@@ -922,20 +894,6 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$USER_HOME$/PycharmProjects/SR/generate_lexical_tree.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.18181819" vertical-offset="1094" max-vertical-offset="2288">
-          <caret line="53" column="56" selection-start-line="53" selection-start-column="56" selection-end-line="53" selection-end-column="56" />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$USER_HOME$/PycharmProjects/SR/spell_checker.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.42676768" vertical-offset="689" max-vertical-offset="1188">
-          <caret line="40" column="24" selection-start-line="40" selection-start-column="24" selection-end-line="40" selection-end-column="24" />
-        </state>
-      </provider>
-    </entry>
     <entry file="file://$USER_HOME$/PycharmProjects/SR/segment.py">
       <provider selected="true" editor-type-id="text-editor">
         <state vertical-scroll-proportion="0.5" vertical-offset="352" max-vertical-offset="792">
@@ -943,32 +901,6 @@
         </state>
       </provider>
     </entry>
-    <entry file="file://$USER_HOME$/PycharmProjects/SR/SR.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.0" vertical-offset="615" max-vertical-offset="5786">
-          <caret line="251" column="35" selection-start-line="251" selection-start-column="35" selection-end-line="251" selection-end-column="35" />
-          <folding>
-            <element signature="e#45#58#0" expanded="true" />
-          </folding>
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$USER_HOME$/PycharmProjects/SR/DTW.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.0" vertical-offset="0" max-vertical-offset="4202">
-          <caret line="7" column="8" selection-start-line="7" selection-start-column="8" selection-end-line="7" selection-end-column="8" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
-    <entry file="file://$USER_HOME$/PycharmProjects/SR/levenshtein_distance.py">
-      <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.0" vertical-offset="3234" max-vertical-offset="4026">
-          <caret line="159" column="1" selection-start-line="159" selection-start-column="1" selection-end-line="159" selection-end-column="1" />
-          <folding />
-        </state>
-      </provider>
-    </entry>
     <entry file="file:///System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/shape_base.py">
       <provider selected="true" editor-type-id="text-editor">
         <state vertical-scroll-proportion="0.0" vertical-offset="1824" max-vertical-offset="6226">
@@ -985,11 +917,63 @@
         </state>
       </provider>
     </entry>
+    <entry file="file://$USER_HOME$/PycharmProjects/SR/levenshtein_distance.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state vertical-scroll-proportion="0.6270784" vertical-offset="3234" max-vertical-offset="4026">
+          <caret line="159" column="1" selection-start-line="159" selection-start-column="1" selection-end-line="159" selection-end-column="1" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$USER_HOME$/PycharmProjects/SR/DTW.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state vertical-scroll-proportion="0.0" vertical-offset="0" max-vertical-offset="4202">
+          <caret line="7" column="8" selection-start-line="7" selection-start-column="8" selection-end-line="7" selection-end-column="8" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$USER_HOME$/PycharmProjects/SR/spell_checker.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state vertical-scroll-proportion="0.0" vertical-offset="767" max-vertical-offset="1188">
+          <caret line="40" column="24" selection-start-line="40" selection-start-column="24" selection-end-line="40" selection-end-column="24" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$USER_HOME$/PycharmProjects/SR/SR.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state vertical-scroll-proportion="-4.851852" vertical-offset="5127" max-vertical-offset="5786">
+          <caret line="239" column="54" selection-start-line="239" selection-start-column="31" selection-end-line="239" selection-end-column="54" />
+          <folding>
+            <element signature="e#45#58#0" expanded="false" />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$USER_HOME$/PycharmProjects/SR/generate_lexical_tree.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state vertical-scroll-proportion="0.5225653" vertical-offset="132" max-vertical-offset="2288">
+          <caret line="16" column="4" selection-start-line="16" selection-start-column="4" selection-end-line="17" selection-end-column="46" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
+    <entry file="file://$PROJECT_DIR$/StopWords.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state vertical-scroll-proportion="0.0" vertical-offset="0" max-vertical-offset="2574">
+          <caret line="5" column="0" selection-start-line="5" selection-start-column="0" selection-end-line="5" selection-end-column="0" />
+          <folding />
+        </state>
+      </provider>
+    </entry>
     <entry file="file://$PROJECT_DIR$/TwitterRank.py">
       <provider selected="true" editor-type-id="text-editor">
-        <state vertical-scroll-proportion="0.41411763" vertical-offset="6688" max-vertical-offset="7128">
-          <caret line="315" column="4" selection-start-line="315" selection-start-column="4" selection-end-line="315" selection-end-column="4" />
-          <folding />
+        <state vertical-scroll-proportion="0.8053097" vertical-offset="7515" max-vertical-offset="8030">
+          <caret line="354" column="6" selection-start-line="354" selection-start-column="6" selection-end-line="354" selection-end-column="6" />
+          <folding>
+            <element signature="e#87#97#0" expanded="true" />
+          </folding>
         </state>
       </provider>
     </entry>

StopWords.pyc (binary file)

Binary file not shown.

TwitterRank.py

@@ -6,6 +6,7 @@ import lda
 import numpy as np
 import re
 import StopWords
+import scipy.stats


 stop_word_list = StopWords.stop_word_list
@@ -20,16 +21,15 @@ def text_parse(big_string):
     return [tok.lower() for tok in list_of_tokens if len(tok) > 2]


-def create_vocab_list(data_set):
+def create_vocab_list():
     """
-    Collect every word that appears in a set of documents
-    :param data_set: list; each element is itself a list of words making up one document
-    :return: list of all words that appear in these documents, one word per element
+    Get the vocabulary list
+    :return: list; each element is one word
     """
-    vocab_set = set([])
-    for document in data_set:
-        vocab_set = vocab_set | set(document)
-    return list(vocab_set)
+    vocab_list = []
+    with open('dict.txt') as dict:
+        vocab_list = [word.lower().strip() for word in dict if (word.lower().strip() + ' ' not in stop_word_list)]
+    return vocab_list


 def normalize(mat):
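The rewritten create_vocab_list loads the vocabulary from dict.txt (added in this commit) instead of collecting words from the documents. A minimal standalone sketch of the same behaviour, using the hypothetical name load_vocab and avoiding the shadowing of the built-in name dict; the "+ ' '" membership test is kept exactly as committed, so it works whether stop_word_list is a list of "word " entries or a single space-separated string:

# Illustrative sketch only; load_vocab is a hypothetical name, not repository code.
def load_vocab(path, stop_word_list):
    # dict.txt is assumed to hold one word per line; the "+ ' '" test mirrors
    # the membership check used by the commit's create_vocab_list.
    with open(path) as f:
        return [w.lower().strip() for w in f
                if (w.lower().strip() + ' ') not in stop_word_list]

# e.g. load_vocab('dict.txt', StopWords.stop_word_list)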
@@ -56,13 +56,16 @@ def get_sim(t, i, j, row_normalized_dt):
     '''
     Get sim(i, j)
     '''
-    sim = 1.0 - abs(row_normalized_dt[i][t] - row_normalized_dt[j][t])
+    # sim = 1.0 - abs(row_normalized_dt[i][t] - row_normalized_dt[j][t])
+    pk = [row_normalized_dt[i][t]]
+    qk = [row_normalized_dt[j][t]]
+    sim = (scipy.stats.entropy(pk, qk) + scipy.stats.entropy(qk, pk)) / 2
     return sim


 def get_Pt(t, samples, tweets_list, friends_tweets_list, row_normalized_dt, relationship):
     '''
-    Get Pt; Pt(i, j) is the probability that i, who follows j, is influenced by j under topic t
+    Get Pt; Pt[i][j] is the probability that i, who follows j, is influenced by j under topic t
     '''
     Pt = []
     for i in xrange(samples):
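get_sim now replaces the original 1 - |dt'_it - dt'_jt| similarity with a symmetrized KL divergence computed by scipy.stats.entropy. Note that scipy.stats.entropy normalizes its inputs to sum to 1, so two single-element lists both normalize to [1.0] and the symmetrized divergence evaluates to 0; the sketch below (illustrative vectors only, not data from the repository) shows the same symmetrized form on full topic distributions, where it is non-degenerate:

# Symmetrized KL divergence between two topic distributions.
# scipy.stats.entropy(p, q) = KL(p || q), normalizing p and q internally.
import scipy.stats

def symmetric_kl(p, q):
    return (scipy.stats.entropy(p, q) + scipy.stats.entropy(q, p)) / 2

p = [0.7, 0.2, 0.1]                # illustrative topic distribution of user i
q = [0.6, 0.3, 0.1]                # illustrative topic distribution of user j
print symmetric_kl(p, q)           # small positive value; 0 only when p == q
print symmetric_kl([0.7], [0.6])   # 0.0: single elements both normalize to [1.0]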
@@ -80,16 +83,28 @@ def get_Pt(t, samples, tweets_list, friends_tweets_list, row_normalized_dt, rela
     return Pt


-def get_TRt(gamma, Pt, Et):
+def get_TRt(gamma, Pt, Et, iter=1000, tolerance=1e-16):
     '''
     Get TRt, the influence matrix of every user under topic t
+    :param gamma: tuning parameter in the formula for computing TRt
+    :param Pt: Pt matrix; Pt[i][j] is the probability that i, who follows j, is influenced by j under topic t
+    :param Et: Et matrix; Et[i] is user i's attention to topic t, already normalized so that all elements sum to 1
+    :param iter: maximum number of iterations
+    :param tolerance: stop iterating once the Euclidean distance between TRt before and after an iteration is less than tolerance
+    :return: TRt; TRt[i] is the influence of user i under topic t
     '''
     TRt = np.mat(Et).transpose()
-    iter = 0
+    old_TRt = TRt
+    i = 0
     # np.linalg.norm(old_TRt,new_TRt)
-    while iter < 100:
+    while i < iter:
         TRt = gamma * (np.dot(np.mat(Pt), TRt)) + (1 - gamma) * np.mat(Et).transpose()
-        iter += 1
+        euclidean_dis = np.linalg.norm(TRt - old_TRt)
+        # print 'dis', dis
+        if euclidean_dis < tolerance:
+            break
+        old_TRt = TRt
+        i += 1
     return TRt
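get_TRt now runs the damped fixed-point iteration TRt <- gamma * Pt * TRt + (1 - gamma) * Et until successive iterates are within tolerance in Euclidean norm, or iter passes are reached. A compact, self-contained sketch of that iteration (iterate_influence is an illustrative name, not repository code):

import numpy as np

def iterate_influence(gamma, Pt, Et, max_iter=1000, tolerance=1e-16):
    # Damped fixed-point iteration: TRt <- gamma * Pt * TRt + (1 - gamma) * Et
    Pt = np.mat(Pt)
    Et = np.mat(Et).transpose()
    TRt = Et
    for _ in xrange(max_iter):
        new_TRt = gamma * (Pt * TRt) + (1 - gamma) * Et
        if np.linalg.norm(new_TRt - TRt) < tolerance:
            return new_TRt
        TRt = new_TRt
    return TRt

# e.g. iterate_influence(0.2, [[0.0, 1.0], [1.0, 0.0]], [0.5, 0.5])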
@@ -112,14 +127,19 @@ def get_feature_matrix(doc_list, vocab_list):
     """
     Get the feature matrix of the documents, with each word as a feature
     :param doc_list: list; each element is one document
-    :param vocab_list: list of all words that appear in these documents, one word per element
+    :param vocab_list: list, the vocabulary; each element is one word
     :return: list with i rows and j columns, where i is the number of samples and j the number of features; feature_matrix[i][j] is the number of times feature j appears in sample i
     """
     feature_matrix = []
+    # word_index is a dict whose keys are words and whose values are their indices in vocab_list
+    word_index = {}
+    for i in xrange(len(vocab_list)):
+        word_index[vocab_list[i]] = i
     for doc in doc_list:
-        temp = []
-        for vocab in vocab_list:
-            temp.append(doc.count(vocab))
+        temp = [0 for i in xrange(len(vocab_list))]
+        for word in doc:
+            if word in word_index:
+                temp[word_index[word]] += 1
         feature_matrix.append(temp)
     return feature_matrix
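The new get_feature_matrix builds a word-to-index dict once and counts each document in a single pass, instead of calling doc.count(vocab) for every vocabulary word, so each document costs O(len(doc)) lookups rather than O(len(vocab) * len(doc)). A tiny check on made-up data that the indexed counting matches the old behaviour for in-vocabulary words:

# Hypothetical data; both approaches give the same counts for words in the vocabulary.
vocab_list = ['lda', 'rank', 'topic']
doc = ['topic', 'lda', 'topic', 'twitter']   # 'twitter' is out of vocabulary
word_index = dict((w, i) for i, w in enumerate(vocab_list))
row = [0] * len(vocab_list)
for word in doc:
    if word in word_index:
        row[word_index[word]] += 1
assert row == [doc.count(w) for w in vocab_list]   # [1, 0, 2]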
@@ -180,7 +200,8 @@ def get_user_list():
     return user


-def get_TR(topics, samples, tweets_list, friends_tweets_list, row_normalized_dt, col_normalized_dt, relationship):
+def get_TR(topics, samples, tweets_list, friends_tweets_list, row_normalized_dt, col_normalized_dt, relationship,
+           gamma=0.2, tolerance=1e-16):
     """
     Get the TR matrix, the influence of every user under every topic
     :param topics: number of topics
@@ -190,13 +211,15 @@ def get_TR(topics, samples, tweets_list, friends_tweets_list, row_normalized_dt,
     :param row_normalized_dt: row-normalized dt matrix
     :param col_normalized_dt: column-normalized dt matrix
     :param relationship: matrix with i rows and j columns; relationship[i][j] = 1 means that j follows i
+    :param gamma: tuning parameter in the formula for computing TRt
+    :param tolerance: stop iterating once the Euclidean distance between TRt before and after an iteration is less than tolerance
     :return: list; TR[i][j] is the influence of user j under topic i
     """
     TR = []
     for i in xrange(topics):
         Pt = get_Pt(i, samples, tweets_list, friends_tweets_list, row_normalized_dt, relationship)
         Et = col_normalized_dt[i]
-        TR.append(np.array(get_TRt(0.5, Pt, Et)).reshape(-1, ).tolist())
+        TR.append(np.array(get_TRt(gamma, Pt, Et, tolerance)).reshape(-1, ).tolist())
     return TR
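One detail worth flagging in this hunk: get_TRt is now declared as get_TRt(gamma, Pt, Et, iter=1000, tolerance=1e-16), so the positional call get_TRt(gamma, Pt, Et, tolerance) above binds the tolerance value to iter. The throwaway demo below (not repository code) illustrates the binding; passing tolerance=tolerance keeps each value in its intended slot:

def demo(gamma, Pt, Et, iter=1000, tolerance=1e-16):
    return iter, tolerance

print demo(0.2, None, None, 1e-16)              # (1e-16, 1e-16): 4th positional arg fills iter
print demo(0.2, None, None, tolerance=1e-16)    # (1000, 1e-16): keyword keeps the slots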
@@ -226,7 +249,7 @@ def get_lda_model(samples, topics, n_iter):
     vocab_list: list of all words that appear in these documents, one word per element
     """
     doc_list = get_doc_list(samples)
-    vocab_list = create_vocab_list(doc_list)
+    vocab_list = create_vocab_list()
     feature_matrix = get_feature_matrix(doc_list, vocab_list)
     model = lda.LDA(n_topics=topics, n_iter=n_iter)
     model.fit(np.array(feature_matrix))
@@ -246,12 +269,14 @@ def print_topics(model, vocab_list, n_top_words=5):
         print('Topic {}: {}'.format(i + 1, ' '.join(topic_words)))


-def get_TR_using_DT(dt, samples, topics=5):
+def get_TR_using_DT(dt, samples, topics=5, gamma=0.2, tolerance=1e-16):
     """
     Get the TR matrix from a known DT matrix
     :param dt: DT matrix, the topic distribution of the documents; dt[i][j] is the proportion of document i belonging to topic j
     :param samples: number of documents
     :param topics: number of topics
+    :param gamma: tuning parameter in the formula for computing TRt
+    :param tolerance: stop iterating once the Euclidean distance between TRt before and after an iteration is less than tolerance
     :return TR: list; TR[i][j] is the influence of user j under topic i
     :return TR_sum: list with i elements; TR_sum[i] is the sum of user i's influence over all topics
     """
@@ -263,7 +288,8 @@ def get_TR_using_DT(dt, samples, topics=5):
     relationship = get_relationship(samples)
     friends_tweets_list = get_friends_tweets_list(samples, relationship, tweets_list)
     user = get_user_list()
-    TR = get_TR(topics, samples, tweets_list, friends_tweets_list, row_normalized_dt, col_normalized_dt, relationship)
+    TR = get_TR(topics, samples, tweets_list, friends_tweets_list, row_normalized_dt, col_normalized_dt, relationship,
+                gamma, tolerance)
     for i in xrange(topics):
         print TR[i]
         print user[TR[i].index(max(TR[i]))]
@@ -281,18 +307,21 @@ def get_doc_topic_distribution_using_lda_model(model, feature_matrix):
     return model.transform(np.array(feature_matrix), max_iter=100, tol=0)


-def using_lda_model_test_other_data(topics=3, n_iter=100, num_of_train_data=50, num_of_test_data=20):
+def using_lda_model_test_other_data(topics=5, n_iter=100, num_of_train_data=10, num_of_test_data=5, gamma=0.2,
+                                    tolerance=1e-16):
     """
     Train an LDA model, use it to infer the topics of new documents, then find the most influential user for the topic each document belongs to
     :param topics: number of LDA topics
     :param n_iter: number of LDA training iterations
     :param num_of_train_data: number of documents in the training set
     :param num_of_test_data: number of documents in the test set
+    :param gamma: tuning parameter in the formula for computing TRt
+    :param tolerance: stop iterating once the Euclidean distance between TRt before and after an iteration is less than tolerance
     """
     model, vocab_list = get_lda_model(samples=num_of_train_data, topics=topics, n_iter=n_iter)
     dt = model.doc_topic_
     print_topics(model, vocab_list, n_top_words=5)
-    TR, TR_sum = get_TR_using_DT(dt, samples=num_of_train_data, topics=topics)
+    TR, TR_sum = get_TR_using_DT(dt, samples=num_of_train_data, topics=topics, gamma=gamma, tolerance=tolerance)
     doc_list = get_doc_list(samples=num_of_test_data)
     feature_matrix = get_feature_matrix(doc_list, vocab_list)
     dt = get_doc_topic_distribution_using_lda_model(model, feature_matrix)
@@ -303,13 +332,22 @@ def using_lda_model_test_other_data(topics=3, n_iter=100, num_of_train_data=50,
             print user[i], user[list(doc).index(max(doc))]


-def twitter_rank(topics=5, n_iter=100, samples=30):
+def twitter_rank(topics=5, n_iter=100, samples=10, gamma=0.2, tolerance=1e-16):
+    """
+    Run TwitterRank on the documents
+    :param topics: number of topics
+    :param n_iter: number of iterations
+    :param samples: number of documents
+    :param gamma: tuning parameter in the formula for computing TRt
+    :param tolerance: stop iterating once the Euclidean distance between TRt before and after an iteration is less than tolerance
+    :return:
+    """
     model, vocab_list = get_lda_model(samples, topics, n_iter)
     # topic_word is an array with i rows and j columns, where i is the number of topics and j the number of features; topic_word[i][j] is the proportion of feature j in topic i
     print_topics(model, vocab_list, n_top_words=5)
     # the dt matrix is the topic distribution of the documents; dt[i][j] is the proportion of document i belonging to topic j
     dt = np.mat(model.doc_topic_)
-    TR, TR_sum = get_TR_using_DT(dt, samples, topics)
+    TR, TR_sum = get_TR_using_DT(dt, samples, topics, gamma, tolerance)


 def main():
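main() appears only as trailing context here, so the following is a purely hypothetical driver showing how the reworked entry points could be invoked with the new gamma and tolerance parameters:

# Hypothetical usage; the real body of main() is not shown in this diff.
def main():
    twitter_rank(topics=5, n_iter=100, samples=10, gamma=0.2, tolerance=1e-16)
    # or: using_lda_model_test_other_data(topics=5, n_iter=100,
    #                                     num_of_train_data=10, num_of_test_data=5)

if __name__ == '__main__':
    main()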

dict.txt (executable file, 6618 additions)

File diff suppressed because it is too large.