code to generate content graph implemented
This commit is contained in:
parent
025c72c8dd
commit
3338d044c0
280
src/preprocessing/generate_content_graph.ipynb
Normal file
280
src/preprocessing/generate_content_graph.ipynb
Normal file
@ -0,0 +1,280 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import os\n",
|
||||
"import networkx as nx"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"../../data/cora\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset = 'citeseer'\n",
|
||||
"path = '../../data/'+dataset\n",
|
||||
"\n",
|
||||
"print path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### not possible to read file directly using pandas\n",
|
||||
"#### so first read then split "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(2708, 1433)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"if dataset in ['MSA','Wiki']:\n",
|
||||
" f=open(path+'/content.csv','r')\n",
|
||||
" data = f.read()\n",
|
||||
" f.close()\n",
|
||||
"\n",
|
||||
" data = data.split('\\n')\n",
|
||||
"\n",
|
||||
" if len(data[-1])==0:\n",
|
||||
" data.pop()\n",
|
||||
"\n",
|
||||
" print len(data)\n",
|
||||
"\n",
|
||||
" tmp = []\n",
|
||||
" for i in np.arange(len(data)):\n",
|
||||
" tmp.append(data[i].split(' '))\n",
|
||||
"\n",
|
||||
" cont = np.array(tmp)\n",
|
||||
" cont = cont.astype('float')\n",
|
||||
" print cont.shape\n",
|
||||
" del data\n",
|
||||
" del tmp\n",
|
||||
"else:\n",
|
||||
" cont = pd.read_csv(path+'/content.csv',header=None)\n",
|
||||
" print cont.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(2708, 2708)"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#cosine similarity\n",
|
||||
"cw = np.matmul(cont,cont.T)\n",
|
||||
"if dataset=='MSA':\n",
|
||||
" cw[13988][:] = np.ones(cw.shape[1])/cw.shape[1]\n",
|
||||
"\n",
|
||||
"norm = np.linalg.norm(cont,axis=1)\n",
|
||||
"if dataset=='MSA':\n",
|
||||
" norm[13988] = 1\n",
|
||||
"norm = np.reshape(norm,(len(norm),1))\n",
|
||||
"norm_mat = np.matmul(norm, norm.T)\n",
|
||||
"\n",
|
||||
"cw = cw/norm_mat\n",
|
||||
"cw.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#making the diagonal entries as 0\n",
|
||||
"n = cont.shape[0]\n",
|
||||
"ind = np.diag_indices(n)\n",
|
||||
"cw[ind]=0\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tmp = (np.sum(cw,axis=1)==0)\n",
|
||||
"\n",
|
||||
"#if all are zero in a row then make all outgoing edges same\n",
|
||||
"for i in np.arange(cw.shape[0]):\n",
|
||||
" if tmp[i]==True:\n",
|
||||
" print i\n",
|
||||
" cw[i][:] = np.ones(cw.shape[1])/cw.shape[1]\n",
|
||||
" \n",
|
||||
"# 13988 for MSA"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"count=0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"theta = 1\n",
|
||||
"edge_dict = {'cora':4*theta,'citeseer':4*theta,'pubmed':6*theta} #,'MSA':50,'Wiki':45#}\n",
|
||||
"num_edges = edge_dict[dataset]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"../../data/cora\n",
|
||||
"4\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print path\n",
|
||||
"print num_edges"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"f=open(path+'/cosine_cont.edgelist','w')\n",
|
||||
"for i in np.arange(cw.shape[0]):\n",
|
||||
" row = -cw[i,:] # to get in decsending order\n",
|
||||
" ind = np.argsort(row) #get indices\n",
|
||||
" if cw[i][ind[0]]!=0:\n",
|
||||
" count+=1\n",
|
||||
"# for j in np.arange(int(edge_percent*cw.shape[1])): #get top 40% indices\n",
|
||||
" for j in np.arange(num_edges): ###only top 100\n",
|
||||
" \n",
|
||||
" if cw[i][ind[j]]==0: #bcz. if it is 0 then after this all will be zero only\n",
|
||||
" break\n",
|
||||
" f.write(str(i)+' '+str(ind[j])+' '+str(cw[i][ind[j]])+'\\n')\n",
|
||||
"f.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# R = nx.read_edgelist(path+'/reference.edgelist', nodetype=int, create_using=nx.DiGraph())\n",
|
||||
"\n",
|
||||
"# #since unweighted\n",
|
||||
"# for edge in R.edges():\n",
|
||||
"# R[edge[0]][edge[1]]['weight'] = 1\n",
|
||||
" \n",
|
||||
"# # since undirected\n",
|
||||
"# R = R.to_undirected()\n",
|
||||
"\n",
|
||||
"# R = np.array(nx.to_numpy_matrix(R))\n",
|
||||
"# R.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# comb = R + cw\n",
|
||||
"# print comb.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# f=open(path+'/graph_sum.edgelist','w')\n",
|
||||
"# for i in np.arange(comb.shape[0]):\n",
|
||||
"# row = -comb[i,:] # to get in decsending order\n",
|
||||
"# ind = np.argsort(row) #get indices\n",
|
||||
"# if comb[i][ind[0]]!=0:\n",
|
||||
"# count+=1\n",
|
||||
"# # for j in np.arange(int(edge_percent*comb.shape[1])): #get top 40% indices\n",
|
||||
"# for j in np.arange(num_edges): ###only top 100\n",
|
||||
" \n",
|
||||
"# if comb[i][ind[j]]==0: #bcz. if it is 0 then after this all will be zero only\n",
|
||||
"# break\n",
|
||||
"# f.write(str(i)+' '+str(ind[j])+' '+str(comb[i][ind[j]])+'\\n')\n",
|
||||
"# f.close()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 2",
|
||||
"language": "python",
|
||||
"name": "python2"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in New Issue
Block a user