code to generate content graph implemented

This commit is contained in:
Anirban Biswas 2019-03-18 18:31:47 +05:30
parent 025c72c8dd
commit 3338d044c0

View File

@ -0,0 +1,280 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import networkx as nx"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"../../data/cora\n"
]
}
],
"source": [
"dataset = 'citeseer'\n",
"path = '../../data/'+dataset\n",
"\n",
"print path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### not possible to read file directly using pandas\n",
"#### so first read then split "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(2708, 1433)\n"
]
}
],
"source": [
"if dataset in ['MSA','Wiki']:\n",
" f=open(path+'/content.csv','r')\n",
" data = f.read()\n",
" f.close()\n",
"\n",
" data = data.split('\\n')\n",
"\n",
" if len(data[-1])==0:\n",
" data.pop()\n",
"\n",
" print len(data)\n",
"\n",
" tmp = []\n",
" for i in np.arange(len(data)):\n",
" tmp.append(data[i].split(' '))\n",
"\n",
" cont = np.array(tmp)\n",
" cont = cont.astype('float')\n",
" print cont.shape\n",
" del data\n",
" del tmp\n",
"else:\n",
" cont = pd.read_csv(path+'/content.csv',header=None)\n",
" print cont.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2708, 2708)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#cosine similarity\n",
"cw = np.matmul(cont,cont.T)\n",
"if dataset=='MSA':\n",
" cw[13988][:] = np.ones(cw.shape[1])/cw.shape[1]\n",
"\n",
"norm = np.linalg.norm(cont,axis=1)\n",
"if dataset=='MSA':\n",
" norm[13988] = 1\n",
"norm = np.reshape(norm,(len(norm),1))\n",
"norm_mat = np.matmul(norm, norm.T)\n",
"\n",
"cw = cw/norm_mat\n",
"cw.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"#making the diagonal entries as 0\n",
"n = cont.shape[0]\n",
"ind = np.diag_indices(n)\n",
"cw[ind]=0\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"tmp = (np.sum(cw,axis=1)==0)\n",
"\n",
"#if all are zero in a row then make all outgoing edges same\n",
"for i in np.arange(cw.shape[0]):\n",
" if tmp[i]==True:\n",
" print i\n",
" cw[i][:] = np.ones(cw.shape[1])/cw.shape[1]\n",
" \n",
"# 13988 for MSA"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"count=0"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"theta = 1\n",
"edge_dict = {'cora':4*theta,'citeseer':4*theta,'pubmed':6*theta} #,'MSA':50,'Wiki':45#}\n",
"num_edges = edge_dict[dataset]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"../../data/cora\n",
"4\n"
]
}
],
"source": [
"print path\n",
"print num_edges"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"f=open(path+'/cosine_cont.edgelist','w')\n",
"for i in np.arange(cw.shape[0]):\n",
" row = -cw[i,:] # to get in decsending order\n",
" ind = np.argsort(row) #get indices\n",
" if cw[i][ind[0]]!=0:\n",
" count+=1\n",
"# for j in np.arange(int(edge_percent*cw.shape[1])): #get top 40% indices\n",
" for j in np.arange(num_edges): ###only top 100\n",
" \n",
" if cw[i][ind[j]]==0: #bcz. if it is 0 then after this all will be zero only\n",
" break\n",
" f.write(str(i)+' '+str(ind[j])+' '+str(cw[i][ind[j]])+'\\n')\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# R = nx.read_edgelist(path+'/reference.edgelist', nodetype=int, create_using=nx.DiGraph())\n",
"\n",
"# #since unweighted\n",
"# for edge in R.edges():\n",
"# R[edge[0]][edge[1]]['weight'] = 1\n",
" \n",
"# # since undirected\n",
"# R = R.to_undirected()\n",
"\n",
"# R = np.array(nx.to_numpy_matrix(R))\n",
"# R.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# comb = R + cw\n",
"# print comb.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# f=open(path+'/graph_sum.edgelist','w')\n",
"# for i in np.arange(comb.shape[0]):\n",
"# row = -comb[i,:] # to get in decsending order\n",
"# ind = np.argsort(row) #get indices\n",
"# if comb[i][ind[0]]!=0:\n",
"# count+=1\n",
"# # for j in np.arange(int(edge_percent*comb.shape[1])): #get top 40% indices\n",
"# for j in np.arange(num_edges): ###only top 100\n",
" \n",
"# if comb[i][ind[j]]==0: #bcz. if it is 0 then after this all will be zero only\n",
"# break\n",
"# f.write(str(i)+' '+str(ind[j])+' '+str(comb[i][ind[j]])+'\\n')\n",
"# f.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}