diff --git a/src/preprocessing/generate_content_graph.ipynb b/src/preprocessing/generate_content_graph.ipynb new file mode 100644 index 0000000..b88ee12 --- /dev/null +++ b/src/preprocessing/generate_content_graph.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import networkx as nx" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../../data/cora\n" + ] + } + ], + "source": [ + "dataset = 'citeseer'\n", + "path = '../../data/'+dataset\n", + "\n", + "print path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### not possible to read file directly using pandas\n", + "#### so first read then split " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2708, 1433)\n" + ] + } + ], + "source": [ + "if dataset in ['MSA','Wiki']:\n", + " f=open(path+'/content.csv','r')\n", + " data = f.read()\n", + " f.close()\n", + "\n", + " data = data.split('\\n')\n", + "\n", + " if len(data[-1])==0:\n", + " data.pop()\n", + "\n", + " print len(data)\n", + "\n", + " tmp = []\n", + " for i in np.arange(len(data)):\n", + " tmp.append(data[i].split(' '))\n", + "\n", + " cont = np.array(tmp)\n", + " cont = cont.astype('float')\n", + " print cont.shape\n", + " del data\n", + " del tmp\n", + "else:\n", + " cont = pd.read_csv(path+'/content.csv',header=None)\n", + " print cont.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2708, 2708)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#cosine similarity\n", + "cw = np.matmul(cont,cont.T)\n", + "if dataset=='MSA':\n", + " cw[13988][:] = np.ones(cw.shape[1])/cw.shape[1]\n", + "\n", + "norm = np.linalg.norm(cont,axis=1)\n", + "if dataset=='MSA':\n", + " norm[13988] = 1\n", + "norm = np.reshape(norm,(len(norm),1))\n", + "norm_mat = np.matmul(norm, norm.T)\n", + "\n", + "cw = cw/norm_mat\n", + "cw.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#making the diagonal entries as 0\n", + "n = cont.shape[0]\n", + "ind = np.diag_indices(n)\n", + "cw[ind]=0\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "tmp = (np.sum(cw,axis=1)==0)\n", + "\n", + "#if all are zero in a row then make all outgoing edges same\n", + "for i in np.arange(cw.shape[0]):\n", + " if tmp[i]==True:\n", + " print i\n", + " cw[i][:] = np.ones(cw.shape[1])/cw.shape[1]\n", + " \n", + "# 13988 for MSA" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "count=0" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "theta = 1\n", + "edge_dict = {'cora':4*theta,'citeseer':4*theta,'pubmed':6*theta} #,'MSA':50,'Wiki':45#}\n", + "num_edges = edge_dict[dataset]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "../../data/cora\n", + "4\n" + ] + } + ], + "source": [ + "print path\n", + "print num_edges" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "f=open(path+'/cosine_cont.edgelist','w')\n", + "for i in np.arange(cw.shape[0]):\n", + " row = -cw[i,:] # to get in decsending order\n", + " ind = np.argsort(row) #get indices\n", + " if cw[i][ind[0]]!=0:\n", + " count+=1\n", + "# for j in np.arange(int(edge_percent*cw.shape[1])): #get top 40% indices\n", + " for j in np.arange(num_edges): ###only top 100\n", + " \n", + " if cw[i][ind[j]]==0: #bcz. if it is 0 then after this all will be zero only\n", + " break\n", + " f.write(str(i)+' '+str(ind[j])+' '+str(cw[i][ind[j]])+'\\n')\n", + "f.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# R = nx.read_edgelist(path+'/reference.edgelist', nodetype=int, create_using=nx.DiGraph())\n", + "\n", + "# #since unweighted\n", + "# for edge in R.edges():\n", + "# R[edge[0]][edge[1]]['weight'] = 1\n", + " \n", + "# # since undirected\n", + "# R = R.to_undirected()\n", + "\n", + "# R = np.array(nx.to_numpy_matrix(R))\n", + "# R.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# comb = R + cw\n", + "# print comb.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# f=open(path+'/graph_sum.edgelist','w')\n", + "# for i in np.arange(comb.shape[0]):\n", + "# row = -comb[i,:] # to get in decsending order\n", + "# ind = np.argsort(row) #get indices\n", + "# if comb[i][ind[0]]!=0:\n", + "# count+=1\n", + "# # for j in np.arange(int(edge_percent*comb.shape[1])): #get top 40% indices\n", + "# for j in np.arange(num_edges): ###only top 100\n", + " \n", + "# if comb[i][ind[j]]==0: #bcz. if it is 0 then after this all will be zero only\n", + "# break\n", + "# f.write(str(i)+' '+str(ind[j])+' '+str(comb[i][ind[j]])+'\\n')\n", + "# f.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}