diff --git a/src/preprocessing/generate_content_graph.ipynb b/src/preprocessing/generate_content_graph.ipynb
new file mode 100644
index 0000000..b88ee12
--- /dev/null
+++ b/src/preprocessing/generate_content_graph.ipynb
@@ -0,0 +1,280 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import networkx as nx"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "../../data/cora\n"
+     ]
+    }
+   ],
+   "source": [
+    "dataset = 'citeseer'\n",
+    "path = '../../data/'+dataset\n",
+    "\n",
+    "print path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### For MSA and Wiki, content.csv cannot be read directly with pandas,\n",
+    "#### so read the raw text first, then split it into lines and space-separated fields"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(2708, 1433)\n"
+     ]
+    }
+   ],
+   "source": [
+    "if dataset in ['MSA','Wiki']:\n",
+    "    f=open(path+'/content.csv','r')\n",
+    "    data = f.read()\n",
+    "    f.close()\n",
+    "\n",
+    "    data = data.split('\\n')\n",
+    "\n",
+    "    if len(data[-1])==0:\n",
+    "        data.pop()\n",
+    "\n",
+    "    print len(data)\n",
+    "\n",
+    "    tmp = []\n",
+    "    for i in np.arange(len(data)):\n",
+    "        tmp.append(data[i].split(' '))\n",
+    "\n",
+    "    cont = np.array(tmp)\n",
+    "    cont = cont.astype('float')\n",
+    "    print cont.shape\n",
+    "    del data\n",
+    "    del tmp\n",
+    "else:\n",
+    "    cont = pd.read_csv(path+'/content.csv',header=None)\n",
+    "    print cont.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2708, 2708)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#cosine similarity\n",
+    "cw = np.matmul(cont,cont.T)\n",
+    "if dataset=='MSA':\n",
+    "    cw[13988][:] = np.ones(cw.shape[1])/cw.shape[1]\n",
+    "\n",
+    "norm = np.linalg.norm(cont,axis=1)\n",
+    "if dataset=='MSA':\n",
+    "    norm[13988] = 1\n",
+    "norm = np.reshape(norm,(len(norm),1))\n",
+    "norm_mat = np.matmul(norm, norm.T)\n",
+    "\n",
+    "cw = cw/norm_mat\n",
+    "cw.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#making the diagonal entries as 0\n",
+    "n = cont.shape[0]\n",
+    "ind = np.diag_indices(n)\n",
+    "cw[ind]=0\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tmp = (np.sum(cw,axis=1)==0)\n",
+    "\n",
+    "#if a row sums to zero, give every outgoing edge of that row the same weight 1/cw.shape[1]\n",
+    "for i in np.arange(cw.shape[0]):\n",
+    "    if tmp[i]==True:\n",
+    "        print i\n",
+    "        cw[i][:] = np.ones(cw.shape[1])/cw.shape[1]\n",
+    "        \n",
+    "# 13988 for MSA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "count=0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "theta = 1\n",
+    "edge_dict = {'cora':4*theta,'citeseer':4*theta,'pubmed':6*theta} #,'MSA':50,'Wiki':45#}\n",
+    "num_edges = edge_dict[dataset]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "../../data/cora\n",
+      "4\n"
+     ]
+    }
+   ],
+   "source": [
+    "print path\n",
+    "print num_edges"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "f=open(path+'/cosine_cont.edgelist','w')\n",
+    "for i in np.arange(cw.shape[0]):\n",
+    "    row = -cw[i,:]  # negate so that argsort returns indices in descending order\n",
+    "    ind = np.argsort(row)   #get indices\n",
+    "    if cw[i][ind[0]]!=0:\n",
+    "        count+=1\n",
+    "#     for j in np.arange(int(edge_percent*cw.shape[1])):   #get top 40% indices\n",
+    "    for j in np.arange(num_edges):   ###only the top num_edges neighbours\n",
+    "        \n",
+    "        if cw[i][ind[j]]==0:  #stop at the first 0: entries are in descending order, so the rest are presumably 0 too (assumes non-negative similarities)\n",
+    "            break\n",
+    "        f.write(str(i)+' '+str(ind[j])+' '+str(cw[i][ind[j]])+'\\n')\n",
+    "f.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# R = nx.read_edgelist(path+'/reference.edgelist', nodetype=int, create_using=nx.DiGraph())\n",
+    "\n",
+    "# #since unweighted\n",
+    "# for edge in R.edges():\n",
+    "#     R[edge[0]][edge[1]]['weight'] = 1\n",
+    "    \n",
+    "# # since undirected\n",
+    "# R = R.to_undirected()\n",
+    "\n",
+    "# R = np.array(nx.to_numpy_matrix(R))\n",
+    "# R.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# comb = R + cw\n",
+    "# print comb.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# f=open(path+'/graph_sum.edgelist','w')\n",
+    "# for i in np.arange(comb.shape[0]):\n",
+    "#     row = -comb[i,:]  # to get in decsending order\n",
+    "#     ind = np.argsort(row)   #get indices\n",
+    "#     if comb[i][ind[0]]!=0:\n",
+    "#         count+=1\n",
+    "# #     for j in np.arange(int(edge_percent*comb.shape[1])):   #get top 40% indices\n",
+    "#     for j in np.arange(num_edges):   ###only top 100\n",
+    "        \n",
+    "#         if comb[i][ind[j]]==0:  #bcz. if it is 0 then after this all will be zero only\n",
+    "#             break\n",
+    "#         f.write(str(i)+' '+str(ind[j])+' '+str(comb[i][ind[j]])+'\\n')\n",
+    "# f.close()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}